From 3a37277060e51a023887d0226eef01ed81aba90d Mon Sep 17 00:00:00 2001 From: Andre Ramnitz Date: Thu, 23 Apr 2026 08:24:35 +0200 Subject: [PATCH] Bump everything --- ...nt-customizable-terminal-title.patch.skip} | 0 ...nboard_fix_addressable_only_mainboard.diff | 33 - ...001-messageParser-fix-varInt-decoding.skip | 49 + ...ull-wl_output-in-QWaylandScreen-surf.patch | 52 + ..._package-to-resolve-GuiPrivate-not-b.patch | 24 + gui-apps/noctalia-shell/mod_theme_foot.patch | 29 + .../noctalia-shell/mod_theme_hyprland.patch | 43 + gui-wm/hyprland/5874-enable_LTO.patch | 10 - media-gfx/krita/0001-krita-use_wayland.patch | 110 + .../0001-fix_build_with_gcc13+.patch | 10 - net-misc/networkmanager/1966.patch | 67 - sys-kernel/gentoo-sources | 2 +- .../{0000-bore.patch.skip => 0001-bore.patch} | 0 .../gentoo-sources-6.10.3/0003-block.patch | 485 - .../0001-eevdf-next.patch | 4444 +++ .../gentoo-sources-6.11.3+/0003-bbr3.patch | 3386 ++ .../gentoo-sources-6.11.3+/0007-ksm.patch | 433 + .../gentoo-sources-6.11.3+/0012-zstd.patch | 18652 ++++++++++ .../gentoo-sources-6.11/0001-eevdf-next.patch | 4374 +++ .../0002-bbr3.patch} | 83 +- .../0007-ntsync.patch} | 23 +- .../0008-perf-per-core.patch} | 14 +- .../0010-zstd.patch | 7 +- ...ed-additional-timer-tick-frequencies.patch | 55 + .../0001-preempt-lazy.patch | 958 + .../gentoo-sources-6.12/0002-amd-pstate.patch | 902 + .../gentoo-sources-6.12/0004-bbr3.patch | 3386 ++ .../gentoo-sources-6.12/0006-crypto.patch | 1606 + .../gentoo-sources-6.12/0007-fixes.patch | 955 + .../0008-ntsync.patch} | 611 +- .../0009-perf-per-core.patch | 997 + .../gentoo-sources-6.12/0010-pksm.patch | 433 + .../gentoo-sources-6.12/0012-zstd.patch | 18652 ++++++++++ ...e_increase_maximum_concurrency_limit.patch | 11 + .../gentoo-sources-6.13/0001-amd-pstate.patch | 885 + .../0002-amd-tlb-broadcast.patch | 1350 + .../gentoo-sources-6.13/0003-bbr3.patch | 3386 ++ .../gentoo-sources-6.13/0005-crypto.patch | 774 + .../0007-itmt-core-ranking.patch | 365 + .../gentoo-sources-6.13/0008-ntsync.patch | 3050 ++ .../0009-perf-per-core.patch | 898 + .../gentoo-sources-6.13/0010-pksm.patch | 433 + .../gentoo-sources-6.13/0012-zstd.patch | 23530 +++++++++++++ .../gentoo-sources-6.14/0001-bore.patch | 1006 + .../gentoo-sources-6.14/0004-bbr3.patch | 3387 ++ .../gentoo-sources-6.14/0006-crypto.patch | 2495 ++ .../gentoo-sources-6.14/0009-zstd.patch | 23554 +++++++++++++ .../gentoo-sources-6.14/gentoo-sources-6.15 | 1 + .../0001-amd-pstate.patch.skip | 402 + .../gentoo-sources-6.15/0004-bbr3.patch | 3404 ++ .../gentoo-sources-6.15/0005-block.patch.skip | 288 + ...-20-sched-Cache-aware-load-balancing.patch | 803 + ...ral-fixes-for-cache-aware-scheduling.patch | 230 + ...k-migration-within-its-preferred-LLC.patch | 112 + ...-cpumask-if-the-system-is-overloaded.patch | 122 + ...sis-to-switch-a-task-s-preferred-LLC.patch | 157 + ...on-for-better-cache-aware-scheduling.patch | 195 + ...ther-to-allow-cache-aware-scheduling.patch | 279 + ...h-v3-08-20-sched-Set-up-LLC-indexing.patch | 224 + ...d-Introduce-task-preferred-LLC-field.patch | 148 + ...at-have-LLC-preference-on-a-runqueue.patch | 238 + ...runqueue-task-LLC-preference-counter.patch | 180 + ...ferred-LLC-tasks-during-load-balance.patch | 139 + ...nce-if-it-has-tasks-prefer-other-LLC.patch | 169 + ...th-groups-having-preferred-LLC-tasks.patch | 173 + ...track-the-preferred-LLC-load-balance.patch | 183 + ...ider-LLC-locality-for-active-balance.patch | 182 + ...hen-picking-tasks-from-busiest-queue.patch | 193 + 
...t-is-moving-out-of-its-preferred-LLC.patch | 155 + ...-to-control-cache-aware-load-balance.patch | 185 + ...o-control-LLC-aggregation-on-wake-up.patch | 136 + .../gentoo-sources-6.16/0001-bore.patch.skip | 1032 + .../gentoo-sources-6.16/0002-bbr3.patch | 3404 ++ .../gentoo-sources-6.16/0003-block.patch | 288 + .../gentoo-sources-6.16/0005-fixes.patch | 59 + .../gentoo-sources-6.16/0006-s5-power.patch | 329 + ...-28-sched-Cache-aware-load-balancing.patch | 810 + ...ral-fixes-for-cache-aware-scheduling.patch | 318 + ...k-migration-within-its-preferred-LLC.patch | 117 + ...-cpumask-if-the-system-is-overloaded.patch | 131 + ...sis-to-switch-a-task-s-preferred-LLC.patch | 165 + ...on-for-better-cache-aware-scheduling.patch | 200 + ...ther-to-allow-cache-aware-scheduling.patch | 293 + ...H-v4-08-28-sched-Set-up-LLC-indexing.patch | 232 + ...d-Introduce-task-preferred-LLC-field.patch | 156 + ...at-have-LLC-preference-on-a-runqueue.patch | 255 + ...runqueue-task-LLC-preference-counter.patch | 217 + ...ferred-LLC-tasks-during-load-balance.patch | 147 + ...nce-if-it-has-tasks-prefer-other-LLC.patch | 177 + ...th-groups-having-preferred-LLC-tasks.patch | 181 + ...track-the-preferred-LLC-load-balance.patch | 191 + ...ider-LLC-locality-for-active-balance.patch | 190 + ...hen-picking-tasks-from-busiest-queue.patch | 201 + ...t-is-moving-out-of-its-preferred-LLC.patch | 163 + ...-to-control-cache-aware-load-balance.patch | 194 + ...o-control-LLC-aggregation-on-wake-up.patch | 145 + ...able-cache-aware-only-for-multi-LLCs.patch | 299 + ...-and-EPOCH_OLD-into-tunnable-debugfs.patch | 164 + ...k-s-preferred-node-for-preferred-LLC.patch | 171 + ...number-of-runninhg-tasks-per-process.patch | 169 + ...-the-process-has-many-active-threads.patch | 160 + ...cheduling-for-process-with-large-RSS.patch | 196 + ...-the-scale-factor-for-RSS-comparison.patch | 303 + ...load-balance-and-hottest-CPU-changes.patch | 307 + .../gentoo-sources-6.17/0002-bbr3.patch | 0 .../gentoo-sources-6.17/0003-block.patch | 0 .../gentoo-sources-6.17/0005-fixes.patch.skip | 0 ...cture-for-cache-aware-load-balancing.patch | 654 + ...ide-cache-aware-scheduling-decisions.patch | 227 + ...ions-to-enforce-LLC-migration-policy.patch | 335 + ...able-cache-aware-only-for-multi-LLCs.patch | 208 + ...-fair-Add-LLC-index-mapping-for-CPUs.patch | 291 + ...Assign-preferred-LLC-ID-to-processes.patch | 156 + ...ack-LLC-preferred-tasks-per-runqueue.patch | 257 + ...runqueue-task-LLC-preference-counter.patch | 194 + ...-prefering-each-LLC-in-a-sched-group.patch | 143 + ...ing-destination-LLC-during-balancing.patch | 187 + ...d_group-for-LLC-aware-load-balancing.patch | 184 + ...ation-type-for-cache-aware-balancing.patch | 185 + ...le-tasks-to-from-their-preferred-LLC.patch | 208 + ...n-selecting-tasks-for-load-balancing.patch | 201 + ...ference-in-task-migration-and-detach.patch | 156 + ...-threads-from-cache-aware-scheduling.patch | 172 + ...or-processes-with-high-thread-counts.patch | 170 + ...cheduling-for-memory-heavy-processes.patch | 246 + ...-tolerance-of-cache-aware-scheduling.patch | 366 + .../gentoo-sources-6.18/0001-amd-pstate.patch | 120 + ...ed-additional-timer-tick-frequencies.patch | 0 .../gentoo-sources-6.18/0004-bbr3.patch | 3394 ++ .../gentoo-sources-6.18/0005-block.patch | 214 + .../gentoo-sources-6.18/0007-crypto.patch | 3441 ++ .../gentoo-sources-6.18/0010-sched-ext.patch | 708 + ...-for-cache-aware-load-balancing.patch.skip | 654 + ...ache-aware-scheduling-decisions.patch.skip | 227 + ...to-enforce-LLC-migration-policy.patch.skip 
| 335 + ...cache-aware-only-for-multi-LLCs.patch.skip | 208 + ...-Add-LLC-index-mapping-for-CPUs.patch.skip | 291 + ...n-preferred-LLC-ID-to-processes.patch.skip | 156 + ...LC-preferred-tasks-per-runqueue.patch.skip | 257 + ...eue-task-LLC-preference-counter.patch.skip | 194 + ...ering-each-LLC-in-a-sched-group.patch.skip | 143 + ...estination-LLC-during-balancing.patch.skip | 187 + ...up-for-LLC-aware-load-balancing.patch.skip | 184 + ...-type-for-cache-aware-balancing.patch.skip | 185 + ...sks-to-from-their-preferred-LLC.patch.skip | 208 + ...ecting-tasks-for-load-balancing.patch.skip | 201 + ...ce-in-task-migration-and-detach.patch.skip | 156 + ...ads-from-cache-aware-scheduling.patch.skip | 172 + ...ocesses-with-high-thread-counts.patch.skip | 170 + ...ling-for-memory-heavy-processes.patch.skip | 246 + ...rance-of-cache-aware-scheduling.patch.skip | 366 + ...-for-cache-aware-load-balancing.patch.skip | 637 + ...ache-aware-scheduling-decisions.patch.skip | 229 + ...to-enforce-LLC-migration-policy.patch.skip | 333 + ...ed-cache-Make-LLC-id-continuous.patch.skip | 257 + ...n-preferred-LLC-ID-to-processes.patch.skip | 172 + ...LC-preferred-tasks-per-runqueue.patch.skip | 289 + ...eue-task-LLC-preference-counter.patch.skip | 293 + ...er-runqueue-task-LLC-preference.patch.skip | 142 + ...estination-LLC-in-a-sched-group.patch.skip | 160 + ...only-once-in-update_sg_lb_stats.patch.skip | 142 + ...estination-LLC-during-balancing.patch.skip | 276 + ...-type-for-cache-aware-balancing.patch.skip | 191 + ...sks-to-from-their-preferred-LLC.patch.skip | 195 + ...ecting-tasks-for-load-balancing.patch.skip | 206 + ...ce-in-task-migration-and-detach.patch.skip | 251 + ...duling-for-multi-LLCs-NUMA-node.patch.skip | 192 + ...cess-for-cache-aware-scheduling.patch.skip | 172 + ...ocesses-with-high-thread-counts.patch.skip | 175 + ...ling-for-memory-heavy-processes.patch.skip | 258 + ...eters-of-cache-aware-scheduling.patch.skip | 478 + ...-for-cache-aware-load-balancing.patch.skip | 174 + ...ack-the-load-balance-statistics.patch.skip | 172 + ...cy-for-each-process-via-proc-fs.patch.skip | 323 + ...uce_kvfree_rcu_barrier_on_cache.patch.skip | 259 + .../gentoo-sources-6.19/0002-bbr3.patch | 3395 ++ .../gentoo-sources-6.19/0005-hdmi.patch | 1729 + .../gentoo-sources-6.19/0006-r8125.patch | 29360 ++++++++++++++++ .../0007-vesa-dsc-bpp.patch | 392 + .../gentoo-sources-6.19/0008-vmscape.patch | 366 + ...d-additional-timer-tick-frequencies.patch} | 0 sys-kernel/gentoo-sources-6.6/0001-bbr3.patch | 3352 ++ sys-kernel/gentoo-sources-6.6/0001-bore.patch | 825 + sys-kernel/gentoo-sources-6.6/0005-zstd.patch | 13833 ++++++++ .../gentoo-sources-6.6/0010-sched-ext.patch | 19747 ----------- ...d-uarches-for-kernel-6.1.79-6.8-rc3.patch} | 418 +- ...r_multi-llc_select_idle_sibling.patch.skip | 94 - ...Tune-ondemand-governor-for-interacti.patch | 75 - ...ve-schedutil-dependency-on-Intel-AMD.patch | 36 - ...e-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch | 47 - ...-Disable-AVX2-and-tree-vectorization.patch | 29 - ...dm-crypt-Disable-workqueues-for-cryp.patch | 36 - .../0215-ZEN-Add-VHBA-driver.patch | 1199 - .../0301-amd-pstate_preferred_core_V12.patch | 54 - .../0302-amd-pstate_preferred_core_V12.patch | 92 - .../0303-amd-pstate_preferred_core_V12.patch | 322 - .../0304-amd-pstate_preferred_core_V12.patch | 120 - .../0305-amd-pstate_preferred_core_V12.patch | 182 - .../0306-amd-pstate_preferred_core_V12.patch | 125 - .../0307-amd-pstate_preferred_core_V12.patch | 57 - sys-kernel/gentoo-sources-7.0/0001-bore.patch | 1217 + 
...d-additional-timer-tick-frequencies.patch} | 0 sys-kernel/git-sources/0001-asus.patch | 6038 ++++ sys-kernel/git-sources/0002-bbr3.patch | 3404 ++ sys-kernel/git-sources/0003-cachy.patch | 9540 +++++ sys-kernel/git-sources/0004-fixes.patch | 107 + sys-kernel/git-sources/0005-sched-ext.patch | 21992 ------------ ....14-builtin-preserve-enum-value.patch.skip | 13 + 208 files changed, 241363 insertions(+), 45437 deletions(-) rename app-editors/kakoune/{0007-Implement-customizable-terminal-title.patch => 0007-Implement-customizable-terminal-title.patch.skip} (100%) delete mode 100644 app-misc/openrgb/asus_mainboard_fix_addressable_only_mainboard.diff create mode 100644 dev-libs/hyprwire/0001-messageParser-fix-varInt-decoding.skip create mode 100644 dev-qt/qtbase/0001-Wayland-Ignore-null-wl_output-in-QWaylandScreen-surf.patch create mode 100644 gui-apps/hyprqt6engine/0001-cmake-Added-find_package-to-resolve-GuiPrivate-not-b.patch create mode 100644 gui-apps/noctalia-shell/mod_theme_foot.patch create mode 100644 gui-apps/noctalia-shell/mod_theme_hyprland.patch delete mode 100644 gui-wm/hyprland/5874-enable_LTO.patch create mode 100644 media-gfx/krita/0001-krita-use_wayland.patch delete mode 100644 media-sound/drumgizmo/0001-fix_build_with_gcc13+.patch delete mode 100644 net-misc/networkmanager/1966.patch rename sys-kernel/gentoo-sources-6.10.3/{0000-bore.patch.skip => 0001-bore.patch} (100%) delete mode 100644 sys-kernel/gentoo-sources-6.10.3/0003-block.patch create mode 100644 sys-kernel/gentoo-sources-6.11.3+/0001-eevdf-next.patch create mode 100644 sys-kernel/gentoo-sources-6.11.3+/0003-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.11.3+/0007-ksm.patch create mode 100644 sys-kernel/gentoo-sources-6.11.3+/0012-zstd.patch create mode 100644 sys-kernel/gentoo-sources-6.11/0001-eevdf-next.patch rename sys-kernel/{git-sources/0003-bbr3.patch => gentoo-sources-6.11/0002-bbr3.patch} (98%) rename sys-kernel/{git-sources/0009-ntsync.patch => gentoo-sources-6.11/0007-ntsync.patch} (99%) rename sys-kernel/{gentoo-sources-6.10.3/0010-perf-per-core.patch => gentoo-sources-6.11/0008-perf-per-core.patch} (99%) rename sys-kernel/{git-sources => gentoo-sources-6.11}/0010-zstd.patch (99%) create mode 100644 sys-kernel/gentoo-sources-6.11/0100-glitched-additional-timer-tick-frequencies.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0001-preempt-lazy.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0002-amd-pstate.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0004-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0006-crypto.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0007-fixes.patch rename sys-kernel/{gentoo-sources-6.10.3/0009-ntsync.patch => gentoo-sources-6.12/0008-ntsync.patch} (84%) create mode 100644 sys-kernel/gentoo-sources-6.12/0009-perf-per-core.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0010-pksm.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0012-zstd.patch create mode 100644 sys-kernel/gentoo-sources-6.12/9999-workqueue_increase_maximum_concurrency_limit.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0001-amd-pstate.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0002-amd-tlb-broadcast.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0003-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0005-crypto.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0007-itmt-core-ranking.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0008-ntsync.patch create mode 100644 
sys-kernel/gentoo-sources-6.13/0009-perf-per-core.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0010-pksm.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0012-zstd.patch create mode 100644 sys-kernel/gentoo-sources-6.14/0001-bore.patch create mode 100644 sys-kernel/gentoo-sources-6.14/0004-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.14/0006-crypto.patch create mode 100644 sys-kernel/gentoo-sources-6.14/0009-zstd.patch create mode 120000 sys-kernel/gentoo-sources-6.14/gentoo-sources-6.15 create mode 100644 sys-kernel/gentoo-sources-6.15/0001-amd-pstate.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.15/0004-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.15/0005-block.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-01-20-sched-Cache-aware-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-02-20-sched-Several-fixes-for-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-03-20-sched-Avoid-task-migration-within-its-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-04-20-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-05-20-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-06-20-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-07-20-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-08-20-sched-Set-up-LLC-indexing.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-09-20-sched-Introduce-task-preferred-LLC-field.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-10-20-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-11-20-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-12-20-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-13-20-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-14-20-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-15-20-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-16-20-sched-Consider-LLC-locality-for-active-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-17-20-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-18-20-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-19-20-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-20-20-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch create mode 100644 sys-kernel/gentoo-sources-6.16/0001-bore.patch.skip create mode 100644 
sys-kernel/gentoo-sources-6.16/0002-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.16/0003-block.patch create mode 100644 sys-kernel/gentoo-sources-6.16/0005-fixes.patch create mode 100644 sys-kernel/gentoo-sources-6.16/0006-s5-power.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-01-28-sched-Cache-aware-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-02-28-sched-Several-fixes-for-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-03-28-sched-Avoid-task-migration-within-its-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-04-28-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-05-28-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-06-28-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-07-28-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-08-28-sched-Set-up-LLC-indexing.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-09-28-sched-Introduce-task-preferred-LLC-field.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-10-28-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-11-28-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-12-28-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-13-28-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-14-28-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-15-28-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-16-28-sched-Consider-LLC-locality-for-active-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-17-28-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-18-28-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-19-28-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-20-28-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-21-28-sched-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-22-28-sched-Turn-EPOCH_PERIOD-and-EPOCH_OLD-into-tunnable-debugfs.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-23-28-sched-Scan-a-task-s-preferred-node-for-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-24-28-sched-Record-average-number-of-runninhg-tasks-per-process.patch create mode 100644 
sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-25-28-sched-Skip-cache-aware-scheduling-if-the-process-has-many-active-threads.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-26-28-sched-Do-not-enable-cache-aware-scheduling-for-process-with-large-RSS.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-27-28-sched-Allow-the-user-space-to-tune-the-scale-factor-for-RSS-comparison.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-28-28-sched-Add-ftrace-to-track-cache-aware-load-balance-and-hottest-CPU-changes.patch create mode 100644 sys-kernel/gentoo-sources-6.17/0002-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.17/0003-block.patch create mode 100644 sys-kernel/gentoo-sources-6.17/0005-fixes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.18/0001-amd-pstate.patch rename sys-kernel/{gentoo-sources-6.6 => gentoo-sources-6.18}/0002-glitched-additional-timer-tick-frequencies.patch (100%) create mode 100644 
sys-kernel/gentoo-sources-6.18/0004-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.18/0005-block.patch create mode 100644 sys-kernel/gentoo-sources-6.18/0007-crypto.patch create mode 100644 sys-kernel/gentoo-sources-6.18/0010-sched-ext.patch create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-01-23-sched-cache-Introduce-infrastructure-for-cache-aware-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-02-23-sched-cache-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-03-23-sched-cache-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-04-23-sched-cache-Make-LLC-id-continuous.patch.skip create mode 100644 
sys-kernel/gentoo-sources-6.18/PATCH-v2-05-23-sched-cache-Assign-preferred-LLC-ID-to-processes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-06-23-sched-cache-Track-LLC-preferred-tasks-per-runqueue.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-07-23-sched-cache-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-08-23-sched-cache-Calculate-the-per-runqueue-task-LLC-preference.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-09-23-sched-cache-Count-tasks-prefering-destination-LLC-in-a-sched-group.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-10-23-sched-cache-Check-local_group-only-once-in-update_sg_lb_stats.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-11-23-sched-cache-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-12-23-sched-cache-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-13-23-sched-cache-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-14-23-sched-cache-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-15-23-sched-cache-Respect-LLC-preference-in-task-migration-and-detach.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-16-23-sched-cache-Introduce-sched_cache_present-to-enable-cache-aware-scheduling-for-multi-LLCs-NUMA-node.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-17-23-sched-cache-Record-the-number-of-active-threads-per-process-for-cache-aware-scheduling.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-18-23-sched-cache-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-19-23-sched-cache-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-20-23-sched-cache-Add-user-control-to-adjust-the-parameters-of-cache-aware-scheduling.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-21-23-DO-NOT-APPLY-sched-cache-stats-Add-schedstat-for-cache-aware-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-22-23-DO-NOT-APPLY-sched-cache-debug-Add-ftrace-to-track-the-load-balance-statistics.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-23-23-DO-NOT-APPLY-sched-cache-debug-Display-the-per-LLC-occupancy-for-each-process-via-proc-fs.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/mm_slab_introduce_kvfree_rcu_barrier_on_cache.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.19/0002-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.19/0005-hdmi.patch create mode 100644 sys-kernel/gentoo-sources-6.19/0006-r8125.patch create mode 100644 sys-kernel/gentoo-sources-6.19/0007-vesa-dsc-bpp.patch create mode 100644 sys-kernel/gentoo-sources-6.19/0008-vmscape.patch rename sys-kernel/{gentoo-sources-6.10.3/0100-glitched-additional-timer-tick-frequencies.patch => gentoo-sources-6.19/0101-glitched-additional-timer-tick-frequencies.patch} (100%) create mode 100644 sys-kernel/gentoo-sources-6.6/0001-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.6/0001-bore.patch 
create mode 100644 sys-kernel/gentoo-sources-6.6/0005-zstd.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0010-sched-ext.patch rename sys-kernel/{git-sources/0001-more-uarches-for-kernel-6.8-rc4+.patch => gentoo-sources-6.6/0100-more-ISA-levels-and-uarches-for-kernel-6.1.79-6.8-rc3.patch} (61%) delete mode 100644 sys-kernel/gentoo-sources-6.6/0100_sched-fair_multi-llc_select_idle_sibling.patch.skip delete mode 100644 sys-kernel/gentoo-sources-6.6/0202-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0205-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0210-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0211-ZEN-arch-x86-Disable-AVX2-and-tree-vectorization.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0214-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0215-ZEN-Add-VHBA-driver.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0301-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0302-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0303-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0304-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0305-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0306-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0307-amd-pstate_preferred_core_V12.patch create mode 100644 sys-kernel/gentoo-sources-7.0/0001-bore.patch rename sys-kernel/{git-sources/0002-glitched-additional-timer-tick-frequencies.patch => gentoo-sources-7.0/0101-glitched-additional-timer-tick-frequencies.patch} (100%) create mode 100644 sys-kernel/git-sources/0001-asus.patch create mode 100644 sys-kernel/git-sources/0002-bbr3.patch create mode 100644 sys-kernel/git-sources/0003-cachy.patch create mode 100644 sys-kernel/git-sources/0004-fixes.patch delete mode 100644 sys-kernel/git-sources/0005-sched-ext.patch create mode 100644 sys-kernel/scx/REVERT-scx-1.0.14-builtin-preserve-enum-value.patch.skip diff --git a/app-editors/kakoune/0007-Implement-customizable-terminal-title.patch b/app-editors/kakoune/0007-Implement-customizable-terminal-title.patch.skip similarity index 100% rename from app-editors/kakoune/0007-Implement-customizable-terminal-title.patch rename to app-editors/kakoune/0007-Implement-customizable-terminal-title.patch.skip diff --git a/app-misc/openrgb/asus_mainboard_fix_addressable_only_mainboard.diff b/app-misc/openrgb/asus_mainboard_fix_addressable_only_mainboard.diff deleted file mode 100644 index 8fee101..0000000 --- a/app-misc/openrgb/asus_mainboard_fix_addressable_only_mainboard.diff +++ /dev/null @@ -1,33 +0,0 @@ -diff --git a/Controllers/AsusAuraUSBController/AsusAuraMainboardController.cpp b/Controllers/AsusAuraUSBController/AsusAuraMainboardController.cpp -index 64b28d6d46a689fb21a4497b103a695c03103e5c..a18f2b4af4fc55ff638633a7ee7bc138985b4a0b 100644 ---- a/Controllers/AsusAuraUSBController/AsusAuraMainboardController.cpp -+++ b/Controllers/AsusAuraUSBController/AsusAuraMainboardController.cpp -@@ -15,6 +15,7 @@ AuraMainboardController::AuraMainboardController(hid_device* dev_handle, const c - unsigned char num_total_mainboard_leds = config_table[0x1B]; - unsigned char num_rgb_headers = config_table[0x1D]; 
- unsigned char num_addressable_headers = config_table[0x02]; -+ unsigned char effect_channel = 0; - - if(num_total_mainboard_leds < num_rgb_headers) - { -@@ -24,14 +25,18 @@ AuraMainboardController::AuraMainboardController(hid_device* dev_handle, const c - /*-----------------------------------------------------*\ - | Add mainboard device | - \*-----------------------------------------------------*/ -- device_info.push_back({0x00, 0x04, num_total_mainboard_leds, num_rgb_headers, AuraDeviceType::FIXED}); -+ if(num_total_mainboard_leds > 0) -+ { -+ device_info.push_back({effect_channel, 0x04, num_total_mainboard_leds, num_rgb_headers, AuraDeviceType::FIXED}); -+ effect_channel++; -+ } - - /*-----------------------------------------------------*\ - | Add addressable devices | - \*-----------------------------------------------------*/ - for(int i = 0; i < num_addressable_headers; i++) - { -- device_info.push_back({0x01, (unsigned char)i, 0x01, 0, AuraDeviceType::ADDRESSABLE}); -+ device_info.push_back({effect_channel, (unsigned char)i, 0x01, 0, AuraDeviceType::ADDRESSABLE}); - } - } - diff --git a/dev-libs/hyprwire/0001-messageParser-fix-varInt-decoding.skip b/dev-libs/hyprwire/0001-messageParser-fix-varInt-decoding.skip new file mode 100644 index 0000000..08dcd3f --- /dev/null +++ b/dev-libs/hyprwire/0001-messageParser-fix-varInt-decoding.skip @@ -0,0 +1,49 @@ +From a181269e49f039b55d1b5fd1509339b5ba06e6be Mon Sep 17 00:00:00 2001 +From: Vaxry +Date: Sun, 16 Nov 2025 20:48:56 +0000 +Subject: messageParser: fix varInt decoding + +fixes https://github.com/hyprwm/hyprlauncher/issues/65 +--- + src/core/message/MessageParser.cpp | 2 +- + src/core/wireObject/IWireObject.cpp | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/core/message/MessageParser.cpp b/src/core/message/MessageParser.cpp +index 7a51be1..7a2c155 100644 +--- a/src/core/message/MessageParser.cpp ++++ b/src/core/message/MessageParser.cpp +@@ -224,7 +224,7 @@ std::pair CMessageParser::parseVarInt(const std::span(data[i] << 1) >> 1) << (i++ * 7)); +- } while (i < LEN && data[i] & 0x80); ++ } while (i < LEN && (data[i - 1] & 0x80)); + + return {rolling, i}; + } +diff --git a/src/core/wireObject/IWireObject.cpp b/src/core/wireObject/IWireObject.cpp +index 00b4406..7c81e58 100644 +--- a/src/core/wireObject/IWireObject.cpp ++++ b/src/core/wireObject/IWireObject.cpp +@@ -101,7 +101,7 @@ uint32_t IWireObject::call(uint32_t id, ...) { + case HW_MESSAGE_MAGIC_TYPE_VARCHAR: { + data.emplace_back(HW_MESSAGE_MAGIC_TYPE_VARCHAR); + auto str = va_arg(va, const char*); +- data.append_range(g_messageParser->encodeVarInt(std::strlen(str))); ++ data.append_range(g_messageParser->encodeVarInt(std::string_view(str).size())); + data.append_range(std::string_view(str)); + break; + } +@@ -129,7 +129,7 @@ uint32_t IWireObject::call(uint32_t id, ...) 
{ + case HW_MESSAGE_MAGIC_TYPE_VARCHAR: { + for (size_t i = 0; i < arrayLen; ++i) { + const char* element = rc(arrayData)[i]; +- data.append_range(g_messageParser->encodeVarInt(std::strlen(element))); ++ data.append_range(g_messageParser->encodeVarInt(std::string_view(element).size())); + data.append_range(std::string_view(element)); + } + break; +-- +2.51.0 + diff --git a/dev-qt/qtbase/0001-Wayland-Ignore-null-wl_output-in-QWaylandScreen-surf.patch b/dev-qt/qtbase/0001-Wayland-Ignore-null-wl_output-in-QWaylandScreen-surf.patch new file mode 100644 index 0000000..a4bcc0a --- /dev/null +++ b/dev-qt/qtbase/0001-Wayland-Ignore-null-wl_output-in-QWaylandScreen-surf.patch @@ -0,0 +1,52 @@ +From 0285fce0ce0db7b9446389870fe6c076310eb28e Mon Sep 17 00:00:00 2001 +From: Igor Khanin +Date: Thu, 26 Feb 2026 10:50:58 +0200 +Subject: Wayland: Ignore null wl_output in QWaylandScreen surface enter/leave + +Misbehaving compositors (as observed with some Smithay based +compositors) may send wl_surface.enter and wl_surface.leave messages +referring to an output which was already removed from the registry, and +therefore its' proxy object was already destroyed. This manifests as +the listener method being invoked with a null wl_output pointer, which +Qt then dereferences - leading to a crash. + +To avoid crashing, simply just ignore such events. + +Pick-to: 6.11 6.10 +Change-Id: Ib217366b5aff1b39dcc6f42e52165b94ea7a1018 +Reviewed-by: David Edmundson +--- + src/plugins/platforms/wayland/qwaylandsurface.cpp | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/src/plugins/platforms/wayland/qwaylandsurface.cpp b/src/plugins/platforms/wayland/qwaylandsurface.cpp +index bd7c358e42a..ffccbefe61c 100644 +--- a/src/plugins/platforms/wayland/qwaylandsurface.cpp ++++ b/src/plugins/platforms/wayland/qwaylandsurface.cpp +@@ -56,8 +56,10 @@ void QWaylandSurface::handleScreenRemoved(QScreen *qScreen) + + void QWaylandSurface::surface_enter(wl_output *output) + { +- auto addedScreen = QWaylandScreen::fromWlOutput(output); ++ if (!output) ++ return; + ++ auto addedScreen = QWaylandScreen::fromWlOutput(output); + if (!addedScreen) + return; + +@@ -76,8 +78,10 @@ void QWaylandSurface::surface_enter(wl_output *output) + + void QWaylandSurface::surface_leave(wl_output *output) + { +- auto *removedScreen = QWaylandScreen::fromWlOutput(output); ++ if (!output) ++ return; + ++ auto *removedScreen = QWaylandScreen::fromWlOutput(output); + if (!removedScreen) + return; + +-- +2.52.0 + diff --git a/gui-apps/hyprqt6engine/0001-cmake-Added-find_package-to-resolve-GuiPrivate-not-b.patch b/gui-apps/hyprqt6engine/0001-cmake-Added-find_package-to-resolve-GuiPrivate-not-b.patch new file mode 100644 index 0000000..918a129 --- /dev/null +++ b/gui-apps/hyprqt6engine/0001-cmake-Added-find_package-to-resolve-GuiPrivate-not-b.patch @@ -0,0 +1,24 @@ +From 9063e6837b4e282a73b052a0c0371916daccf50a Mon Sep 17 00:00:00 2001 +From: VuaTech +Date: Fri, 12 Sep 2025 12:32:44 -0400 +Subject: cmake: Added find_package to resolve GuiPrivate not being found (#4) + +--- + hyprqtplugin/CMakeLists.txt | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/hyprqtplugin/CMakeLists.txt b/hyprqtplugin/CMakeLists.txt +index bbdb33a..c427670 100644 +--- a/hyprqtplugin/CMakeLists.txt ++++ b/hyprqtplugin/CMakeLists.txt +@@ -1,5 +1,7 @@ + project(hyprqtplugin) + ++find_package(Qt6 REQUIRED COMPONENTS Gui GuiPrivate) ++ + set(app_SRCS + main.cpp + PlatformTheme.cpp +-- +2.51.2 + diff --git a/gui-apps/noctalia-shell/mod_theme_foot.patch 
b/gui-apps/noctalia-shell/mod_theme_foot.patch new file mode 100644 index 0000000..810e396 --- /dev/null +++ b/gui-apps/noctalia-shell/mod_theme_foot.patch @@ -0,0 +1,29 @@ +--- noctalia-release/Assets/Templates/foot_predefined.backup 2026-04-23 08:13:04.228820613 +0200 ++++ noctalia-release/Assets/Templates/terminal/foot-predefined 2026-04-23 08:17:51.671131130 +0200 +@@ -20,3 +20,26 @@ + selection-foreground={{colors.terminal_selection_fg.default.hex_stripped}} + selection-background={{colors.terminal_selection_bg.default.hex_stripped}} + cursor={{colors.terminal_cursor_text.default.hex_stripped}} {{colors.terminal_cursor.default.hex_stripped}} ++ ++[colors-light] ++foreground={{colors.terminal_background.default.hex_stripped}} ++background={{colors.terminal_foreground.default.hex_stripped}} ++regular0={{colors.terminal_normal_black.default.hex_stripped}} ++regular1={{colors.terminal_normal_red.default.hex_stripped}} ++regular2={{colors.terminal_normal_green.default.hex_stripped}} ++regular3={{colors.terminal_normal_yellow.default.hex_stripped}} ++regular4={{colors.terminal_normal_blue.default.hex_stripped}} ++regular5={{colors.terminal_normal_magenta.default.hex_stripped}} ++regular6={{colors.terminal_normal_cyan.default.hex_stripped}} ++regular7={{colors.terminal_normal_white.default.hex_stripped}} ++bright0={{colors.terminal_bright_black.default.hex_stripped}} ++bright1={{colors.terminal_bright_red.default.hex_stripped}} ++bright2={{colors.terminal_bright_green.default.hex_stripped}} ++bright3={{colors.terminal_bright_yellow.default.hex_stripped}} ++bright4={{colors.terminal_bright_blue.default.hex_stripped}} ++bright5={{colors.terminal_bright_magenta.default.hex_stripped}} ++bright6={{colors.terminal_bright_cyan.default.hex_stripped}} ++bright7={{colors.terminal_bright_white.default.hex_stripped}} ++selection-foreground={{colors.terminal_selection_bg.default.hex_stripped}} ++selection-background={{colors.terminal_selection_fg.default.hex_stripped}} ++cursor={{colors.terminal_cursor_text.default.hex_stripped}} {{colors.terminal_cursor.default.hex_stripped}} diff --git a/gui-apps/noctalia-shell/mod_theme_hyprland.patch b/gui-apps/noctalia-shell/mod_theme_hyprland.patch new file mode 100644 index 0000000..b30286b --- /dev/null +++ b/gui-apps/noctalia-shell/mod_theme_hyprland.patch @@ -0,0 +1,43 @@ +*** noctalia-release/Assets/Templates/hyprland.conf 2026-04-02 06:45:57.794000629 +0200 +--- noctalia-release/Assets/Templates/hyprland_b.conf 2026-04-02 06:37:39.309812237 +0200 +*************** +*** 8,25 **** + general { + col.active_border = $primary + col.inactive_border = $surface + } + + group { + col.border_active = $secondary + col.border_inactive = $surface +! col.border_locked_active = $error + col.border_locked_inactive = $surface + + groupbar { + col.active = $secondary + col.inactive = $surface +! col.locked_active = $error + col.locked_inactive = $surface + } + } +--- 8,27 ---- + general { + col.active_border = $primary + col.inactive_border = $surface ++ col.nogroup_border = $surface ++ col.nogroup_border_active = $error + } + + group { + col.border_active = $secondary + col.border_inactive = $surface +! col.border_locked_active = $tertiary + col.border_locked_inactive = $surface + + groupbar { + col.active = $secondary + col.inactive = $surface +! 
col.locked_active = $tertiary + col.locked_inactive = $surface + } + } diff --git a/gui-wm/hyprland/5874-enable_LTO.patch b/gui-wm/hyprland/5874-enable_LTO.patch deleted file mode 100644 index 25394f4..0000000 --- a/gui-wm/hyprland/5874-enable_LTO.patch +++ /dev/null @@ -1,10 +0,0 @@ -diff --git a/meson.build b/meson.build -@@ -5,6 +5,7 @@ project('Hyprland', 'cpp', 'c', - 'default_library=static', - 'optimization=3', - 'buildtype=release', - + 'b_lto=true', - 'debug=false' - # 'cpp_std=c++23' # not yet supported by meson, as of version 0.63.0 - ]) --- diff --git a/media-gfx/krita/0001-krita-use_wayland.patch b/media-gfx/krita/0001-krita-use_wayland.patch new file mode 100644 index 0000000..f7734ce --- /dev/null +++ b/media-gfx/krita/0001-krita-use_wayland.patch @@ -0,0 +1,110 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 18fb3c1..a1aca83 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -544,8 +544,6 @@ endif() + + if (NOT WIN32 AND NOT APPLE AND NOT ANDROID AND NOT HAIKU) + +- find_package(Qt5 ${MIN_QT_VERSION} REQUIRED X11Extras) +- + find_package(Qt5DBus ${MIN_QT_VERSION}) + set(HAVE_DBUS ${Qt5DBus_FOUND}) + set_package_properties(Qt5DBus PROPERTIES +@@ -562,10 +560,8 @@ if (NOT WIN32 AND NOT APPLE AND NOT ANDROID AND NOT HAIKU) + TYPE OPTIONAL + PURPOSE "Optionally used to provide crash reporting on Linux") + +- find_package(X11 REQUIRED COMPONENTS Xinput) +- set(HAVE_X11 TRUE) +- add_definitions(-DHAVE_X11) +- ++ set(HAVE_X11 FALSE) ++ set(HAVE_XCB FALSE) + else() + set(HAVE_DBUS FALSE) + set(HAVE_X11 FALSE) +diff --git a/krita/main.cc b/krita/main.cc +index c7ff996..a063d1a 100644 +--- a/krita/main.cc ++++ b/krita/main.cc +@@ -568,13 +568,6 @@ extern "C" MAIN_EXPORT int MAIN_FN(int argc, char **argv) + + installTranslators(app); + +- if (KisApplication::platformName() == "wayland") { +- QMessageBox::critical(nullptr, +- i18nc("@title:window", "Fatal Error"), +- i18n("Krita does not support the Wayland platform. Use XWayland to run Krita on Wayland. 
Krita will close now.")); +- return -1; +- } +- + KisUsageLogger::writeHeader(); + KisOpenGL::initialize(); + +diff --git a/libs/ui/CMakeLists.txt b/libs/ui/CMakeLists.txt +index 91e7de3..c2b3e5b 100644 +--- a/libs/ui/CMakeLists.txt ++++ b/libs/ui/CMakeLists.txt +@@ -712,11 +712,6 @@ if (ANDROID) + target_link_libraries(kritaui PRIVATE Qt5::AndroidExtras) + endif() + +-if (NOT WIN32 AND NOT APPLE AND NOT ANDROID AND NOT HAIKU) +- target_link_libraries(kritaui PRIVATE ${X11_X11_LIB} +- ${X11_Xinput_LIB}) +-endif() +- + if (HAIKU) + target_link_libraries(kritaui PRIVATE network expat iconv intl) + endif() +@@ -736,15 +731,6 @@ if(OpenEXR_FOUND) + target_link_libraries(kritaui PUBLIC ${LINK_OPENEXR_LIB}) + endif() + +-# Add VSync disable workaround +-if(NOT WIN32 AND NOT APPLE AND NOT ANDROID AND NOT HAIKU) +- target_link_libraries(kritaui PRIVATE ${CMAKE_DL_LIBS} Qt5::X11Extras) +-endif() +- +-if(X11_FOUND) +- target_link_libraries(kritaui PRIVATE Qt5::X11Extras ${X11_LIBRARIES}) +-endif() +- + target_include_directories(kritaui + PUBLIC + $ +diff --git a/libs/widgets/CMakeLists.txt b/libs/widgets/CMakeLists.txt +index aeae382..6f26f93 100644 +--- a/libs/widgets/CMakeLists.txt ++++ b/libs/widgets/CMakeLists.txt +@@ -113,10 +113,6 @@ target_link_libraries(kritawidgets + KF5::Completion + ) + +-if(X11_FOUND) +- target_link_libraries(kritawidgets Qt5::X11Extras ${X11_LIBRARIES}) +-endif() +- + set_target_properties(kritawidgets PROPERTIES + VERSION ${GENERIC_KRITA_LIB_VERSION} SOVERSION ${GENERIC_KRITA_LIB_SOVERSION} + ) +diff --git a/plugins/extensions/pykrita/plugin/version_checker.h b/plugins/extensions/pykrita/plugin/version_checker.h +index ac092b2..6676d67 100644 +--- a/plugins/extensions/pykrita/plugin/version_checker.h ++++ b/plugins/extensions/pykrita/plugin/version_checker.h +@@ -13,6 +13,14 @@ + # include + # include + ++#ifdef major ++#undef major ++#endif ++ ++#ifdef minor ++#undef minor ++#endif ++ + namespace PyKrita + { + diff --git a/media-sound/drumgizmo/0001-fix_build_with_gcc13+.patch b/media-sound/drumgizmo/0001-fix_build_with_gcc13+.patch deleted file mode 100644 index 6a79f60..0000000 --- a/media-sound/drumgizmo/0001-fix_build_with_gcc13+.patch +++ /dev/null @@ -1,10 +0,0 @@ -*** ./plugin/plugingizmo/plugin.h 2023-04-30 16:29:49.409277767 +0200 -+++ ./plugin/plugingizmo/plugin.h 2023-04-30 16:29:40.469204589 +0200 -@@ -30,6 +30,7 @@ - #include - - #include -+#include - - #if defined(WIN32) - #define PG_EXPORT extern "C" __declspec(dllexport) diff --git a/net-misc/networkmanager/1966.patch b/net-misc/networkmanager/1966.patch deleted file mode 100644 index b48b21d..0000000 --- a/net-misc/networkmanager/1966.patch +++ /dev/null @@ -1,67 +0,0 @@ -From 70d1c34b94baadc3305745cf159ea55f312beacc Mon Sep 17 00:00:00 2001 -From: Khem Raj -Date: Fri, 7 Jun 2024 14:03:15 -0700 -Subject: [PATCH] libnm-systemd-core: Disable sd_dhcp6_client_set_duid_uuid - function - -When building on musl systems ( with out systemd ), and using LLD linker -from LLVM project we fail to link with undefined symbols. - -This symbol is in sd_id128.c but its disabled, so let disable the functions -which need this function. 
- -| x86_64-yoe-linux-musl-ld.lld: error: undefined symbol: sd_id128_get_machine_app_specific -| >>> referenced by sd-dhcp-duid.c:202 (/usr/src/debug/networkmanager/1.48.0/../NetworkManager-1.48.0/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c:202) -| >>> libnm-systemd-core.a.p/src_libsystemd-network_sd-dhcp-duid.c.o:(sd_dhcp_duid_set_uuid) in archive src/libnm-systemd-core/libnm-systemd-core.a -| x86_64-yoe-linux-musl-clang: error: linker command failed with exit code 1 (use -v to see invocation) - -Signed-off-by: Khem Raj ---- - src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c | 2 ++ - .../src/libsystemd-network/sd-dhcp6-client.c | 3 ++- - 2 files changed, 4 insertions(+), 1 deletion(-) - -diff --git a/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c b/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c -index e664a4a720..7ba502086f 100644 ---- a/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c -+++ b/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c -@@ -193,6 +193,7 @@ int sd_dhcp_duid_set_en(sd_dhcp_duid *duid) { - return 0; - } - -+#if 0 - int sd_dhcp_duid_set_uuid(sd_dhcp_duid *duid) { - sd_id128_t machine_id; - int r; -@@ -209,6 +210,7 @@ int sd_dhcp_duid_set_uuid(sd_dhcp_duid *duid) { - duid->size = offsetof(struct duid, uuid.uuid) + sizeof(machine_id); - return 0; - } -+#endif - - int dhcp_duid_to_string_internal(uint16_t type, const void *data, size_t data_size, char **ret) { - _cleanup_free_ char *p = NULL, *x = NULL; -diff --git a/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp6-client.c b/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp6-client.c -index 7c20116409..08c1e96b3c 100644 ---- a/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp6-client.c -+++ b/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp6-client.c -@@ -244,6 +244,7 @@ int sd_dhcp6_client_set_duid_en(sd_dhcp6_client *client) { - return 0; - } - -+#if 0 - int sd_dhcp6_client_set_duid_uuid(sd_dhcp6_client *client) { - int r; - -@@ -256,7 +257,7 @@ int sd_dhcp6_client_set_duid_uuid(sd_dhcp6_client *client) { - - return 0; - } -- -+#endif - int sd_dhcp6_client_set_duid_raw(sd_dhcp6_client *client, uint16_t duid_type, const uint8_t *duid, size_t duid_len) { - int r; - --- -GitLab - diff --git a/sys-kernel/gentoo-sources b/sys-kernel/gentoo-sources index 5f1b707..d942c51 120000 --- a/sys-kernel/gentoo-sources +++ b/sys-kernel/gentoo-sources @@ -1 +1 @@ -gentoo-sources-6.10.3 \ No newline at end of file +gentoo-sources-7.0/ \ No newline at end of file diff --git a/sys-kernel/gentoo-sources-6.10.3/0000-bore.patch.skip b/sys-kernel/gentoo-sources-6.10.3/0001-bore.patch similarity index 100% rename from sys-kernel/gentoo-sources-6.10.3/0000-bore.patch.skip rename to sys-kernel/gentoo-sources-6.10.3/0001-bore.patch diff --git a/sys-kernel/gentoo-sources-6.10.3/0003-block.patch b/sys-kernel/gentoo-sources-6.10.3/0003-block.patch deleted file mode 100644 index e71c1d0..0000000 --- a/sys-kernel/gentoo-sources-6.10.3/0003-block.patch +++ /dev/null @@ -1,485 +0,0 @@ -From bbee24678a3eef2513debdb999d302394638f90b Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Sat, 3 Aug 2024 09:33:05 +0200 -Subject: [PATCH 03/12] block - -Signed-off-by: Peter Jung ---- - block/bfq-iosched.c | 120 ++++++++++++++++++++++++++++++++++++-------- - block/bfq-iosched.h | 16 +++++- - block/mq-deadline.c | 110 +++++++++++++++++++++++++++++++++------- - 3 files changed, 203 insertions(+), 43 deletions(-) - -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c 
-index 4b88a54a9b76..88df08a246fa 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) - return icq; - } - -+static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) -+{ -+ if (!current->io_context) -+ return NULL; -+ if (spin_trylock_irq(&q->queue_lock)) { -+ struct bfq_io_cq *icq; -+ -+ icq = icq_to_bic(ioc_lookup_icq(q)); -+ spin_unlock_irq(&q->queue_lock); -+ return icq; -+ } -+ -+ return NULL; -+} -+ - /* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. -@@ -2454,10 +2469,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, - * returned by bfq_bic_lookup does not go away before - * bfqd->lock is taken. - */ -- struct bfq_io_cq *bic = bfq_bic_lookup(q); -+ struct bfq_io_cq *bic = bfq_bic_try_lookup(q); - bool ret; - -- spin_lock_irq(&bfqd->lock); -+ /* -+ * bio merging is called for every bio queued, and it's very easy -+ * to run into contention because of that. If we fail getting -+ * the dd lock, just skip this merge attempt. For related IO, the -+ * plug will be the successful merging point. If we get here, we -+ * already failed doing the obvious merge. Chances of actually -+ * getting a merge off this path is a lot slimmer, so skipping an -+ * occassional lookup that will most likely not succeed anyway should -+ * not be a problem. -+ */ -+ if (!spin_trylock_irq(&bfqd->lock)) -+ return false; - - if (bic) { - /* -@@ -5148,6 +5174,10 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - -+ if (!list_empty_careful(&bfqd->at_head) || -+ !list_empty_careful(&bfqd->at_tail)) -+ return true; -+ - /* - * Avoiding lock: a race on bfqd->queued should cause at - * most a call to dispatch for nothing -@@ -5297,15 +5327,61 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q, - bool idle_timer_disabled) {} - #endif /* CONFIG_BFQ_CGROUP_DEBUG */ - -+static void bfq_insert_request(struct request_queue *q, struct request *rq, -+ blk_insert_t flags, struct list_head *free); -+ -+static void __bfq_do_insert(struct request_queue *q, blk_insert_t flags, -+ struct list_head *list, struct list_head *free) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ bfq_insert_request(q, rq, flags, free); -+ } -+} -+ -+static void bfq_do_insert(struct request_queue *q, struct list_head *free) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ LIST_HEAD(at_head); -+ LIST_HEAD(at_tail); -+ -+ spin_lock(&bfqd->insert_lock); -+ list_splice_init(&bfqd->at_head, &at_head); -+ list_splice_init(&bfqd->at_tail, &at_tail); -+ spin_unlock(&bfqd->insert_lock); -+ -+ __bfq_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free); -+ __bfq_do_insert(q, 0, &at_tail, free); -+} -+ - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { -- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *rq; - struct bfq_queue *in_serv_queue; - bool waiting_rq, idle_timer_disabled = false; -+ LIST_HEAD(free); -+ -+ /* -+ * If someone else is already dispatching, skip this one. 
This will -+ * defer the next dispatch event to when something completes, and could -+ * potentially lower the queue depth for contended cases. -+ * -+ * See the logic in blk_mq_do_dispatch_sched(), which loops and -+ * retries if nothing is dispatched. -+ */ -+ if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || -+ test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) -+ return NULL; - - spin_lock_irq(&bfqd->lock); - -+ bfq_do_insert(hctx->queue, &free); -+ - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - -@@ -5315,7 +5391,9 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); - } - -+ clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); - spin_unlock_irq(&bfqd->lock); -+ blk_mq_free_requests(&free); - bfq_update_dispatch_stats(hctx->queue, rq, - idle_timer_disabled ? in_serv_queue : NULL, - idle_timer_disabled); -@@ -6236,27 +6314,21 @@ static inline void bfq_update_insert_stats(struct request_queue *q, - - static struct bfq_queue *bfq_init_rq(struct request *rq); - --static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -- blk_insert_t flags) -+static void bfq_insert_request(struct request_queue *q, struct request *rq, -+ blk_insert_t flags, struct list_head *free) - { -- struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - bool idle_timer_disabled = false; - blk_opf_t cmd_flags; -- LIST_HEAD(free); - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) - bfqg_stats_update_legacy_io(q, rq); - #endif -- spin_lock_irq(&bfqd->lock); - bfqq = bfq_init_rq(rq); -- if (blk_mq_sched_try_insert_merge(q, rq, &free)) { -- spin_unlock_irq(&bfqd->lock); -- blk_mq_free_requests(&free); -+ if (blk_mq_sched_try_insert_merge(q, rq, free)) - return; -- } - - trace_block_rq_insert(rq); - -@@ -6286,8 +6358,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - * merge). - */ - cmd_flags = rq->cmd_flags; -- spin_unlock_irq(&bfqd->lock); -- - bfq_update_insert_stats(q, bfqq, idle_timer_disabled, - cmd_flags); - } -@@ -6296,13 +6366,15 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *list, - blk_insert_t flags) - { -- while (!list_empty(list)) { -- struct request *rq; -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; - -- rq = list_first_entry(list, struct request, queuelist); -- list_del_init(&rq->queuelist); -- bfq_insert_request(hctx, rq, flags); -- } -+ spin_lock_irq(&bfqd->insert_lock); -+ if (flags & BLK_MQ_INSERT_AT_HEAD) -+ list_splice_init(list, &bfqd->at_head); -+ else -+ list_splice_init(list, &bfqd->at_tail); -+ spin_unlock_irq(&bfqd->insert_lock); - } - - static void bfq_update_hw_tag(struct bfq_data *bfqd) -@@ -7211,6 +7283,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - q->elevator = eq; - spin_unlock_irq(&q->queue_lock); - -+ spin_lock_init(&bfqd->lock); -+ spin_lock_init(&bfqd->insert_lock); -+ -+ INIT_LIST_HEAD(&bfqd->at_head); -+ INIT_LIST_HEAD(&bfqd->at_tail); -+ - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
- * Grab a permanent reference to it, so that the normal code flow -@@ -7329,8 +7407,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - /* see comments on the definition of next field inside bfq_data */ - bfqd->actuator_load_threshold = 4; - -- spin_lock_init(&bfqd->lock); -- - /* - * The invocation of the next bfq_create_group_hierarchy - * function is the head of a chain of function calls -diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h -index 467e8cfc41a2..f44f5d4ec2f4 100644 ---- a/block/bfq-iosched.h -+++ b/block/bfq-iosched.h -@@ -504,12 +504,26 @@ struct bfq_io_cq { - unsigned int requests; /* Number of requests this process has in flight */ - }; - -+enum { -+ BFQ_DISPATCHING = 0, -+}; -+ - /** - * struct bfq_data - per-device data structure. - * - * All the fields are protected by @lock. - */ - struct bfq_data { -+ struct { -+ spinlock_t lock; -+ spinlock_t insert_lock; -+ } ____cacheline_aligned_in_smp; -+ -+ unsigned long run_state; -+ -+ struct list_head at_head; -+ struct list_head at_tail; -+ - /* device request queue */ - struct request_queue *queue; - /* dispatch queue */ -@@ -795,8 +809,6 @@ struct bfq_data { - /* fallback dummy bfqq for extreme OOM conditions */ - struct bfq_queue oom_bfqq; - -- spinlock_t lock; -- - /* - * bic associated with the task issuing current bio for - * merging. This and the next field are used as a support to -diff --git a/block/mq-deadline.c b/block/mq-deadline.c -index acdc28756d9d..8b214233a061 100644 ---- a/block/mq-deadline.c -+++ b/block/mq-deadline.c -@@ -79,10 +79,23 @@ struct dd_per_prio { - struct io_stats_per_prio stats; - }; - -+enum { -+ DD_DISPATCHING = 0, -+}; -+ - struct deadline_data { - /* - * run time data - */ -+ struct { -+ spinlock_t lock; -+ spinlock_t insert_lock; -+ } ____cacheline_aligned_in_smp; -+ -+ unsigned long run_state; -+ -+ struct list_head at_head; -+ struct list_head at_tail; - - struct dd_per_prio per_prio[DD_PRIO_COUNT]; - -@@ -100,8 +113,6 @@ struct deadline_data { - int front_merges; - u32 async_depth; - int prio_aging_expire; -- -- spinlock_t lock; - }; - - /* Maps an I/O priority class to a deadline scheduler priority. */ -@@ -112,6 +123,9 @@ static const enum dd_prio ioprio_class_to_prio[] = { - [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, - }; - -+static void dd_insert_request(struct request_queue *q, struct request *rq, -+ blk_insert_t flags, struct list_head *free); -+ - static inline struct rb_root * - deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) - { -@@ -451,6 +465,33 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, - return NULL; - } - -+static void __dd_do_insert(struct request_queue *q, blk_insert_t flags, -+ struct list_head *list, struct list_head *free) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ dd_insert_request(q, rq, flags, free); -+ } -+} -+ -+static void dd_do_insert(struct request_queue *q, struct list_head *free) -+{ -+ struct deadline_data *dd = q->elevator->elevator_data; -+ LIST_HEAD(at_head); -+ LIST_HEAD(at_tail); -+ -+ spin_lock(&dd->insert_lock); -+ list_splice_init(&dd->at_head, &at_head); -+ list_splice_init(&dd->at_tail, &at_tail); -+ spin_unlock(&dd->insert_lock); -+ -+ __dd_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free); -+ __dd_do_insert(q, 0, &at_tail, free); -+} -+ - /* - * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). 
- * -@@ -461,12 +502,27 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, - */ - static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) - { -- struct deadline_data *dd = hctx->queue->elevator->elevator_data; -+ struct request_queue *q = hctx->queue; -+ struct deadline_data *dd = q->elevator->elevator_data; - const unsigned long now = jiffies; - struct request *rq; - enum dd_prio prio; -+ LIST_HEAD(free); -+ -+ /* -+ * If someone else is already dispatching, skip this one. This will -+ * defer the next dispatch event to when something completes, and could -+ * potentially lower the queue depth for contended cases. -+ * -+ * See the logic in blk_mq_do_dispatch_sched(), which loops and -+ * retries if nothing is dispatched. -+ */ -+ if (test_bit(DD_DISPATCHING, &dd->run_state) || -+ test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state)) -+ return NULL; - - spin_lock(&dd->lock); -+ dd_do_insert(q, &free); - rq = dd_dispatch_prio_aged_requests(dd, now); - if (rq) - goto unlock; -@@ -482,8 +538,10 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) - } - - unlock: -+ clear_bit_unlock(DD_DISPATCHING, &dd->run_state); - spin_unlock(&dd->lock); - -+ blk_mq_free_requests(&free); - return rq; - } - -@@ -585,6 +643,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) - - eq->elevator_data = dd; - -+ spin_lock_init(&dd->lock); -+ spin_lock_init(&dd->insert_lock); -+ -+ INIT_LIST_HEAD(&dd->at_head); -+ INIT_LIST_HEAD(&dd->at_tail); -+ - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - -@@ -601,7 +665,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) - dd->last_dir = DD_WRITE; - dd->fifo_batch = fifo_batch; - dd->prio_aging_expire = prio_aging_expire; -- spin_lock_init(&dd->lock); - - /* We dispatch from request queue wide instead of hw queue */ - blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); -@@ -657,7 +720,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, - struct request *free = NULL; - bool ret; - -- spin_lock(&dd->lock); -+ /* -+ * bio merging is called for every bio queued, and it's very easy -+ * to run into contention because of that. If we fail getting -+ * the dd lock, just skip this merge attempt. For related IO, the -+ * plug will be the successful merging point. If we get here, we -+ * already failed doing the obvious merge. Chances of actually -+ * getting a merge off this path is a lot slimmer, so skipping an -+ * occassional lookup that will most likely not succeed anyway should -+ * not be a problem. 
-+ */ -+ if (!spin_trylock(&dd->lock)) -+ return false; -+ - ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); - spin_unlock(&dd->lock); - -@@ -670,10 +745,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, - /* - * add rq to rbtree and fifo - */ --static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -+static void dd_insert_request(struct request_queue *q, struct request *rq, - blk_insert_t flags, struct list_head *free) - { -- struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - const enum dd_data_dir data_dir = rq_data_dir(rq); - u16 ioprio = req_get_ioprio(rq); -@@ -727,19 +801,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, - { - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; -- LIST_HEAD(free); -- -- spin_lock(&dd->lock); -- while (!list_empty(list)) { -- struct request *rq; -- -- rq = list_first_entry(list, struct request, queuelist); -- list_del_init(&rq->queuelist); -- dd_insert_request(hctx, rq, flags, &free); -- } -- spin_unlock(&dd->lock); - -- blk_mq_free_requests(&free); -+ spin_lock(&dd->insert_lock); -+ if (flags & BLK_MQ_INSERT_AT_HEAD) -+ list_splice_init(list, &dd->at_head); -+ else -+ list_splice_init(list, &dd->at_tail); -+ spin_unlock(&dd->insert_lock); - } - - /* Callback from inside blk_mq_rq_ctx_init(). */ -@@ -780,6 +848,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - enum dd_prio prio; - -+ if (!list_empty_careful(&dd->at_head) || -+ !list_empty_careful(&dd->at_tail)) -+ return true; -+ - for (prio = 0; prio <= DD_PRIO_MAX; prio++) - if (dd_has_work_for_prio(&dd->per_prio[prio])) - return true; --- -2.46.0.rc1 - diff --git a/sys-kernel/gentoo-sources-6.11.3+/0001-eevdf-next.patch b/sys-kernel/gentoo-sources-6.11.3+/0001-eevdf-next.patch new file mode 100644 index 0000000..8a554c3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11.3+/0001-eevdf-next.patch @@ -0,0 +1,4444 @@ +From 5a335b2c05a76e727dad94990bd8d78b220829c3 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 10 Oct 2024 12:45:33 +0200 +Subject: [PATCH] eevdf-next + +Signed-off-by: Peter Jung +--- + Documentation/scheduler/sched-deadline.rst | 14 +- + drivers/cpufreq/cppc_cpufreq.c | 6 +- + fs/bcachefs/six.c | 2 +- + fs/select.c | 2 +- + include/linux/ioprio.h | 2 +- + include/linux/sched.h | 26 +- + include/linux/sched/deadline.h | 14 +- + include/linux/sched/prio.h | 1 + + include/linux/sched/rt.h | 33 +- + include/uapi/linux/sched/types.h | 6 +- + kernel/freezer.c | 2 +- + kernel/locking/rtmutex.c | 4 +- + kernel/locking/rwsem.c | 4 +- + kernel/locking/ww_mutex.h | 2 +- + kernel/sched/core.c | 273 +++++--- + kernel/sched/cpufreq_schedutil.c | 6 +- + kernel/sched/deadline.c | 465 ++++++++++--- + kernel/sched/debug.c | 198 +++++- + kernel/sched/fair.c | 750 ++++++++++++++++----- + kernel/sched/features.h | 30 +- + kernel/sched/idle.c | 23 +- + kernel/sched/rt.c | 261 +++---- + kernel/sched/sched.h | 102 ++- + kernel/sched/stats.h | 10 + + kernel/sched/stop_task.c | 18 +- + kernel/sched/syscalls.c | 132 +--- + kernel/sched/topology.c | 8 + + kernel/time/hrtimer.c | 6 +- + kernel/trace/trace_sched_wakeup.c | 2 +- + mm/page-writeback.c | 4 +- + mm/page_alloc.c | 2 +- + 31 files changed, 1688 insertions(+), 720 deletions(-) + +diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst +index 9fe4846079bb..22838ed8e13a 100644 
+--- a/Documentation/scheduler/sched-deadline.rst ++++ b/Documentation/scheduler/sched-deadline.rst +@@ -749,21 +749,19 @@ Appendix A. Test suite + of the command line options. Please refer to rt-app documentation for more + details (`/doc/*.json`). + +- The second testing application is a modification of schedtool, called +- schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a +- certain pid/application. schedtool-dl is available at: +- https://github.com/scheduler-tools/schedtool-dl.git. ++ The second testing application is done using chrt which has support ++ for SCHED_DEADLINE. + + The usage is straightforward:: + +- # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app ++ # chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app + + With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation +- of 10ms every 100ms (note that parameters are expressed in microseconds). +- You can also use schedtool to create a reservation for an already running ++ of 10ms every 100ms (note that parameters are expressed in nanoseconds). ++ You can also use chrt to create a reservation for an already running + application, given that you know its pid:: + +- # schedtool -E -t 10000000:100000000 my_app_pid ++ # chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid + + Appendix B. Minimal main() + ========================== +diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c +index bafa32dd375d..1a5ad184d28f 100644 +--- a/drivers/cpufreq/cppc_cpufreq.c ++++ b/drivers/cpufreq/cppc_cpufreq.c +@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void) + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ +- .sched_runtime = 1000000, +- .sched_deadline = 10000000, +- .sched_period = 10000000, ++ .sched_runtime = NSEC_PER_MSEC, ++ .sched_deadline = 10 * NSEC_PER_MSEC, ++ .sched_period = 10 * NSEC_PER_MSEC, + }; + int ret; + +diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c +index 3a494c5d1247..9cbd3c14c94f 100644 +--- a/fs/bcachefs/six.c ++++ b/fs/bcachefs/six.c +@@ -335,7 +335,7 @@ static inline bool six_owner_running(struct six_lock *lock) + */ + rcu_read_lock(); + struct task_struct *owner = READ_ONCE(lock->owner); +- bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); ++ bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); + rcu_read_unlock(); + + return ret; +diff --git a/fs/select.c b/fs/select.c +index bc185d111436..bc5762b03945 100644 +--- a/fs/select.c ++++ b/fs/select.c +@@ -82,7 +82,7 @@ u64 select_estimate_accuracy(struct timespec64 *tv) + * Realtime tasks get a slack of 0 for obvious reasons. + */ + +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + return 0; + + ktime_get_ts64(&now); +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index db1249cd9692..b25377b6ea98 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task) + { + if (task->policy == SCHED_IDLE) + return IOPRIO_CLASS_IDLE; +- else if (task_is_realtime(task)) ++ else if (rt_or_dl_task_policy(task)) + return IOPRIO_CLASS_RT; + else + return IOPRIO_CLASS_BE; +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 1c771ea4481d..57cf27a3045c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -149,8 +149,9 @@ struct user_event_mm; + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). 
+ */ +-#define is_special_task_state(state) \ +- ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) ++#define is_special_task_state(state) \ ++ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ ++ TASK_DEAD | TASK_FROZEN)) + + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + # define debug_normal_state_change(state_value) \ +@@ -541,9 +542,14 @@ struct sched_entity { + struct rb_node run_node; + u64 deadline; + u64 min_vruntime; ++ u64 min_slice; + + struct list_head group_node; +- unsigned int on_rq; ++ unsigned char on_rq; ++ unsigned char sched_delayed; ++ unsigned char rel_deadline; ++ unsigned char custom_slice; ++ /* hole */ + + u64 exec_start; + u64 sum_exec_runtime; +@@ -641,12 +647,24 @@ struct sched_dl_entity { + * overruns. + * + * @dl_server tells if this is a server entity. ++ * ++ * @dl_defer tells if this is a deferred or regular server. For ++ * now only defer server exists. ++ * ++ * @dl_defer_armed tells if the deferrable server is waiting ++ * for the replenishment timer to activate it. ++ * ++ * @dl_defer_running tells if the deferrable server is actually ++ * running, skipping the defer phase. + */ + unsigned int dl_throttled : 1; + unsigned int dl_yielded : 1; + unsigned int dl_non_contending : 1; + unsigned int dl_overrun : 1; + unsigned int dl_server : 1; ++ unsigned int dl_defer : 1; ++ unsigned int dl_defer_armed : 1; ++ unsigned int dl_defer_running : 1; + + /* + * Bandwidth enforcement timer. Each -deadline task has its +@@ -674,7 +692,7 @@ struct sched_dl_entity { + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; +- dl_server_pick_f server_pick; ++ dl_server_pick_f server_pick_task; + + #ifdef CONFIG_RT_MUTEXES + /* +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index df3aca89d4f5..3a912ab42bb5 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -10,16 +10,16 @@ + + #include + +-#define MAX_DL_PRIO 0 +- +-static inline int dl_prio(int prio) ++static inline bool dl_prio(int prio) + { +- if (unlikely(prio < MAX_DL_PRIO)) +- return 1; +- return 0; ++ return unlikely(prio < MAX_DL_PRIO); + } + +-static inline int dl_task(struct task_struct *p) ++/* ++ * Returns true if a task has a priority that belongs to DL class. PI-boosted ++ * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. ++ */ ++static inline bool dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..6ab43b4f72f9 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -14,6 +14,7 @@ + */ + + #define MAX_RT_PRIO 100 ++#define MAX_DL_PRIO 0 + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index b2b9e6eb9683..4e3338103654 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -6,19 +6,40 @@ + + struct task_struct; + +-static inline int rt_prio(int prio) ++static inline bool rt_prio(int prio) + { +- if (unlikely(prio < MAX_RT_PRIO)) +- return 1; +- return 0; ++ return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); + } + +-static inline int rt_task(struct task_struct *p) ++static inline bool rt_or_dl_prio(int prio) ++{ ++ return unlikely(prio < MAX_RT_PRIO); ++} ++ ++/* ++ * Returns true if a task has a priority that belongs to RT class. PI-boosted ++ * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. 
++ */ ++static inline bool rt_task(struct task_struct *p) + { + return rt_prio(p->prio); + } + +-static inline bool task_is_realtime(struct task_struct *tsk) ++/* ++ * Returns true if a task has a priority that belongs to RT or DL classes. ++ * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore ++ * PI-boosted tasks. ++ */ ++static inline bool rt_or_dl_task(struct task_struct *p) ++{ ++ return rt_or_dl_prio(p->prio); ++} ++ ++/* ++ * Returns true if a task has a policy that belongs to RT or DL classes. ++ * PI-boosted tasks will return false. ++ */ ++static inline bool rt_or_dl_task_policy(struct task_struct *tsk) + { + int policy = tsk->policy; + +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index 90662385689b..bf6e9ae031c1 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -58,9 +58,9 @@ + * + * This is reflected by the following fields of the sched_attr structure: + * +- * @sched_deadline representative of the task's deadline +- * @sched_runtime representative of the task's runtime +- * @sched_period representative of the task's period ++ * @sched_deadline representative of the task's deadline in nanoseconds ++ * @sched_runtime representative of the task's runtime in nanoseconds ++ * @sched_period representative of the task's period in nanoseconds + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their +diff --git a/kernel/freezer.c b/kernel/freezer.c +index f57aaf96b829..44bbd7dbd2c8 100644 +--- a/kernel/freezer.c ++++ b/kernel/freezer.c +@@ -72,7 +72,7 @@ bool __refrigerator(bool check_kthr_stop) + bool freeze; + + raw_spin_lock_irq(¤t->pi_lock); +- set_current_state(TASK_FROZEN); ++ WRITE_ONCE(current->__state, TASK_FROZEN); + /* unstale saved_state so that __thaw_task() will wake us up */ + current->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(¤t->pi_lock); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index fba1229f1de6..ebebd0eec7f6 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -347,7 +347,7 @@ static __always_inline int __waiter_prio(struct task_struct *task) + { + int prio = task->prio; + +- if (!rt_prio(prio)) ++ if (!rt_or_dl_prio(prio)) + return DEFAULT_PRIO; + + return prio; +@@ -435,7 +435,7 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, + * Note that RT tasks are excluded from same priority (lateral) + * steals to prevent the introduction of an unbounded latency. + */ +- if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) ++ if (rt_or_dl_prio(waiter->tree.prio)) + return false; + + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 3277df47ab3c..299b793d55e1 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -631,7 +631,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + * if it is an RT task or wait in the wait queue + * for too long. 
+ */ +- if (has_handoff || (!rt_task(waiter->task) && ++ if (has_handoff || (!rt_or_dl_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) + return false; + +@@ -916,7 +916,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; +- if (rt_task(current) && ++ if (rt_or_dl_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } +diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h +index 3ad2cc4823e5..76d204b7d29c 100644 +--- a/kernel/locking/ww_mutex.h ++++ b/kernel/locking/ww_mutex.h +@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) + int a_prio = a->task->prio; + int b_prio = b->task->prio; + +- if (rt_prio(a_prio) || rt_prio(b_prio)) { ++ if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) { + + if (a_prio > b_prio) + return true; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 1af59cf714cd..6ea3c49788a4 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -163,7 +163,10 @@ static inline int __task_prio(const struct task_struct *p) + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + +- if (rt_prio(p->prio)) /* includes deadline */ ++ if (p->dl_server) ++ return -1; /* deadline */ ++ ++ if (rt_or_dl_prio(p->prio)) + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) +@@ -192,8 +195,24 @@ static inline bool prio_less(const struct task_struct *a, + if (-pb < -pa) + return false; + +- if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ +- return !dl_time_before(a->dl.deadline, b->dl.deadline); ++ if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ ++ const struct sched_dl_entity *a_dl, *b_dl; ++ ++ a_dl = &a->dl; ++ /* ++ * Since,'a' and 'b' can be CFS tasks served by DL server, ++ * __task_prio() can return -1 (for DL) even for those. In that ++ * case, get to the dl_server's DL entity. ++ */ ++ if (a->dl_server) ++ a_dl = a->dl_server; ++ ++ b_dl = &b->dl; ++ if (b->dl_server) ++ b_dl = b->dl_server; ++ ++ return !dl_time_before(a_dl->deadline, b_dl->deadline); ++ } + + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); +@@ -240,6 +259,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) + + void sched_core_enqueue(struct rq *rq, struct task_struct *p) + { ++ if (p->se.sched_delayed) ++ return; ++ + rq->core->core_task_seq++; + + if (!p->core_cookie) +@@ -250,6 +272,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) + + void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) + { ++ if (p->se.sched_delayed) ++ return; ++ + rq->core->core_task_seq++; + + if (sched_core_enqueued(p)) { +@@ -1269,7 +1294,7 @@ bool sched_can_stop_tick(struct rq *rq) + * dequeued by migrating while the constrained task continues to run. + * E.g. going from 2->1 without going through pick_next_task(). 
+ */ +- if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { ++ if (__need_bw_check(rq, rq->curr)) { + if (cfs_task_bw_constrained(rq->curr)) + return false; + } +@@ -1672,6 +1697,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + ++ if (p->se.sched_delayed) ++ return; ++ + for_each_clamp_id(clamp_id) + uclamp_rq_inc_id(rq, p, clamp_id); + +@@ -1696,6 +1724,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + ++ if (p->se.sched_delayed) ++ return; ++ + for_each_clamp_id(clamp_id) + uclamp_rq_dec_id(rq, p, clamp_id); + } +@@ -1967,22 +1998,38 @@ unsigned long get_wchan(struct task_struct *p) + + void enqueue_task(struct rq *rq, struct task_struct *p, int flags) + { ++ bool wakee_not_migrated = (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED); ++ + if (!(flags & ENQUEUE_NOCLOCK)) + update_rq_clock(rq); + + if (!(flags & ENQUEUE_RESTORE)) { + sched_info_enqueue(rq, p); +- psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); ++ ++ /* Notify PSI that the task was migrated in a delayed state before wakeup. */ ++ if ((p->migration_flags & DELAYED_MIGRATED) && !task_on_rq_migrating(p)) { ++ wakee_not_migrated = false; ++ p->migration_flags &= ~DELAYED_MIGRATED; ++ } + } + +- uclamp_rq_inc(rq, p); + p->sched_class->enqueue_task(rq, p, flags); ++ /* ++ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear ++ * ->sched_delayed. ++ */ ++ uclamp_rq_inc(rq, p); ++ if (!(flags & ENQUEUE_RESTORE)) ++ psi_enqueue(p, wakee_not_migrated); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); + } + +-void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++/* ++ * Must only return false when DEQUEUE_SLEEP. ++ */ ++inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) + { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p, flags); +@@ -1993,10 +2040,17 @@ void dequeue_task(struct rq *rq, struct task_struct *p, int flags) + if (!(flags & DEQUEUE_SAVE)) { + sched_info_dequeue(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ if (p->se.sched_delayed && task_on_rq_migrating(p)) ++ p->migration_flags |= DELAYED_MIGRATED; + } + ++ /* ++ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' ++ * and mark the task ->sched_delayed. ++ */ + uclamp_rq_dec(rq, p); +- p->sched_class->dequeue_task(rq, p, flags); ++ return p->sched_class->dequeue_task(rq, p, flags); + } + + void activate_task(struct rq *rq, struct task_struct *p, int flags) +@@ -2014,12 +2068,25 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) + + void deactivate_task(struct rq *rq, struct task_struct *p, int flags) + { +- WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); ++ SCHED_WARN_ON(flags & DEQUEUE_SLEEP); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + ++ /* ++ * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* ++ * dequeue_task() and cleared *after* enqueue_task(). ++ */ ++ + dequeue_task(rq, p, flags); + } + ++static void block_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) ++ __block_task(rq, p); ++} ++ + /** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. 
+@@ -2233,6 +2300,12 @@ void migrate_disable(void) + struct task_struct *p = current; + + if (p->migration_disabled) { ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ *Warn about overflow half-way through the range. ++ */ ++ WARN_ON_ONCE((s16)p->migration_disabled < 0); ++#endif + p->migration_disabled++; + return; + } +@@ -2251,14 +2324,20 @@ void migrate_enable(void) + .flags = SCA_MIGRATE_ENABLE, + }; + ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Check both overflow from migrate_disable() and superfluous ++ * migrate_enable(). ++ */ ++ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) ++ return; ++#endif ++ + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + +- if (WARN_ON_ONCE(!p->migration_disabled)) +- return; +- + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). +@@ -3607,8 +3686,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + rq->idle_stamp = 0; + } + #endif +- +- p->dl_server = NULL; + } + + /* +@@ -3644,12 +3721,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + + rq = __task_rq_lock(p, &rf); + if (task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ if (p->se.sched_delayed) ++ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_WAKEUP | ENQUEUE_DELAYED); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ +- update_rq_clock(rq); + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); +@@ -4029,11 +4108,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + * case the whole 'p->on_rq && ttwu_runnable()' case below + * without taking any locks. + * ++ * Specifically, given current runs ttwu() we must be before ++ * schedule()'s block_task(), as such this must not observe ++ * sched_delayed. ++ * + * In particular: + * - we rely on Program-Order guarantees for all the ordering, + * - we're serialized against set_special_state() by virtue of + * it disabling IRQs (this allows not taking ->pi_lock). + */ ++ SCHED_WARN_ON(p->se.sched_delayed); + if (!ttwu_state_match(p, state, &success)) + goto out; + +@@ -4322,9 +4406,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.nr_migrations = 0; + p->se.vruntime = 0; + p->se.vlag = 0; +- p->se.slice = sysctl_sched_base_slice; + INIT_LIST_HEAD(&p->se.group_node); + ++ /* A delayed task cannot be in clone(). */ ++ SCHED_WARN_ON(p->se.sched_delayed); ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; + #endif +@@ -4572,6 +4658,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; + + /* + * We don't need the reset flag anymore after the fork. 
It has +@@ -4686,7 +4774,7 @@ void wake_up_new_task(struct task_struct *p) + update_rq_clock(rq); + post_init_entity_util_avg(p); + +- activate_task(rq, p, ENQUEUE_NOCLOCK); ++ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); + trace_sched_wakeup_new(p); + wakeup_preempt(rq, p, WF_FORK); + #ifdef CONFIG_SMP +@@ -5769,8 +5857,8 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) + schedstat_inc(this_rq()->sched_count); + } + +-static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, +- struct rq_flags *rf) ++static void prev_balance(struct rq *rq, struct task_struct *prev, ++ struct rq_flags *rf) + { + #ifdef CONFIG_SMP + const struct sched_class *class; +@@ -5787,16 +5875,6 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + break; + } + #endif +- +- put_prev_task(rq, prev); +- +- /* +- * We've updated @prev and no longer need the server link, clear it. +- * Must be done before ->pick_next_task() because that can (re)set +- * ->dl_server. +- */ +- if (prev->dl_server) +- prev->dl_server = NULL; + } + + /* +@@ -5808,6 +5886,8 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + const struct sched_class *class; + struct task_struct *p; + ++ rq->dl_server = NULL; ++ + /* + * Optimization: we know that if all tasks are in the fair class we can + * call that function directly, but only if the @prev task wasn't of a +@@ -5823,34 +5903,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + /* Assume the next prioritized class is idle_sched_class */ + if (!p) { +- put_prev_task(rq, prev); +- p = pick_next_task_idle(rq); ++ p = pick_task_idle(rq); ++ put_prev_set_next_task(rq, prev, p); + } + +- /* +- * This is a normal CFS pick, but the previous could be a DL pick. +- * Clear it as previous is no longer picked. +- */ +- if (prev->dl_server) +- prev->dl_server = NULL; +- +- /* +- * This is the fast path; it cannot be a DL server pick; +- * therefore even if @p == @prev, ->dl_server must be NULL. +- */ +- if (p->dl_server) +- p->dl_server = NULL; +- + return p; + } + + restart: +- put_prev_task_balance(rq, prev, rf); ++ prev_balance(rq, prev, rf); + + for_each_class(class) { +- p = class->pick_next_task(rq); +- if (p) +- return p; ++ if (class->pick_next_task) { ++ p = class->pick_next_task(rq, prev); ++ if (p) ++ return p; ++ } else { ++ p = class->pick_task(rq); ++ if (p) { ++ put_prev_set_next_task(rq, prev, p); ++ return p; ++ } ++ } + } + + BUG(); /* The idle class should always have a runnable task. */ +@@ -5880,6 +5954,8 @@ static inline struct task_struct *pick_task(struct rq *rq) + const struct sched_class *class; + struct task_struct *p; + ++ rq->dl_server = NULL; ++ + for_each_class(class) { + p = class->pick_task(rq); + if (p) +@@ -5918,6 +5994,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * another cpu during offline. 
+ */ + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + return __pick_next_task(rq, prev, rf); + } + +@@ -5936,16 +6013,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; +- if (next != prev) { +- put_prev_task(rq, prev); +- set_next_task(rq, next); +- } +- ++ rq->dl_server = rq->core_dl_server; + rq->core_pick = NULL; +- goto out; ++ rq->core_dl_server = NULL; ++ goto out_set_next; + } + +- put_prev_task_balance(rq, prev, rf); ++ prev_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; +@@ -5986,6 +6060,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + next = pick_task(rq); + if (!next->core_cookie) { + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. +@@ -6013,7 +6088,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) + update_rq_clock(rq_i); + +- p = rq_i->core_pick = pick_task(rq_i); ++ rq_i->core_pick = p = pick_task(rq_i); ++ rq_i->core_dl_server = rq_i->dl_server; ++ + if (!max || prio_less(max, p, fi_before)) + max = p; + } +@@ -6037,6 +6114,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + + rq_i->core_pick = p; ++ rq_i->core_dl_server = NULL; + + if (p == rq_i->idle) { + if (rq_i->nr_running) { +@@ -6097,6 +6175,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + if (i == cpu) { + rq_i->core_pick = NULL; ++ rq_i->core_dl_server = NULL; + continue; + } + +@@ -6105,6 +6184,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; ++ rq_i->core_dl_server = NULL; + continue; + } + +@@ -6112,8 +6192,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + + out_set_next: +- set_next_task(rq, next); +-out: ++ put_prev_set_next_task(rq, prev, next); + if (rq->core->core_forceidle_count && next == rq->idle) + queue_core_balance(rq); + +@@ -6349,19 +6428,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a +- * preemption from blocking on an 'sleeping' spin/rwlock. Note that +- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to +- * optimize the AND operation out and just check for zero. ++ * preemption from blocking on an 'sleeping' spin/rwlock. + */ +-#define SM_NONE 0x0 +-#define SM_PREEMPT 0x1 +-#define SM_RTLOCK_WAIT 0x2 +- +-#ifndef CONFIG_PREEMPT_RT +-# define SM_MASK_PREEMPT (~0U) +-#else +-# define SM_MASK_PREEMPT SM_PREEMPT +-#endif ++#define SM_IDLE (-1) ++#define SM_NONE 0 ++#define SM_PREEMPT 1 ++#define SM_RTLOCK_WAIT 2 + + /* + * __schedule() is the main scheduler function. +@@ -6402,9 +6474,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * + * WARNING: must be called with preemption disabled! + */ +-static void __sched notrace __schedule(unsigned int sched_mode) ++static void __sched notrace __schedule(int sched_mode) + { + struct task_struct *prev, *next; ++ /* ++ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted ++ * as a preemption by schedule_debug() and RCU. 
++ */ ++ bool preempt = sched_mode > SM_NONE; ++ bool block = false; + unsigned long *switch_count; + unsigned long prev_state; + struct rq_flags rf; +@@ -6415,13 +6493,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) + rq = cpu_rq(cpu); + prev = rq->curr; + +- schedule_debug(prev, !!sched_mode); ++ schedule_debug(prev, preempt); + + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) + hrtick_clear(rq); + + local_irq_disable(); +- rcu_note_context_switch(!!sched_mode); ++ rcu_note_context_switch(preempt); + + /* + * Make sure that signal_pending_state()->signal_pending() below +@@ -6450,22 +6528,32 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + switch_count = &prev->nivcsw; + ++ /* Task state changes only considers SM_PREEMPT as preemption */ ++ preempt = sched_mode == SM_PREEMPT; ++ + /* + * We must load prev->state once (task_struct::state is volatile), such + * that we form a control dependency vs deactivate_task() below. + */ + prev_state = READ_ONCE(prev->__state); +- if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { ++ if (sched_mode == SM_IDLE) { ++ if (!rq->nr_running) { ++ next = prev; ++ goto picked; ++ } ++ } else if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { + WRITE_ONCE(prev->__state, TASK_RUNNING); + } else { ++ int flags = DEQUEUE_NOCLOCK; ++ + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev_state & TASK_FROZEN); + +- if (prev->sched_contributes_to_load) +- rq->nr_uninterruptible++; ++ if (unlikely(is_special_task_state(prev_state))) ++ flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() +@@ -6478,17 +6566,14 @@ static void __sched notrace __schedule(unsigned int sched_mode) + * + * After this, schedule() must not care about p->state any more. 
+ */ +- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); +- +- if (prev->in_iowait) { +- atomic_inc(&rq->nr_iowait); +- delayacct_blkio_start(); +- } ++ block_task(rq, prev, flags); ++ block = true; + } + switch_count = &prev->nvcsw; + } + + next = pick_next_task(rq, prev, &rf); ++picked: + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG +@@ -6528,9 +6613,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + migrate_disable_switch(rq, prev); + psi_account_irqtime(rq, prev, next); +- psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ psi_sched_switch(prev, next, block); + +- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); ++ trace_sched_switch(preempt, prev, next, prev_state); + + /* Also unlocks the rq: */ + rq = context_switch(rq, prev, next, &rf); +@@ -6606,7 +6691,7 @@ static void sched_update_worker(struct task_struct *tsk) + } + } + +-static __always_inline void __schedule_loop(unsigned int sched_mode) ++static __always_inline void __schedule_loop(int sched_mode) + { + do { + preempt_disable(); +@@ -6651,7 +6736,7 @@ void __sched schedule_idle(void) + */ + WARN_ON_ONCE(current->__state); + do { +- __schedule(SM_NONE); ++ __schedule(SM_IDLE); + } while (need_resched()); + } + +@@ -8235,8 +8320,6 @@ void __init sched_init(void) + #endif /* CONFIG_RT_GROUP_SCHED */ + } + +- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); +- + #ifdef CONFIG_SMP + init_defrootdomain(); + #endif +@@ -8291,8 +8374,13 @@ void __init sched_init(void) + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); + #endif /* CONFIG_FAIR_GROUP_SCHED */ + +- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; + #ifdef CONFIG_RT_GROUP_SCHED ++ /* ++ * This is required for init cpu because rt.c:__enable_runtime() ++ * starts working after scheduler_running, which is not the case ++ * yet. ++ */ ++ rq->rt.rt_runtime = global_rt_runtime(); + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); + #endif + #ifdef CONFIG_SMP +@@ -8324,10 +8412,12 @@ void __init sched_init(void) + #endif /* CONFIG_SMP */ + hrtick_rq_init(rq); + atomic_set(&rq->nr_iowait, 0); ++ fair_server_init(rq); + + #ifdef CONFIG_SCHED_CORE + rq->core = rq; + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle_count = 0; +@@ -8340,6 +8430,7 @@ void __init sched_init(void) + } + + set_load_weight(&init_task, false); ++ init_task.se.slice = sysctl_sched_base_slice, + + /* + * The boot idle thread does lazy MMU switching as well: +@@ -8555,7 +8646,7 @@ void normalize_rt_tasks(void) + schedstat_set(p->stats.sleep_start, 0); + schedstat_set(p->stats.block_start, 0); + +- if (!dl_task(p) && !rt_task(p)) { ++ if (!rt_or_dl_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index eece6244f9d2..43111a515a28 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -654,9 +654,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. 
+ */ +- .sched_runtime = 1000000, +- .sched_deadline = 10000000, +- .sched_period = 10000000, ++ .sched_runtime = NSEC_PER_MSEC, ++ .sched_deadline = 10 * NSEC_PER_MSEC, ++ .sched_period = 10 * NSEC_PER_MSEC, + }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index c5a3691ba6cc..9ce93d0bf452 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -320,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) + __sub_running_bw(dl_se->dl_bw, dl_rq); + } + +-static void dl_change_utilization(struct task_struct *p, u64 new_bw) ++static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) + { +- struct rq *rq; +- +- WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); +- +- if (task_on_rq_queued(p)) +- return; ++ if (dl_se->dl_non_contending) { ++ sub_running_bw(dl_se, &rq->dl); ++ dl_se->dl_non_contending = 0; + +- rq = task_rq(p); +- if (p->dl.dl_non_contending) { +- sub_running_bw(&p->dl, &rq->dl); +- p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be canceled, inactive_task_timer() +@@ -340,13 +333,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) + * will not touch the rq's active utilization, + * so we are still safe. + */ +- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) +- put_task_struct(p); ++ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { ++ if (!dl_server(dl_se)) ++ put_task_struct(dl_task_of(dl_se)); ++ } + } +- __sub_rq_bw(p->dl.dl_bw, &rq->dl); ++ __sub_rq_bw(dl_se->dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); + } + ++static void dl_change_utilization(struct task_struct *p, u64 new_bw) ++{ ++ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); ++ ++ if (task_on_rq_queued(p)) ++ return; ++ ++ dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); ++} ++ + static void __dl_clear_params(struct sched_dl_entity *dl_se); + + /* +@@ -771,6 +776,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; ++ ++ /* ++ * If it is a deferred reservation, and the server ++ * is not handling an starvation case, defer it. ++ */ ++ if (dl_se->dl_defer & !dl_se->dl_defer_running) { ++ dl_se->dl_throttled = 1; ++ dl_se->dl_defer_armed = 1; ++ } + } + + /* +@@ -809,6 +823,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) + replenish_dl_new_period(dl_se, rq); + } + ++static int start_dl_timer(struct sched_dl_entity *dl_se); ++static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); ++ + /* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus +@@ -837,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. ++ * ++ * Or, it could be the case of a deferred reservation that ++ * was not able to consume its runtime in background and ++ * reached this point with current u > U. ++ * ++ * In both cases, set a new period. 
+ */ +- if (dl_se->dl_deadline == 0) +- replenish_dl_new_period(dl_se, rq); ++ if (dl_se->dl_deadline == 0 || ++ (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { ++ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; ++ dl_se->runtime = pi_of(dl_se)->dl_runtime; ++ } + + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; +@@ -873,6 +899,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; ++ ++ /* ++ * If this is the replenishment of a deferred reservation, ++ * clear the flag and return. ++ */ ++ if (dl_se->dl_defer_armed) { ++ dl_se->dl_defer_armed = 0; ++ return; ++ } ++ ++ /* ++ * A this point, if the deferred server is not armed, and the deadline ++ * is in the future, if it is not running already, throttle the server ++ * and arm the defer timer. ++ */ ++ if (dl_se->dl_defer && !dl_se->dl_defer_running && ++ dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { ++ if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { ++ ++ /* ++ * Set dl_se->dl_defer_armed and dl_throttled variables to ++ * inform the start_dl_timer() that this is a deferred ++ * activation. ++ */ ++ dl_se->dl_defer_armed = 1; ++ dl_se->dl_throttled = 1; ++ if (!start_dl_timer(dl_se)) { ++ /* ++ * If for whatever reason (delays), a previous timer was ++ * queued but not serviced, cancel it and clean the ++ * deferrable server variables intended for start_dl_timer(). ++ */ ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; ++ } ++ } ++ } + } + + /* +@@ -1023,6 +1087,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) + } + + replenish_dl_new_period(dl_se, rq); ++ } else if (dl_server(dl_se) && dl_se->dl_defer) { ++ /* ++ * The server can still use its previous deadline, so check if ++ * it left the dl_defer_running state. ++ */ ++ if (!dl_se->dl_defer_running) { ++ dl_se->dl_defer_armed = 1; ++ dl_se->dl_throttled = 1; ++ } + } + } + +@@ -1055,8 +1128,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. ++ * ++ * The deferred reservation will have its timer set to ++ * (deadline - runtime). At that point, the CBS rule will decide ++ * if the current deadline can be used, or if a replenishment is ++ * required to avoid add too much pressure on the system ++ * (current u > U). 
+ */ +- act = ns_to_ktime(dl_next_period(dl_se)); ++ if (dl_se->dl_defer_armed) { ++ WARN_ON_ONCE(!dl_se->dl_throttled); ++ act = ns_to_ktime(dl_se->deadline - dl_se->runtime); ++ } else { ++ /* act = deadline - rel-deadline + period */ ++ act = ns_to_ktime(dl_next_period(dl_se)); ++ } ++ + now = hrtimer_cb_get_time(timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); +@@ -1106,6 +1192,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) + #endif + } + ++/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ ++static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; ++ ++static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) ++{ ++ struct rq *rq = rq_of_dl_se(dl_se); ++ u64 fw; ++ ++ scoped_guard (rq_lock, rq) { ++ struct rq_flags *rf = &scope.rf; ++ ++ if (!dl_se->dl_throttled || !dl_se->dl_runtime) ++ return HRTIMER_NORESTART; ++ ++ sched_clock_tick(); ++ update_rq_clock(rq); ++ ++ if (!dl_se->dl_runtime) ++ return HRTIMER_NORESTART; ++ ++ if (!dl_se->server_has_tasks(dl_se)) { ++ replenish_dl_entity(dl_se); ++ return HRTIMER_NORESTART; ++ } ++ ++ if (dl_se->dl_defer_armed) { ++ /* ++ * First check if the server could consume runtime in background. ++ * If so, it is possible to push the defer timer for this amount ++ * of time. The dl_server_min_res serves as a limit to avoid ++ * forwarding the timer for a too small amount of time. ++ */ ++ if (dl_time_before(rq_clock(dl_se->rq), ++ (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { ++ ++ /* reset the defer timer */ ++ fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; ++ ++ hrtimer_forward_now(timer, ns_to_ktime(fw)); ++ return HRTIMER_RESTART; ++ } ++ ++ dl_se->dl_defer_running = 1; ++ } ++ ++ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); ++ ++ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) ++ resched_curr(rq); ++ ++ __push_dl_task(rq, rf); ++ } ++ ++ return HRTIMER_NORESTART; ++} ++ + /* + * This is the bandwidth enforcement timer callback. 
If here, we know + * a task is not on its dl_rq, since the fact that the timer was running +@@ -1128,28 +1270,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) + struct rq_flags rf; + struct rq *rq; + +- if (dl_server(dl_se)) { +- struct rq *rq = rq_of_dl_se(dl_se); +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- if (dl_se->dl_throttled) { +- sched_clock_tick(); +- update_rq_clock(rq); +- +- if (dl_se->server_has_tasks(dl_se)) { +- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); +- resched_curr(rq); +- __push_dl_task(rq, &rf); +- } else { +- replenish_dl_entity(dl_se); +- } +- +- } +- rq_unlock(rq, &rf); +- +- return HRTIMER_NORESTART; +- } ++ if (dl_server(dl_se)) ++ return dl_server_timer(timer, dl_se); + + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); +@@ -1319,22 +1441,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) + return (delta * u_act) >> BW_SHIFT; + } + +-static inline void +-update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, +- int flags); +-static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) ++s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) + { + s64 scaled_delta_exec; + +- if (unlikely(delta_exec <= 0)) { +- if (unlikely(dl_se->dl_yielded)) +- goto throttle; +- return; +- } +- +- if (dl_entity_is_special(dl_se)) +- return; +- + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. +@@ -1353,8 +1463,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + ++ return scaled_delta_exec; ++} ++ ++static inline void ++update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, ++ int flags); ++static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) ++{ ++ s64 scaled_delta_exec; ++ ++ if (unlikely(delta_exec <= 0)) { ++ if (unlikely(dl_se->dl_yielded)) ++ goto throttle; ++ return; ++ } ++ ++ if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) ++ return; ++ ++ if (dl_entity_is_special(dl_se)) ++ return; ++ ++ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); ++ + dl_se->runtime -= scaled_delta_exec; + ++ /* ++ * The fair server can consume its runtime while throttled (not queued/ ++ * running as regular CFS). ++ * ++ * If the server consumes its entire runtime in this state. The server ++ * is not required for the current period. Thus, reset the server by ++ * starting a new period, pushing the activation. ++ */ ++ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { ++ /* ++ * If the server was previously activated - the starving condition ++ * took place, it this point it went away because the fair scheduler ++ * was able to get runtime in background. So return to the initial ++ * state. ++ */ ++ dl_se->dl_defer_running = 0; ++ ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ ++ replenish_dl_new_period(dl_se, dl_se->rq); ++ ++ /* ++ * Not being able to start the timer seems problematic. If it could not ++ * be started for whatever reason, we need to "unthrottle" the DL server ++ * and queue right away. Otherwise nothing might queue it. That's similar ++ * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. 
++ */ ++ WARN_ON_ONCE(!start_dl_timer(dl_se)); ++ ++ return; ++ } ++ + throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { + dl_se->dl_throttled = 1; +@@ -1381,6 +1547,14 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + resched_curr(rq); + } + ++ /* ++ * The fair server (sole dl_server) does not account for real-time ++ * workload because it is running fair work. ++ */ ++ if (dl_se == &rq->fair_server) ++ return; ++ ++#ifdef CONFIG_RT_GROUP_SCHED + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks +@@ -1405,34 +1579,155 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + rt_rq->rt_time += delta_exec; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } ++#endif ++} ++ ++/* ++ * In the non-defer mode, the idle time is not accounted, as the ++ * server provides a guarantee. ++ * ++ * If the dl_server is in defer mode, the idle time is also considered ++ * as time available for the fair server, avoiding a penalty for the ++ * rt scheduler that did not consumed that time. ++ */ ++void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) ++{ ++ s64 delta_exec, scaled_delta_exec; ++ ++ if (!rq->fair_server.dl_defer) ++ return; ++ ++ /* no need to discount more */ ++ if (rq->fair_server.runtime < 0) ++ return; ++ ++ delta_exec = rq_clock_task(rq) - p->se.exec_start; ++ if (delta_exec < 0) ++ return; ++ ++ scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); ++ ++ rq->fair_server.runtime -= scaled_delta_exec; ++ ++ if (rq->fair_server.runtime < 0) { ++ rq->fair_server.dl_defer_running = 0; ++ rq->fair_server.runtime = 0; ++ } ++ ++ p->se.exec_start = rq_clock_task(rq); + } + + void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) + { +- update_curr_dl_se(dl_se->rq, dl_se, delta_exec); ++ /* 0 runtime = fair server disabled */ ++ if (dl_se->dl_runtime) ++ update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + } + + void dl_server_start(struct sched_dl_entity *dl_se) + { ++ struct rq *rq = dl_se->rq; ++ ++ /* ++ * XXX: the apply do not work fine at the init phase for the ++ * fair server because things are not yet set. We need to improve ++ * this before getting generic. 
++ */ + if (!dl_server(dl_se)) { ++ u64 runtime = 50 * NSEC_PER_MSEC; ++ u64 period = 1000 * NSEC_PER_MSEC; ++ ++ dl_server_apply_params(dl_se, runtime, period, 1); ++ + dl_se->dl_server = 1; ++ dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } ++ ++ if (!dl_se->dl_runtime) ++ return; ++ + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); ++ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) ++ resched_curr(dl_se->rq); + } + + void dl_server_stop(struct sched_dl_entity *dl_se) + { ++ if (!dl_se->dl_runtime) ++ return; ++ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; + } + + void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, +- dl_server_pick_f pick) ++ dl_server_pick_f pick_task) + { + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; +- dl_se->server_pick = pick; ++ dl_se->server_pick_task = pick_task; ++} ++ ++void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) ++{ ++ u64 new_bw = dl_se->dl_bw; ++ int cpu = cpu_of(rq); ++ struct dl_bw *dl_b; ++ ++ dl_b = dl_bw_of(cpu_of(rq)); ++ guard(raw_spinlock)(&dl_b->lock); ++ ++ if (!dl_bw_cpus(cpu)) ++ return; ++ ++ __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); ++} ++ ++int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) ++{ ++ u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); ++ u64 new_bw = to_ratio(period, runtime); ++ struct rq *rq = dl_se->rq; ++ int cpu = cpu_of(rq); ++ struct dl_bw *dl_b; ++ unsigned long cap; ++ int retval = 0; ++ int cpus; ++ ++ dl_b = dl_bw_of(cpu); ++ guard(raw_spinlock)(&dl_b->lock); ++ ++ cpus = dl_bw_cpus(cpu); ++ cap = dl_bw_capacity(cpu); ++ ++ if (__dl_overflow(dl_b, cap, old_bw, new_bw)) ++ return -EBUSY; ++ ++ if (init) { ++ __add_rq_bw(new_bw, &rq->dl); ++ __dl_add(dl_b, new_bw, cpus); ++ } else { ++ __dl_sub(dl_b, dl_se->dl_bw, cpus); ++ __dl_add(dl_b, new_bw, cpus); ++ ++ dl_rq_change_utilization(rq, dl_se, new_bw); ++ } ++ ++ dl_se->dl_runtime = runtime; ++ dl_se->dl_deadline = period; ++ dl_se->dl_period = period; ++ ++ dl_se->runtime = 0; ++ dl_se->deadline = 0; ++ ++ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); ++ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); ++ ++ return retval; + } + + /* +@@ -1729,7 +2024,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ +- if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { ++ if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + +@@ -1751,6 +2046,25 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) + setup_new_dl_entity(dl_se); + } + ++ /* ++ * If the reservation is still throttled, e.g., it got replenished but is a ++ * deferred task and still got to wait, don't enqueue. ++ */ ++ if (dl_se->dl_throttled && start_dl_timer(dl_se)) ++ return; ++ ++ /* ++ * We're about to enqueue, make sure we're not ->dl_throttled! ++ * In case the timer was not started, say because the defer time ++ * has passed, mark as not throttled and mark unarmed. ++ * Also cancel earlier timers, since letting those run is pointless. 
++ */ ++ if (dl_se->dl_throttled) { ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; ++ } ++ + __enqueue_dl_entity(dl_se); + } + +@@ -1840,7 +2154,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) + enqueue_pushable_dl_task(rq, p); + } + +-static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) ++static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) + { + update_curr_dl(rq); + +@@ -1850,6 +2164,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); ++ ++ return true; + } + + /* +@@ -2068,6 +2384,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); ++ ++ if (hrtick_enabled(rq)) ++ start_hrtick_dl(rq, &p->dl); + } + + static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) +@@ -2080,7 +2399,11 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) + return __node_2_dle(left); + } + +-static struct task_struct *pick_task_dl(struct rq *rq) ++/* ++ * __pick_next_task_dl - Helper to pick the next -deadline task to run. ++ * @rq: The runqueue to pick the next task from. ++ */ ++static struct task_struct *__pick_task_dl(struct rq *rq) + { + struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; +@@ -2094,14 +2417,13 @@ static struct task_struct *pick_task_dl(struct rq *rq) + WARN_ON_ONCE(!dl_se); + + if (dl_server(dl_se)) { +- p = dl_se->server_pick(dl_se); ++ p = dl_se->server_pick_task(dl_se); + if (!p) { +- WARN_ON_ONCE(1); + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } +- p->dl_server = dl_se; ++ rq->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } +@@ -2109,24 +2431,12 @@ static struct task_struct *pick_task_dl(struct rq *rq) + return p; + } + +-static struct task_struct *pick_next_task_dl(struct rq *rq) ++static struct task_struct *pick_task_dl(struct rq *rq) + { +- struct task_struct *p; +- +- p = pick_task_dl(rq); +- if (!p) +- return p; +- +- if (!p->dl_server) +- set_next_task_dl(rq, p, true); +- +- if (hrtick_enabled(rq)) +- start_hrtick_dl(rq, &p->dl); +- +- return p; ++ return __pick_task_dl(rq); + } + +-static void put_prev_task_dl(struct rq *rq, struct task_struct *p) ++static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) + { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; +@@ -2818,13 +3128,12 @@ DEFINE_SCHED_CLASS(dl) = { + + .wakeup_preempt = wakeup_preempt_dl, + +- .pick_next_task = pick_next_task_dl, ++ .pick_task = pick_task_dl, + .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, + + #ifdef CONFIG_SMP + .balance = balance_dl, +- .pick_task = pick_task_dl, + .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index c1eb9a1afd13..de1dc5264b3f 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = { + .release = seq_release, + }; + ++enum dl_param { ++ DL_RUNTIME = 0, ++ DL_PERIOD, ++}; ++ ++static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ ++static unsigned long 
fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ ++ ++static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos, enum dl_param param) ++{ ++ long cpu = (long) ((struct seq_file *) filp->private_data)->private; ++ struct rq *rq = cpu_rq(cpu); ++ u64 runtime, period; ++ size_t err; ++ int retval; ++ u64 value; ++ ++ err = kstrtoull_from_user(ubuf, cnt, 10, &value); ++ if (err) ++ return err; ++ ++ scoped_guard (rq_lock_irqsave, rq) { ++ runtime = rq->fair_server.dl_runtime; ++ period = rq->fair_server.dl_period; ++ ++ switch (param) { ++ case DL_RUNTIME: ++ if (runtime == value) ++ break; ++ runtime = value; ++ break; ++ case DL_PERIOD: ++ if (value == period) ++ break; ++ period = value; ++ break; ++ } ++ ++ if (runtime > period || ++ period > fair_server_period_max || ++ period < fair_server_period_min) { ++ return -EINVAL; ++ } ++ ++ if (rq->cfs.h_nr_running) { ++ update_rq_clock(rq); ++ dl_server_stop(&rq->fair_server); ++ } ++ ++ retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); ++ if (retval) ++ cnt = retval; ++ ++ if (!runtime) ++ printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", ++ cpu_of(rq)); ++ ++ if (rq->cfs.h_nr_running) ++ dl_server_start(&rq->fair_server); ++ } ++ ++ *ppos += cnt; ++ return cnt; ++} ++ ++static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) ++{ ++ unsigned long cpu = (unsigned long) m->private; ++ struct rq *rq = cpu_rq(cpu); ++ u64 value; ++ ++ switch (param) { ++ case DL_RUNTIME: ++ value = rq->fair_server.dl_runtime; ++ break; ++ case DL_PERIOD: ++ value = rq->fair_server.dl_period; ++ break; ++ } ++ ++ seq_printf(m, "%llu\n", value); ++ return 0; ++ ++} ++ ++static ssize_t ++sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); ++} ++ ++static int sched_fair_server_runtime_show(struct seq_file *m, void *v) ++{ ++ return sched_fair_server_show(m, v, DL_RUNTIME); ++} ++ ++static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_fair_server_runtime_show, inode->i_private); ++} ++ ++static const struct file_operations fair_server_runtime_fops = { ++ .open = sched_fair_server_runtime_open, ++ .write = sched_fair_server_runtime_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++static ssize_t ++sched_fair_server_period_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); ++} ++ ++static int sched_fair_server_period_show(struct seq_file *m, void *v) ++{ ++ return sched_fair_server_show(m, v, DL_PERIOD); ++} ++ ++static int sched_fair_server_period_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_fair_server_period_show, inode->i_private); ++} ++ ++static const struct file_operations fair_server_period_fops = { ++ .open = sched_fair_server_period_open, ++ .write = sched_fair_server_period_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ + static struct dentry *debugfs_sched; + ++static void debugfs_fair_server_init(void) ++{ ++ struct dentry *d_fair; ++ unsigned long cpu; ++ ++ d_fair = debugfs_create_dir("fair_server", debugfs_sched); ++ if (!d_fair) ++ return; ++ ++ for_each_possible_cpu(cpu) { ++ struct dentry 
*d_cpu; ++ char buf[32]; ++ ++ snprintf(buf, sizeof(buf), "cpu%lu", cpu); ++ d_cpu = debugfs_create_dir(buf, d_fair); ++ ++ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); ++ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); ++ } ++} ++ + static __init int sched_init_debug(void) + { + struct dentry __maybe_unused *numa; +@@ -374,6 +531,8 @@ static __init int sched_init_debug(void) + + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + ++ debugfs_fair_server_init(); ++ + return 0; + } + late_initcall(sched_init_debug); +@@ -580,27 +739,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', + SPLIT_NS(p->se.deadline), ++ p->se.custom_slice ? 'S' : ' ', + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); + +- SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", ++ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", + SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), +- SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + + #ifdef CONFIG_NUMA_BALANCING +- SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); ++ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif + #ifdef CONFIG_CGROUP_SCHED +- SEQ_printf_task_group_path(m, task_group(p), " %s") ++ SEQ_printf_task_group_path(m, task_group(p), " %s") + #endif + + SEQ_printf(m, "\n"); +@@ -612,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); +- SEQ_printf(m, " S task PID tree-key switches prio" +- " wait-time sum-exec sum-sleep\n"); ++ SEQ_printf(m, " S task PID vruntime eligible " ++ "deadline slice sum-exec switches " ++ "prio wait-time sum-sleep sum-block" ++#ifdef CONFIG_NUMA_BALANCING ++ " node group-id" ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ " group-path" ++#endif ++ "\n"); + SEQ_printf(m, "-------------------------------------------------------" +- "------------------------------------------------------\n"); ++ "------------------------------------------------------" ++ "------------------------------------------------------" ++#ifdef CONFIG_NUMA_BALANCING ++ "--------------" ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ "--------------" ++#endif ++ "\n"); + + rcu_read_lock(); + for_each_process_thread(g, p) { +@@ -641,8 +816,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); + #endif +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", +- SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_rq_lock_irqsave(rq, flags); + root = __pick_root_entity(cfs_rq); +@@ -669,8 +842,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SPLIT_NS(right_vruntime)); + spread = right_vruntime - left_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); +- SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", +- cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", 
cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", +@@ -730,9 +901,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + PU(rt_nr_running); ++ ++#ifdef CONFIG_RT_GROUP_SCHED + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); ++#endif + + #undef PN + #undef PU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 91b242e47db7..c89e7f1693d4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -792,8 +792,22 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + } + + /* ensure we never gain time by being placed backwards. */ +- u64_u32_store(cfs_rq->min_vruntime, +- __update_min_vruntime(cfs_rq, vruntime)); ++ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); ++} ++ ++static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) ++{ ++ struct sched_entity *root = __pick_root_entity(cfs_rq); ++ struct sched_entity *curr = cfs_rq->curr; ++ u64 min_slice = ~0ULL; ++ ++ if (curr && curr->on_rq) ++ min_slice = curr->slice; ++ ++ if (root) ++ min_slice = min(min_slice, root->min_slice); ++ ++ return min_slice; + } + + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -812,19 +826,34 @@ static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node + } + } + ++static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) ++{ ++ if (node) { ++ struct sched_entity *rse = __node_2_se(node); ++ if (rse->min_slice < se->min_slice) ++ se->min_slice = rse->min_slice; ++ } ++} ++ + /* + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) + */ + static inline bool min_vruntime_update(struct sched_entity *se, bool exit) + { + u64 old_min_vruntime = se->min_vruntime; ++ u64 old_min_slice = se->min_slice; + struct rb_node *node = &se->run_node; + + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); + +- return se->min_vruntime == old_min_vruntime; ++ se->min_slice = se->slice; ++ __min_slice_update(se, node->rb_right); ++ __min_slice_update(se, node->rb_left); ++ ++ return se->min_vruntime == old_min_vruntime && ++ se->min_slice == old_min_slice; + } + + RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, +@@ -837,6 +866,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + avg_vruntime_add(cfs_rq, se); + se->min_vruntime = se->vruntime; ++ se->min_slice = se->slice; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_vruntime_cb); + } +@@ -987,17 +1017,18 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i + * this is probably good enough. + */ +-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + if ((s64)(se->vruntime - se->deadline) < 0) +- return; ++ return false; + + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. 
+ */ +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + + /* + * EEVDF: vd_i = ve_i + r_i / w_i +@@ -1007,10 +1038,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + /* + * The task has consumed its request, reschedule. + */ +- if (cfs_rq->nr_running > 1) { +- resched_curr(rq_of(cfs_rq)); +- clear_buddies(cfs_rq, se); +- } ++ return true; + } + + #include "pelt.h" +@@ -1148,6 +1176,38 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) + dl_server_update(p->dl_server, delta_exec); + } + ++static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ if (!sched_feat(PREEMPT_SHORT)) ++ return false; ++ ++ if (curr->vlag == curr->deadline) ++ return false; ++ ++ return !entity_eligible(cfs_rq, curr); ++} ++ ++static inline bool do_preempt_short(struct cfs_rq *cfs_rq, ++ struct sched_entity *pse, struct sched_entity *se) ++{ ++ if (!sched_feat(PREEMPT_SHORT)) ++ return false; ++ ++ if (pse->slice >= se->slice) ++ return false; ++ ++ if (!entity_eligible(cfs_rq, pse)) ++ return false; ++ ++ if (entity_before(pse, se)) ++ return true; ++ ++ if (!entity_eligible(cfs_rq, se)) ++ return true; ++ ++ return false; ++} ++ + /* + * Used by other classes to account runtime. + */ +@@ -1169,23 +1229,44 @@ s64 update_curr_common(struct rq *rq) + static void update_curr(struct cfs_rq *cfs_rq) + { + struct sched_entity *curr = cfs_rq->curr; ++ struct rq *rq = rq_of(cfs_rq); + s64 delta_exec; ++ bool resched; + + if (unlikely(!curr)) + return; + +- delta_exec = update_curr_se(rq_of(cfs_rq), curr); ++ delta_exec = update_curr_se(rq, curr); + if (unlikely(delta_exec <= 0)) + return; + + curr->vruntime += calc_delta_fair(delta_exec, curr); +- update_deadline(cfs_rq, curr); ++ resched = update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); + +- if (entity_is_task(curr)) +- update_curr_task(task_of(curr), delta_exec); ++ if (entity_is_task(curr)) { ++ struct task_struct *p = task_of(curr); ++ ++ update_curr_task(p, delta_exec); ++ ++ /* ++ * Any fair task that runs outside of fair_server should ++ * account against fair_server such that it can account for ++ * this time and possibly avoid running this period. 
++ */ ++ if (p->dl_server != &rq->fair_server) ++ dl_server_update(&rq->fair_server, delta_exec); ++ } + + account_cfs_rq_runtime(cfs_rq, delta_exec); ++ ++ if (cfs_rq->nr_running == 1) ++ return; ++ ++ if (resched || did_preempt_short(cfs_rq, curr)) { ++ resched_curr(rq); ++ clear_buddies(cfs_rq, curr); ++ } + } + + static void update_curr_fair(struct rq *rq) +@@ -5200,7 +5281,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); + + /* +@@ -5281,6 +5363,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + se->vruntime = vruntime - lag; + ++ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { ++ se->deadline += se->vruntime; ++ se->rel_deadline = 0; ++ return; ++ } ++ + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -5300,6 +5388,9 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + + static inline bool cfs_bandwidth_used(void); + ++static void ++requeue_delayed_entity(struct sched_entity *se); ++ + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +@@ -5387,19 +5478,47 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +-static void ++static inline void finish_delayed_dequeue_entity(struct sched_entity *se) ++{ ++ se->sched_delayed = 0; ++ if (sched_feat(DELAY_ZERO) && se->vlag > 0) ++ se->vlag = 0; ++} ++ ++static bool + dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- int action = UPDATE_TG; ++ bool sleep = flags & DEQUEUE_SLEEP; ++ ++ update_curr(cfs_rq); ++ ++ if (flags & DEQUEUE_DELAYED) { ++ SCHED_WARN_ON(!se->sched_delayed); ++ } else { ++ bool delay = sleep; ++ /* ++ * DELAY_DEQUEUE relies on spurious wakeups, special task ++ * states must not suffer spurious wakeups, excempt them. ++ */ ++ if (flags & DEQUEUE_SPECIAL) ++ delay = false; ++ ++ SCHED_WARN_ON(delay && se->sched_delayed); + ++ if (sched_feat(DELAY_DEQUEUE) && delay && ++ !entity_eligible(cfs_rq, se)) { ++ if (cfs_rq->next == se) ++ cfs_rq->next = NULL; ++ update_load_avg(cfs_rq, se, 0); ++ se->sched_delayed = 1; ++ return false; ++ } ++ } ++ ++ int action = UPDATE_TG; + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + +- /* +- * Update run-time statistics of the 'current'. +- */ +- update_curr(cfs_rq); +- + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. 
+@@ -5417,6 +5536,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + clear_buddies(cfs_rq, se); + + update_entity_lag(cfs_rq, se); ++ if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { ++ se->deadline -= se->vruntime; ++ se->rel_deadline = 1; ++ } ++ + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; +@@ -5436,8 +5560,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); + ++ if (flags & DEQUEUE_DELAYED) ++ finish_delayed_dequeue_entity(se); ++ + if (cfs_rq->nr_running == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); ++ ++ return true; + } + + static void +@@ -5463,6 +5592,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + } + + update_stats_curr_start(cfs_rq, se); ++ SCHED_WARN_ON(cfs_rq->curr); + cfs_rq->curr = se; + + /* +@@ -5483,6 +5613,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + ++static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); ++ + /* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups +@@ -5491,16 +5623,26 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + * 4) do not run the "skip" process, if something else is available + */ + static struct sched_entity * +-pick_next_entity(struct cfs_rq *cfs_rq) ++pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) + { + /* + * Enabling NEXT_BUDDY will affect latency but not fairness. + */ + if (sched_feat(NEXT_BUDDY) && +- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { ++ /* ->next will never be delayed */ ++ SCHED_WARN_ON(cfs_rq->next->sched_delayed); + return cfs_rq->next; ++ } + +- return pick_eevdf(cfs_rq); ++ struct sched_entity *se = pick_eevdf(cfs_rq); ++ if (se->sched_delayed) { ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ SCHED_WARN_ON(se->sched_delayed); ++ SCHED_WARN_ON(se->on_rq); ++ return NULL; ++ } ++ return se; + } + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -5524,6 +5666,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + /* in !on_rq case, update occurred at dequeue */ + update_load_avg(cfs_rq, prev, 0); + } ++ SCHED_WARN_ON(cfs_rq->curr != prev); + cfs_rq->curr = NULL; + } + +@@ -5787,6 +5930,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, idle_task_delta, dequeue = 1; ++ long rq_h_nr_running = rq->cfs.h_nr_running; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ +@@ -5820,11 +5964,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); ++ int flags; ++ + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + +- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); ++ /* ++ * Abuse SPECIAL to avoid delayed dequeue in this instance. ++ * This avoids teaching dequeue_entities() about throttled ++ * entities and keeps things relatively simple. 
++ */ ++ flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; ++ if (se->sched_delayed) ++ flags |= DEQUEUE_DELAYED; ++ dequeue_entity(qcfs_rq, se, flags); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; +@@ -5858,6 +6012,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + ++ /* Stop the fair server if throttling resulted in no runnable tasks */ ++ if (rq_h_nr_running && !rq->cfs.h_nr_running) ++ dl_server_stop(&rq->fair_server); + done: + /* + * Note: distribution will already see us throttled via the +@@ -5876,6 +6033,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, idle_task_delta; ++ long rq_h_nr_running = rq->cfs.h_nr_running; + + se = cfs_rq->tg->se[cpu_of(rq)]; + +@@ -5913,7 +6071,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + +- if (se->on_rq) ++ /* Handle any unfinished DELAY_DEQUEUE business first. */ ++ if (se->sched_delayed) { ++ int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; ++ ++ dequeue_entity(qcfs_rq, se, flags); ++ } else if (se->on_rq) + break; + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + +@@ -5945,6 +6108,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + goto unthrottle_throttle; + } + ++ /* Start the fair server if un-throttling resulted in new runnable tasks */ ++ if (!rq_h_nr_running && rq->cfs.h_nr_running) ++ dl_server_start(&rq->fair_server); ++ + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); + +@@ -6577,7 +6744,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) + { + int cpu = cpu_of(rq); + +- if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) ++ if (!cfs_bandwidth_used()) + return; + + if (!tick_nohz_full_cpu(cpu)) +@@ -6760,6 +6927,37 @@ static int sched_idle_cpu(int cpu) + } + #endif + ++static void ++requeue_delayed_entity(struct sched_entity *se) ++{ ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ ++ /* ++ * se->sched_delayed should imply: se->on_rq == 1. ++ * Because a delayed entity is one that is still on ++ * the runqueue competing until elegibility. ++ */ ++ SCHED_WARN_ON(!se->sched_delayed); ++ SCHED_WARN_ON(!se->on_rq); ++ ++ if (sched_feat(DELAY_ZERO)) { ++ update_entity_lag(cfs_rq, se); ++ if (se->vlag > 0) { ++ cfs_rq->nr_running--; ++ if (se != cfs_rq->curr) ++ __dequeue_entity(cfs_rq, se); ++ se->vlag = 0; ++ place_entity(cfs_rq, se, 0); ++ if (se != cfs_rq->curr) ++ __enqueue_entity(cfs_rq, se); ++ cfs_rq->nr_running++; ++ } ++ } ++ ++ update_load_avg(cfs_rq, se, 0); ++ se->sched_delayed = 0; ++} ++ + /* + * The enqueue_task method is called before nr_running is + * increased. Here we update the fair scheduling stats and +@@ -6772,6 +6970,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); ++ int rq_h_nr_running = rq->cfs.h_nr_running; ++ u64 slice = 0; + + /* + * The code below (indirectly) updates schedutil which looks at +@@ -6779,7 +6979,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. 
+ */ +- util_est_enqueue(&rq->cfs, p); ++ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) ++ util_est_enqueue(&rq->cfs, p); ++ ++ if (flags & ENQUEUE_DELAYED) { ++ requeue_delayed_entity(se); ++ return; ++ } + + /* + * If in_iowait is set, the code below may not trigger any cpufreq +@@ -6790,10 +6996,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + + for_each_sched_entity(se) { +- if (se->on_rq) ++ if (se->on_rq) { ++ if (se->sched_delayed) ++ requeue_delayed_entity(se); + break; ++ } + cfs_rq = cfs_rq_of(se); ++ ++ /* ++ * Basically set the slice of group entries to the min_slice of ++ * their respective cfs_rq. This ensures the group can service ++ * its entities in the desired time-frame. ++ */ ++ if (slice) { ++ se->slice = slice; ++ se->custom_slice = 1; ++ } + enqueue_entity(cfs_rq, se, flags); ++ slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; +@@ -6815,6 +7035,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se_update_runnable(se); + update_cfs_group(se); + ++ se->slice = slice; ++ slice = cfs_rq_min_slice(cfs_rq); ++ + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + +@@ -6826,6 +7049,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + goto enqueue_throttle; + } + ++ if (!rq_h_nr_running && rq->cfs.h_nr_running) { ++ /* Account for idle runtime */ ++ if (!rq->nr_running) ++ dl_server_update_idle_time(rq, rq->curr); ++ dl_server_start(&rq->fair_server); ++ } ++ + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); + +@@ -6855,36 +7085,59 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + static void set_next_buddy(struct sched_entity *se); + + /* +- * The dequeue_task method is called before nr_running is +- * decreased. We remove the task from the rbtree and +- * update the fair scheduling stats: ++ * Basically dequeue_task_fair(), except it can deal with dequeue_entity() ++ * failing half-way through and resume the dequeue later. 
++ * ++ * Returns: ++ * -1 - dequeue delayed ++ * 0 - dequeue throttled ++ * 1 - dequeue complete + */ +-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ++static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) + { +- struct cfs_rq *cfs_rq; +- struct sched_entity *se = &p->se; +- int task_sleep = flags & DEQUEUE_SLEEP; +- int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); ++ int rq_h_nr_running = rq->cfs.h_nr_running; ++ bool task_sleep = flags & DEQUEUE_SLEEP; ++ bool task_delayed = flags & DEQUEUE_DELAYED; ++ struct task_struct *p = NULL; ++ int idle_h_nr_running = 0; ++ int h_nr_running = 0; ++ struct cfs_rq *cfs_rq; ++ u64 slice = 0; + +- util_est_dequeue(&rq->cfs, p); ++ if (entity_is_task(se)) { ++ p = task_of(se); ++ h_nr_running = 1; ++ idle_h_nr_running = task_has_idle_policy(p); ++ } else { ++ cfs_rq = group_cfs_rq(se); ++ slice = cfs_rq_min_slice(cfs_rq); ++ } + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +- dequeue_entity(cfs_rq, se, flags); + +- cfs_rq->h_nr_running--; ++ if (!dequeue_entity(cfs_rq, se, flags)) { ++ if (p && &p->se == se) ++ return -1; ++ ++ break; ++ } ++ ++ cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + if (cfs_rq_is_idle(cfs_rq)) +- idle_h_nr_running = 1; ++ idle_h_nr_running = h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) +- goto dequeue_throttle; ++ return 0; + + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) { ++ slice = cfs_rq_min_slice(cfs_rq); ++ + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + /* +@@ -6896,6 +7149,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + break; + } + flags |= DEQUEUE_SLEEP; ++ flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); + } + + for_each_sched_entity(se) { +@@ -6905,28 +7159,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se_update_runnable(se); + update_cfs_group(se); + +- cfs_rq->h_nr_running--; ++ se->slice = slice; ++ slice = cfs_rq_min_slice(cfs_rq); ++ ++ cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + if (cfs_rq_is_idle(cfs_rq)) +- idle_h_nr_running = 1; ++ idle_h_nr_running = h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) +- goto dequeue_throttle; +- ++ return 0; + } + +- /* At this point se is NULL and we are at root level*/ +- sub_nr_running(rq, 1); ++ sub_nr_running(rq, h_nr_running); ++ ++ if (rq_h_nr_running && !rq->cfs.h_nr_running) ++ dl_server_stop(&rq->fair_server); + + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + +-dequeue_throttle: +- util_est_update(&rq->cfs, p, task_sleep); ++ if (p && task_delayed) { ++ SCHED_WARN_ON(!task_sleep); ++ SCHED_WARN_ON(p->on_rq != 1); ++ ++ /* Fix-up what dequeue_task_fair() skipped */ ++ hrtick_update(rq); ++ ++ /* Fix-up what block_task() skipped. */ ++ __block_task(rq, p); ++ } ++ ++ return 1; ++} ++ ++/* ++ * The dequeue_task method is called before nr_running is ++ * decreased. 
We remove the task from the rbtree and ++ * update the fair scheduling stats: ++ */ ++static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ++{ ++ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) ++ util_est_dequeue(&rq->cfs, p); ++ ++ if (dequeue_entities(rq, &p->se, flags) < 0) { ++ util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); ++ return false; ++ } ++ ++ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + hrtick_update(rq); ++ return true; + } + + #ifdef CONFIG_SMP +@@ -7824,6 +8111,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) + return cpu_util(cpu, p, -1, 0); + } + ++/* ++ * This function computes an effective utilization for the given CPU, to be ++ * used for frequency selection given the linear relation: f = u * f_max. ++ * ++ * The scheduler tracks the following metrics: ++ * ++ * cpu_util_{cfs,rt,dl,irq}() ++ * cpu_bw_dl() ++ * ++ * Where the cfs,rt and dl util numbers are tracked with the same metric and ++ * synchronized windows and are thus directly comparable. ++ * ++ * The cfs,rt,dl utilization are the running times measured with rq->clock_task ++ * which excludes things like IRQ and steal-time. These latter are then accrued ++ * in the IRQ utilization. ++ * ++ * The DL bandwidth number OTOH is not a measured metric but a value computed ++ * based on the task model parameters and gives the minimal utilization ++ * required to meet deadlines. ++ */ ++unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long *min, ++ unsigned long *max) ++{ ++ unsigned long util, irq, scale; ++ struct rq *rq = cpu_rq(cpu); ++ ++ scale = arch_scale_cpu_capacity(cpu); ++ ++ /* ++ * Early check to see if IRQ/steal time saturates the CPU, can be ++ * because of inaccuracies in how we track these -- see ++ * update_irq_load_avg(). ++ */ ++ irq = cpu_util_irq(rq); ++ if (unlikely(irq >= scale)) { ++ if (min) ++ *min = scale; ++ if (max) ++ *max = scale; ++ return scale; ++ } ++ ++ if (min) { ++ /* ++ * The minimum utilization returns the highest level between: ++ * - the computed DL bandwidth needed with the IRQ pressure which ++ * steals time to the deadline task. ++ * - The minimum performance requirement for CFS and/or RT. ++ */ ++ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); ++ ++ /* ++ * When an RT task is runnable and uclamp is not used, we must ++ * ensure that the task will run at maximum compute capacity. ++ */ ++ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) ++ *min = max(*min, scale); ++ } ++ ++ /* ++ * Because the time spend on RT/DL tasks is visible as 'lost' time to ++ * CFS tasks and we use the same metric to track the effective ++ * utilization (PELT windows are synchronized) we can directly add them ++ * to obtain the CPU's actual utilization. ++ */ ++ util = util_cfs + cpu_util_rt(rq); ++ util += cpu_util_dl(rq); ++ ++ /* ++ * The maximum hint is a soft bandwidth requirement, which can be lower ++ * than the actual utilization because of uclamp_max requirements. ++ */ ++ if (max) ++ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); ++ ++ if (util >= scale) ++ return scale; ++ ++ /* ++ * There is still idle time; further improve the number by using the ++ * IRQ metric. 
Because IRQ/steal time is hidden from the task clock we ++ * need to scale the task numbers: ++ * ++ * max - irq ++ * U' = irq + --------- * U ++ * max ++ */ ++ util = scale_irq_capacity(util, irq, scale); ++ util += irq; ++ ++ return min(scale, util); ++} ++ ++unsigned long sched_cpu_util(int cpu) ++{ ++ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); ++} ++ + /* + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the +@@ -8308,7 +8694,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + + static void task_dead_fair(struct task_struct *p) + { +- remove_entity_load_avg(&p->se); ++ struct sched_entity *se = &p->se; ++ ++ if (se->sched_delayed) { ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(p, &rf); ++ if (se->sched_delayed) { ++ update_rq_clock(rq); ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ } ++ task_rq_unlock(rq, p, &rf); ++ } ++ ++ remove_entity_load_avg(se); + } + + /* +@@ -8344,7 +8744,7 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context + static int + balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- if (rq->nr_running) ++ if (sched_fair_runnable(rq)) + return 1; + + return sched_balance_newidle(rq, rf) != 0; +@@ -8430,7 +8830,17 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + /* +- * XXX pick_eevdf(cfs_rq) != se ? ++ * If @p has a shorter slice than current and @p is eligible, override ++ * current's slice protection in order to allow preemption. ++ * ++ * Note that even if @p does not turn out to be the most eligible ++ * task at this moment, current's slice protection will be lost. ++ */ ++ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) ++ se->vlag = se->deadline + 1; ++ ++ /* ++ * If @p has become the most eligible task, force preemption. 
+ */ + if (pick_eevdf(cfs_rq) == pse) + goto preempt; +@@ -8441,7 +8851,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + resched_curr(rq); + } + +-#ifdef CONFIG_SMP + static struct task_struct *pick_task_fair(struct rq *rq) + { + struct sched_entity *se; +@@ -8453,95 +8862,58 @@ static struct task_struct *pick_task_fair(struct rq *rq) + return NULL; + + do { +- struct sched_entity *curr = cfs_rq->curr; ++ /* Might not have done put_prev_entity() */ ++ if (cfs_rq->curr && cfs_rq->curr->on_rq) ++ update_curr(cfs_rq); + +- /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; ++ if (unlikely(check_cfs_rq_runtime(cfs_rq))) ++ goto again; + +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) +- goto again; +- } +- +- se = pick_next_entity(cfs_rq); ++ se = pick_next_entity(rq, cfs_rq); ++ if (!se) ++ goto again; + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); + } +-#endif ++ ++static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); ++static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); + + struct task_struct * + pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + struct task_struct *p; + int new_tasks; + + again: +- if (!sched_fair_runnable(rq)) ++ p = pick_task_fair(rq); ++ if (!p) + goto idle; ++ se = &p->se; + + #ifdef CONFIG_FAIR_GROUP_SCHED +- if (!prev || prev->sched_class != &fair_sched_class) ++ if (prev->sched_class != &fair_sched_class) + goto simple; + ++ __put_prev_set_next_dl_server(rq, prev, p); ++ + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. +- */ +- +- do { +- struct sched_entity *curr = cfs_rq->curr; +- +- /* +- * Since we got here without doing put_prev_entity() we also +- * have to consider cfs_rq->curr. If it is still a runnable +- * entity, update_curr() will update its vruntime, otherwise +- * forget we've ever seen it. +- */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; +- +- /* +- * This call to check_cfs_rq_runtime() will do the +- * throttle and dequeue its entity in the parent(s). +- * Therefore the nr_running test will indeed +- * be correct. +- */ +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) { +- cfs_rq = &rq->cfs; +- +- if (!cfs_rq->nr_running) +- goto idle; +- +- goto simple; +- } +- } +- +- se = pick_next_entity(cfs_rq); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); +- +- p = task_of(se); +- +- /* ++ * + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. 
+ */ + if (prev != p) { + struct sched_entity *pse = &prev->se; ++ struct cfs_rq *cfs_rq; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; +@@ -8559,38 +8931,15 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); +- } +- +- goto done; +-simple: +-#endif +- if (prev) +- put_prev_task(rq, prev); + +- do { +- se = pick_next_entity(cfs_rq); +- set_next_entity(cfs_rq, se); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); ++ __set_next_task_fair(rq, p, true); ++ } + +- p = task_of(se); ++ return p; + +-done: __maybe_unused; +-#ifdef CONFIG_SMP +- /* +- * Move the next running task to the front of +- * the list, so our cfs_tasks list becomes MRU +- * one. +- */ +- list_move(&p->se.group_node, &rq->cfs_tasks); ++simple: + #endif +- +- if (hrtick_enabled_fair(rq)) +- hrtick_start_fair(rq, p); +- +- update_misfit_status(p, rq); +- sched_fair_update_stop_tick(rq, p); +- ++ put_prev_set_next_task(rq, prev, p); + return p; + + idle: +@@ -8619,15 +8968,34 @@ done: __maybe_unused; + return NULL; + } + +-static struct task_struct *__pick_next_task_fair(struct rq *rq) ++static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) ++{ ++ return pick_next_task_fair(rq, prev, NULL); ++} ++ ++static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) ++{ ++ return !!dl_se->rq->cfs.nr_running; ++} ++ ++static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) + { +- return pick_next_task_fair(rq, NULL, NULL); ++ return pick_task_fair(dl_se->rq); ++} ++ ++void fair_server_init(struct rq *rq) ++{ ++ struct sched_dl_entity *dl_se = &rq->fair_server; ++ ++ init_dl_entity(dl_se); ++ ++ dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + } + + /* + * Account for a descheduled task: + */ +-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { + struct sched_entity *se = &prev->se; + struct cfs_rq *cfs_rq; +@@ -12721,22 +13089,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + */ + static void task_fork_fair(struct task_struct *p) + { +- struct sched_entity *se = &p->se, *curr; +- struct cfs_rq *cfs_rq; +- struct rq *rq = this_rq(); +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- update_rq_clock(rq); +- + set_task_max_allowed_capacity(p); +- +- cfs_rq = task_cfs_rq(current); +- curr = cfs_rq->curr; +- if (curr) +- update_curr(cfs_rq); +- place_entity(cfs_rq, se, ENQUEUE_INITIAL); +- rq_unlock(rq, &rf); + } + + /* +@@ -12848,10 +13201,28 @@ static void attach_task_cfs_rq(struct task_struct *p) + static void switched_from_fair(struct rq *rq, struct task_struct *p) + { + detach_task_cfs_rq(p); ++ /* ++ * Since this is called after changing class, this is a little weird ++ * and we cannot use DEQUEUE_DELAYED. ++ */ ++ if (p->se.sched_delayed) { ++ /* First, dequeue it from its new class' structures */ ++ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); ++ /* ++ * Now, clean up the fair_sched_class side of things ++ * related to sched_delayed being true and that wasn't done ++ * due to the generic dequeue not using DEQUEUE_DELAYED. 
++ */ ++ finish_delayed_dequeue_entity(&p->se); ++ p->se.rel_deadline = 0; ++ __block_task(rq, p); ++ } + } + + static void switched_to_fair(struct rq *rq, struct task_struct *p) + { ++ SCHED_WARN_ON(p->se.sched_delayed); ++ + attach_task_cfs_rq(p); + + set_task_max_allowed_capacity(p); +@@ -12869,12 +13240,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + } + } + +-/* Account for a task changing its policy or group. +- * +- * This routine is mostly called to set cfs_rq->curr field when a task +- * migrates between groups/classes. +- */ +-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) ++static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + { + struct sched_entity *se = &p->se; + +@@ -12887,6 +13253,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + list_move(&se->group_node, &rq->cfs_tasks); + } + #endif ++ if (!first) ++ return; ++ ++ SCHED_WARN_ON(se->sched_delayed); ++ ++ if (hrtick_enabled_fair(rq)) ++ hrtick_start_fair(rq, p); ++ ++ update_misfit_status(p, rq); ++ sched_fair_update_stop_tick(rq, p); ++} ++ ++/* ++ * Account for a task changing its policy or group. ++ * ++ * This routine is mostly called to set cfs_rq->curr field when a task ++ * migrates between groups/classes. ++ */ ++static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) ++{ ++ struct sched_entity *se = &p->se; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); +@@ -12895,12 +13282,14 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } ++ ++ __set_next_task_fair(rq, p, first); + } + + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; +- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); ++ cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif +@@ -13002,28 +13391,35 @@ void online_fair_sched_group(struct task_group *tg) + + void unregister_fair_sched_group(struct task_group *tg) + { +- unsigned long flags; +- struct rq *rq; + int cpu; + + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(cpu) { +- if (tg->se[cpu]) +- remove_entity_load_avg(tg->se[cpu]); ++ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; ++ struct sched_entity *se = tg->se[cpu]; ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (se) { ++ if (se->sched_delayed) { ++ guard(rq_lock_irqsave)(rq); ++ if (se->sched_delayed) { ++ update_rq_clock(rq); ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ } ++ list_del_leaf_cfs_rq(cfs_rq); ++ } ++ remove_entity_load_avg(se); ++ } + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ +- if (!tg->cfs_rq[cpu]->on_list) +- continue; +- +- rq = cpu_rq(cpu); +- +- raw_spin_rq_lock_irqsave(rq, flags); +- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); +- raw_spin_rq_unlock_irqrestore(rq, flags); ++ if (cfs_rq->on_list) { ++ guard(rq_lock_irqsave)(rq); ++ list_del_leaf_cfs_rq(cfs_rq); ++ } + } + } + +@@ -13213,13 +13609,13 @@ DEFINE_SCHED_CLASS(fair) = { + + .wakeup_preempt = check_preempt_wakeup_fair, + ++ .pick_task = pick_task_fair, + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, + + #ifdef CONFIG_SMP + .balance = balance_fair, +- .pick_task = pick_task_fair, + .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 143f55df890b..290874079f60 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -5,8 +5,24 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ + SCHED_FEAT(PLACE_LAG, true) ++/* ++ * Give new tasks half a slice to ease into the competition. ++ */ + SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++/* ++ * Preserve relative virtual deadline on 'migration'. ++ */ ++SCHED_FEAT(PLACE_REL_DEADLINE, true) ++/* ++ * Inhibit (wakeup) preemption until the current task has either matched the ++ * 0-lag point or until is has exhausted it's slice. ++ */ + SCHED_FEAT(RUN_TO_PARITY, true) ++/* ++ * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for ++ * current. ++ */ ++SCHED_FEAT(PREEMPT_SHORT, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +@@ -21,6 +37,18 @@ SCHED_FEAT(NEXT_BUDDY, false) + */ + SCHED_FEAT(CACHE_HOT_BUDDY, true) + ++/* ++ * Delay dequeueing tasks until they get selected or woken. ++ * ++ * By delaying the dequeue for non-eligible tasks, they remain in the ++ * competition and can burn off their negative lag. When they get selected ++ * they'll have positive lag by definition. ++ * ++ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. 
++ */ ++SCHED_FEAT(DELAY_DEQUEUE, true) ++SCHED_FEAT(DELAY_ZERO, true) ++ + /* + * Allow wakeup-time preemption of the current task: + */ +@@ -85,5 +113,3 @@ SCHED_FEAT(WA_BIAS, true) + SCHED_FEAT(UTIL_EST, true) + + SCHED_FEAT(LATENCY_WARN, false) +- +-SCHED_FEAT(HZ_BW, true) +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 6e78d071beb5..7a105a0123aa 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -450,43 +450,35 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) + resched_curr(rq); + } + +-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { ++ dl_server_update_idle_time(rq, prev); + } + + static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) + { + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); ++ next->se.exec_start = rq_clock_task(rq); + } + +-#ifdef CONFIG_SMP +-static struct task_struct *pick_task_idle(struct rq *rq) ++struct task_struct *pick_task_idle(struct rq *rq) + { + return rq->idle; + } +-#endif +- +-struct task_struct *pick_next_task_idle(struct rq *rq) +-{ +- struct task_struct *next = rq->idle; +- +- set_next_task_idle(rq, next, true); +- +- return next; +-} + + /* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +-static void ++static bool + dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) + { + raw_spin_rq_unlock_irq(rq); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_rq_lock_irq(rq); ++ return true; + } + + /* +@@ -528,13 +520,12 @@ DEFINE_SCHED_CLASS(idle) = { + + .wakeup_preempt = wakeup_preempt_idle, + +- .pick_next_task = pick_next_task_idle, ++ .pick_task = pick_task_idle, + .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, + + #ifdef CONFIG_SMP + .balance = balance_idle, +- .pick_task = pick_task_idle, + .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, + #endif +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index 310523c1b9e3..172c588de542 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE; + /* More than 4 hours if BW_SHIFT equals 20. */ + static const u64 max_rt_runtime = MAX_BW; + +-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); +- +-struct rt_bandwidth def_rt_bandwidth; +- + /* + * period over which we measure -rt task CPU usage in us. 
+ * default: 1s +@@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void) + late_initcall(sched_rt_sysctl_init); + #endif + ++void init_rt_rq(struct rt_rq *rt_rq) ++{ ++ struct rt_prio_array *array; ++ int i; ++ ++ array = &rt_rq->active; ++ for (i = 0; i < MAX_RT_PRIO; i++) { ++ INIT_LIST_HEAD(array->queue + i); ++ __clear_bit(i, array->bitmap); ++ } ++ /* delimiter for bitsearch: */ ++ __set_bit(MAX_RT_PRIO, array->bitmap); ++ ++#if defined CONFIG_SMP ++ rt_rq->highest_prio.curr = MAX_RT_PRIO-1; ++ rt_rq->highest_prio.next = MAX_RT_PRIO-1; ++ rt_rq->overloaded = 0; ++ plist_head_init(&rt_rq->pushable_tasks); ++#endif /* CONFIG_SMP */ ++ /* We start is dequeued state, because no RT tasks are queued */ ++ rt_rq->rt_queued = 0; ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ rt_rq->rt_time = 0; ++ rt_rq->rt_throttled = 0; ++ rt_rq->rt_runtime = 0; ++ raw_spin_lock_init(&rt_rq->rt_runtime_lock); ++#endif ++} ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ ++static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); ++ + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) + { + struct rt_bandwidth *rt_b = +@@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) + do_start_rt_bandwidth(rt_b); + } + +-void init_rt_rq(struct rt_rq *rt_rq) +-{ +- struct rt_prio_array *array; +- int i; +- +- array = &rt_rq->active; +- for (i = 0; i < MAX_RT_PRIO; i++) { +- INIT_LIST_HEAD(array->queue + i); +- __clear_bit(i, array->bitmap); +- } +- /* delimiter for bit-search: */ +- __set_bit(MAX_RT_PRIO, array->bitmap); +- +-#if defined CONFIG_SMP +- rt_rq->highest_prio.curr = MAX_RT_PRIO-1; +- rt_rq->highest_prio.next = MAX_RT_PRIO-1; +- rt_rq->overloaded = 0; +- plist_head_init(&rt_rq->pushable_tasks); +-#endif /* CONFIG_SMP */ +- /* We start is dequeued state, because no RT tasks are queued */ +- rt_rq->rt_queued = 0; +- +- rt_rq->rt_time = 0; +- rt_rq->rt_throttled = 0; +- rt_rq->rt_runtime = 0; +- raw_spin_lock_init(&rt_rq->rt_runtime_lock); +-} +- +-#ifdef CONFIG_RT_GROUP_SCHED + static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) + { + hrtimer_cancel(&rt_b->rt_period_timer); +@@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg) + { + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); +- + } + + void free_rt_sched_group(struct task_group *tg) +@@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) + if (!tg->rt_se) + goto err; + +- init_rt_bandwidth(&tg->rt_bandwidth, +- ktime_to_ns(def_rt_bandwidth.rt_period), 0); ++ init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), +@@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) + return &rt_rq->tg->rt_bandwidth; + } + +-#else /* !CONFIG_RT_GROUP_SCHED */ +- +-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +-{ +- return rt_rq->rt_runtime; +-} +- +-static inline u64 sched_rt_period(struct rt_rq *rt_rq) +-{ +- return ktime_to_ns(def_rt_bandwidth.rt_period); +-} +- +-typedef struct rt_rq *rt_rq_iter_t; +- +-#define for_each_rt_rq(rt_rq, iter, rq) \ +- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) +- +-#define for_each_sched_rt_entity(rt_se) \ +- for (; rt_se; rt_se = NULL) +- +-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +-{ +- return NULL; +-} +- +-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +-{ +- struct rq *rq = rq_of_rt_rq(rt_rq); +- +- if 
(!rt_rq->rt_nr_running) +- return; +- +- enqueue_top_rt_rq(rt_rq); +- resched_curr(rq); +-} +- +-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +-{ +- dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +-} +- +-static inline int rt_rq_throttled(struct rt_rq *rt_rq) +-{ +- return rt_rq->rt_throttled; +-} +- +-static inline const struct cpumask *sched_rt_period_mask(void) +-{ +- return cpu_online_mask; +-} +- +-static inline +-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +-{ +- return &cpu_rq(cpu)->rt; +-} +- +-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +-{ +- return &def_rt_bandwidth; +-} +- +-#endif /* CONFIG_RT_GROUP_SCHED */ +- + bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) + { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); +@@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + const struct cpumask *span; + + span = sched_rt_period_mask(); +-#ifdef CONFIG_RT_GROUP_SCHED ++ + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest +@@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +-#endif ++ + for_each_cpu(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); +@@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + return idle; + } + +-static inline int rt_se_prio(struct sched_rt_entity *rt_se) +-{ +-#ifdef CONFIG_RT_GROUP_SCHED +- struct rt_rq *rt_rq = group_rt_rq(rt_se); +- +- if (rt_rq) +- return rt_rq->highest_prio.curr; +-#endif +- +- return rt_task_of(rt_se)->prio; +-} +- + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + { + u64 runtime = sched_rt_runtime(rt_rq); +@@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + return 0; + } + ++#else /* !CONFIG_RT_GROUP_SCHED */ ++ ++typedef struct rt_rq *rt_rq_iter_t; ++ ++#define for_each_rt_rq(rt_rq, iter, rq) \ ++ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) ++ ++#define for_each_sched_rt_entity(rt_se) \ ++ for (; rt_se; rt_se = NULL) ++ ++static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) ++{ ++ return NULL; ++} ++ ++static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) ++{ ++ struct rq *rq = rq_of_rt_rq(rt_rq); ++ ++ if (!rt_rq->rt_nr_running) ++ return; ++ ++ enqueue_top_rt_rq(rt_rq); ++ resched_curr(rq); ++} ++ ++static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) ++{ ++ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); ++} ++ ++static inline int rt_rq_throttled(struct rt_rq *rt_rq) ++{ ++ return false; ++} ++ ++static inline const struct cpumask *sched_rt_period_mask(void) ++{ ++ return cpu_online_mask; ++} ++ ++static inline ++struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) ++{ ++ return &cpu_rq(cpu)->rt; ++} ++ ++#ifdef CONFIG_SMP ++static void __enable_runtime(struct rq *rq) { } ++static void __disable_runtime(struct rq *rq) { } ++#endif ++ ++#endif /* CONFIG_RT_GROUP_SCHED */ ++ ++static inline int rt_se_prio(struct sched_rt_entity *rt_se) ++{ ++#ifdef CONFIG_RT_GROUP_SCHED ++ struct rt_rq *rt_rq = group_rt_rq(rt_se); ++ ++ if (rt_rq) ++ return rt_rq->highest_prio.curr; ++#endif ++ ++ return rt_task_of(rt_se)->prio; ++} ++ + /* + * Update the current task's runtime statistics. 
Skip current tasks that + * are not in our scheduling class. +@@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + static void update_curr_rt(struct rq *rq) + { + struct task_struct *curr = rq->curr; +- struct sched_rt_entity *rt_se = &curr->rt; + s64 delta_exec; + + if (curr->sched_class != &rt_sched_class) +@@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq) + if (unlikely(delta_exec <= 0)) + return; + ++#ifdef CONFIG_RT_GROUP_SCHED ++ struct sched_rt_entity *rt_se = &curr->rt; ++ + if (!rt_bandwidth_enabled()) + return; + +@@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); + } + } ++#endif + } + + static void +@@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) + static void + inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) + { +- start_rt_bandwidth(&def_rt_bandwidth); + } + + static inline +@@ -1492,7 +1483,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) + enqueue_pushable_task(rq, p); + } + +-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) ++static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) + { + struct sched_rt_entity *rt_se = &p->rt; + +@@ -1500,6 +1491,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) + dequeue_rt_entity(rt_se, flags); + + dequeue_pushable_task(rq, p); ++ ++ return true; + } + + /* +@@ -1755,17 +1748,7 @@ static struct task_struct *pick_task_rt(struct rq *rq) + return p; + } + +-static struct task_struct *pick_next_task_rt(struct rq *rq) +-{ +- struct task_struct *p = pick_task_rt(rq); +- +- if (p) +- set_next_task_rt(rq, p, true); +- +- return p; +-} +- +-static void put_prev_task_rt(struct rq *rq, struct task_struct *p) ++static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) + { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; +@@ -2652,13 +2635,12 @@ DEFINE_SCHED_CLASS(rt) = { + + .wakeup_preempt = wakeup_preempt_rt, + +- .pick_next_task = pick_next_task_rt, ++ .pick_task = pick_task_rt, + .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, + + #ifdef CONFIG_SMP + .balance = balance_rt, +- .pick_task = pick_task_rt, + .select_task_rq = select_task_rq_rt, + .set_cpus_allowed = set_cpus_allowed_common, + .rq_online = rq_online_rt, +@@ -2912,19 +2894,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) + #ifdef CONFIG_SYSCTL + static int sched_rt_global_constraints(void) + { +- unsigned long flags; +- int i; +- +- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +- for_each_possible_cpu(i) { +- struct rt_rq *rt_rq = &cpu_rq(i)->rt; +- +- raw_spin_lock(&rt_rq->rt_runtime_lock); +- rt_rq->rt_runtime = global_rt_runtime(); +- raw_spin_unlock(&rt_rq->rt_runtime_lock); +- } +- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); +- + return 0; + } + #endif /* CONFIG_SYSCTL */ +@@ -2944,12 +2913,6 @@ static int sched_rt_global_validate(void) + + static void sched_rt_do_global(void) + { +- unsigned long flags; +- +- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +- def_rt_bandwidth.rt_runtime = global_rt_runtime(); +- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + } + + static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, +diff --git 
a/kernel/sched/sched.h b/kernel/sched/sched.h +index 432b43aa091c..8b84608f2531 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -335,7 +336,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr); + extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); + extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); + extern int dl_bw_check_overflow(int cpu); +- ++extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); + /* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: +@@ -361,7 +362,14 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); + extern void dl_server_stop(struct sched_dl_entity *dl_se); + extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, +- dl_server_pick_f pick); ++ dl_server_pick_f pick_task); ++ ++extern void dl_server_update_idle_time(struct rq *rq, ++ struct task_struct *p); ++extern void fair_server_init(struct rq *rq); ++extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); ++extern int dl_server_apply_params(struct sched_dl_entity *dl_se, ++ u64 runtime, u64 period, bool init); + + #ifdef CONFIG_CGROUP_SCHED + +@@ -599,17 +607,12 @@ struct cfs_rq { + s64 avg_vruntime; + u64 avg_load; + +- u64 exec_clock; + u64 min_vruntime; + #ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; + #endif + +-#ifndef CONFIG_64BIT +- u64 min_vruntime_copy; +-#endif +- + struct rb_root_cached tasks_timeline; + + /* +@@ -619,10 +622,6 @@ struct cfs_rq { + struct sched_entity *curr; + struct sched_entity *next; + +-#ifdef CONFIG_SCHED_DEBUG +- unsigned int nr_spread_over; +-#endif +- + #ifdef CONFIG_SMP + /* + * CFS load tracking +@@ -726,13 +725,13 @@ struct rt_rq { + #endif /* CONFIG_SMP */ + int rt_queued; + ++#ifdef CONFIG_RT_GROUP_SCHED + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +-#ifdef CONFIG_RT_GROUP_SCHED + unsigned int rt_nr_boosted; + + struct rq *rq; +@@ -820,6 +819,9 @@ static inline void se_update_runnable(struct sched_entity *se) + + static inline long se_runnable(struct sched_entity *se) + { ++ if (se->sched_delayed) ++ return false; ++ + if (entity_is_task(se)) + return !!se->on_rq; + else +@@ -834,6 +836,9 @@ static inline void se_update_runnable(struct sched_entity *se) { } + + static inline long se_runnable(struct sched_entity *se) + { ++ if (se->sched_delayed) ++ return false; ++ + return !!se->on_rq; + } + +@@ -1044,6 +1049,8 @@ struct rq { + struct rt_rq rt; + struct dl_rq dl; + ++ struct sched_dl_entity fair_server; ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; +@@ -1059,6 +1066,7 @@ struct rq { + unsigned int nr_uninterruptible; + + struct task_struct __rcu *curr; ++ struct sched_dl_entity *dl_server; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; +@@ -1158,7 +1166,6 @@ struct rq { + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; +- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; +@@ -1187,6 +1194,7 @@ struct rq { + /* per rq */ + struct rq *core; + struct task_struct *core_pick; ++ struct sched_dl_entity *core_dl_server; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; +@@ -1236,6 +1244,7 @@ static inline int cpu_of(struct rq *rq) + } + + #define MDF_PUSH 0x01 ++#define DELAYED_MIGRATED 0x02 /* Task was migrated when in DELAYED_DEQUEUE state */ + + static inline bool is_migration_disabled(struct task_struct *p) + { +@@ -2247,11 +2256,13 @@ extern const u32 sched_prio_to_wmult[40]; + * + */ + +-#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ + #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ + #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ + #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ ++#define DEQUEUE_SPECIAL 0x10 + #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ ++#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ + + #define ENQUEUE_WAKEUP 0x01 + #define ENQUEUE_RESTORE 0x02 +@@ -2267,6 +2278,7 @@ extern const u32 sched_prio_to_wmult[40]; + #endif + #define ENQUEUE_INITIAL 0x80 + #define ENQUEUE_MIGRATING 0x100 ++#define ENQUEUE_DELAYED 0x200 + + #define RETRY_TASK ((void *)-1UL) + +@@ -2285,23 +2297,31 @@ struct sched_class { + #endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); +- void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); ++ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + +- struct task_struct *(*pick_next_task)(struct rq *rq); ++ struct task_struct *(*pick_task)(struct rq *rq); ++ /* ++ * Optional! 
When implemented pick_next_task() should be equivalent to: ++ * ++ * next = pick_task(); ++ * if (next) { ++ * put_prev_task(prev); ++ * set_next_task_first(next); ++ * } ++ */ ++ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + +- void (*put_prev_task)(struct rq *rq, struct task_struct *p); ++ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + + #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + +- struct task_struct * (*pick_task)(struct rq *rq); +- + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); +@@ -2345,7 +2365,7 @@ struct sched_class { + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) + { + WARN_ON_ONCE(rq->curr != prev); +- prev->sched_class->put_prev_task(rq, prev); ++ prev->sched_class->put_prev_task(rq, prev, NULL); + } + + static inline void set_next_task(struct rq *rq, struct task_struct *next) +@@ -2353,6 +2373,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) + next->sched_class->set_next_task(rq, next, false); + } + ++static inline void ++__put_prev_set_next_dl_server(struct rq *rq, ++ struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prev->dl_server = NULL; ++ next->dl_server = rq->dl_server; ++ rq->dl_server = NULL; ++} ++ ++static inline void put_prev_set_next_task(struct rq *rq, ++ struct task_struct *prev, ++ struct task_struct *next) ++{ ++ WARN_ON_ONCE(rq->curr != prev); ++ ++ __put_prev_set_next_dl_server(rq, prev, next); ++ ++ if (next == prev) ++ return; ++ ++ prev->sched_class->put_prev_task(rq, prev, next); ++ next->sched_class->set_next_task(rq, next, true); ++} + + /* + * Helper to define a sched_class instance; each one is placed in a separate +@@ -2408,7 +2452,7 @@ static inline bool sched_fair_runnable(struct rq *rq) + } + + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +-extern struct task_struct *pick_next_task_idle(struct rq *rq); ++extern struct task_struct *pick_task_idle(struct rq *rq); + + #define SCA_CHECK 0x01 + #define SCA_MIGRATE_DISABLE 0x02 +@@ -2515,7 +2559,6 @@ extern void reweight_task(struct task_struct *p, const struct load_weight *lw); + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); + +-extern struct rt_bandwidth def_rt_bandwidth; + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + +@@ -2586,6 +2629,19 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) + sched_update_tick_dependency(rq); + } + ++static inline void __block_task(struct rq *rq, struct task_struct *p) ++{ ++ WRITE_ONCE(p->on_rq, 0); ++ ASSERT_EXCLUSIVE_WRITER(p->on_rq); ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ if (p->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++} ++ + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); + extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +@@ -3607,7 +3663,7 @@ extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *c + extern void __setscheduler_prio(struct task_struct *p, int prio); + 
extern void set_load_weight(struct task_struct *p, bool update_load); + extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +-extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); ++extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); + + extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, +diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h +index 237780aa3c53..06a2c6d3ec1e 100644 +--- a/kernel/sched/stats.h ++++ b/kernel/sched/stats.h +@@ -129,6 +129,13 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) + if (static_branch_likely(&psi_disabled)) + return; + ++ /* ++ * Delayed task is not ready to run yet! ++ * Wait for a requeue before accounting. ++ */ ++ if (p->se.sched_delayed) ++ return; ++ + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; + +@@ -148,6 +155,9 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) + if (static_branch_likely(&psi_disabled)) + return; + ++ /* Delayed task can only be dequeued for migration. */ ++ WARN_ON_ONCE(p->se.sched_delayed && sleep); ++ + /* + * A voluntary sleep is a dequeue followed by a task switch. To + * avoid walking all ancestors twice, psi_task_switch() handles +diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c +index b1b8fe61c532..058dd42e3d9b 100644 +--- a/kernel/sched/stop_task.c ++++ b/kernel/sched/stop_task.c +@@ -41,26 +41,17 @@ static struct task_struct *pick_task_stop(struct rq *rq) + return rq->stop; + } + +-static struct task_struct *pick_next_task_stop(struct rq *rq) +-{ +- struct task_struct *p = pick_task_stop(rq); +- +- if (p) +- set_next_task_stop(rq, p, true); +- +- return p; +-} +- + static void + enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) + { + add_nr_running(rq, 1); + } + +-static void ++static bool + dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) + { + sub_nr_running(rq, 1); ++ return true; + } + + static void yield_task_stop(struct rq *rq) +@@ -68,7 +59,7 @@ static void yield_task_stop(struct rq *rq) + BUG(); /* the stop task should never yield, its pointless. */ + } + +-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { + update_curr_common(rq); + } +@@ -111,13 +102,12 @@ DEFINE_SCHED_CLASS(stop) = { + + .wakeup_preempt = wakeup_preempt_stop, + +- .pick_next_task = pick_next_task_stop, ++ .pick_task = pick_task_stop, + .put_prev_task = put_prev_task_stop, + .set_next_task = set_next_task_stop, + + #ifdef CONFIG_SMP + .balance = balance_stop, +- .pick_task = pick_task_stop, + .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, + #endif +diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c +index ae1b42775ef9..c62acf509b74 100644 +--- a/kernel/sched/syscalls.c ++++ b/kernel/sched/syscalls.c +@@ -57,7 +57,7 @@ static int effective_prio(struct task_struct *p) + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ +- if (!rt_prio(p->prio)) ++ if (!rt_or_dl_prio(p->prio)) + return p->normal_prio; + return p->prio; + } +@@ -258,107 +258,6 @@ int sched_core_idle_cpu(int cpu) + + #endif + +-#ifdef CONFIG_SMP +-/* +- * This function computes an effective utilization for the given CPU, to be +- * used for frequency selection given the linear relation: f = u * f_max. 
+- * +- * The scheduler tracks the following metrics: +- * +- * cpu_util_{cfs,rt,dl,irq}() +- * cpu_bw_dl() +- * +- * Where the cfs,rt and dl util numbers are tracked with the same metric and +- * synchronized windows and are thus directly comparable. +- * +- * The cfs,rt,dl utilization are the running times measured with rq->clock_task +- * which excludes things like IRQ and steal-time. These latter are then accrued +- * in the IRQ utilization. +- * +- * The DL bandwidth number OTOH is not a measured metric but a value computed +- * based on the task model parameters and gives the minimal utilization +- * required to meet deadlines. +- */ +-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, +- unsigned long *min, +- unsigned long *max) +-{ +- unsigned long util, irq, scale; +- struct rq *rq = cpu_rq(cpu); +- +- scale = arch_scale_cpu_capacity(cpu); +- +- /* +- * Early check to see if IRQ/steal time saturates the CPU, can be +- * because of inaccuracies in how we track these -- see +- * update_irq_load_avg(). +- */ +- irq = cpu_util_irq(rq); +- if (unlikely(irq >= scale)) { +- if (min) +- *min = scale; +- if (max) +- *max = scale; +- return scale; +- } +- +- if (min) { +- /* +- * The minimum utilization returns the highest level between: +- * - the computed DL bandwidth needed with the IRQ pressure which +- * steals time to the deadline task. +- * - The minimum performance requirement for CFS and/or RT. +- */ +- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); +- +- /* +- * When an RT task is runnable and uclamp is not used, we must +- * ensure that the task will run at maximum compute capacity. +- */ +- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) +- *min = max(*min, scale); +- } +- +- /* +- * Because the time spend on RT/DL tasks is visible as 'lost' time to +- * CFS tasks and we use the same metric to track the effective +- * utilization (PELT windows are synchronized) we can directly add them +- * to obtain the CPU's actual utilization. +- */ +- util = util_cfs + cpu_util_rt(rq); +- util += cpu_util_dl(rq); +- +- /* +- * The maximum hint is a soft bandwidth requirement, which can be lower +- * than the actual utilization because of uclamp_max requirements. +- */ +- if (max) +- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); +- +- if (util >= scale) +- return scale; +- +- /* +- * There is still idle time; further improve the number by using the +- * IRQ metric. Because IRQ/steal time is hidden from the task clock we +- * need to scale the task numbers: +- * +- * max - irq +- * U' = irq + --------- * U +- * max +- */ +- util = scale_irq_capacity(util, irq, scale); +- util += irq; +- +- return min(scale, util); +-} +- +-unsigned long sched_cpu_util(int cpu) +-{ +- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); +-} +-#endif /* CONFIG_SMP */ +- + /** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. 
+@@ -401,10 +300,20 @@ static void __setscheduler_params(struct task_struct *p, + + p->policy = policy; + +- if (dl_policy(policy)) ++ if (dl_policy(policy)) { + __setparam_dl(p, attr); +- else if (fair_policy(policy)) ++ } else if (fair_policy(policy)) { + p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ if (attr->sched_runtime) { ++ p->se.custom_slice = 1; ++ p->se.slice = clamp_t(u64, attr->sched_runtime, ++ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ ++ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ ++ } else { ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; ++ } ++ } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when +@@ -700,7 +609,9 @@ int __sched_setscheduler(struct task_struct *p, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { +- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) ++ if (fair_policy(policy) && ++ (attr->sched_nice != task_nice(p) || ++ (attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; +@@ -846,6 +757,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + ++ if (p->se.custom_slice) ++ attr.sched_runtime = p->se.slice; ++ + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +@@ -1012,12 +926,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + + static void get_params(struct task_struct *p, struct sched_attr *attr) + { +- if (task_has_dl_policy(p)) ++ if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); +- else if (task_has_rt_policy(p)) ++ } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; +- else ++ } else { + attr->sched_nice = task_nice(p); ++ attr->sched_runtime = p->se.slice; ++ } + } + + /** +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 76504b776d03..9748a4c8d668 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++ /* ++ * Because the rq is not a task, dl_add_task_root_domain() did not ++ * move the fair server bw to the rd if it already started. ++ * Add it now. ++ */ ++ if (rq->fair_server.dl_server) ++ __dl_server_attach_root(&rq->fair_server, rq); ++ + rq_unlock_irqrestore(rq, &rf); + + if (old_rd) +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index b8ee320208d4..f4be3abbb47b 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1975,7 +1975,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + * expiry. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { +- if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) ++ if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) + mode |= HRTIMER_MODE_HARD; + } + +@@ -2075,7 +2075,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + u64 slack; + + slack = current->timer_slack_ns; +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); +@@ -2280,7 +2280,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + * Override any slack passed by the user if under + * rt contraints. 
+ */ +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + delta = 0; + + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); +diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c +index 130ca7e7787e..ae2ace5e515a 100644 +--- a/kernel/trace/trace_sched_wakeup.c ++++ b/kernel/trace/trace_sched_wakeup.c +@@ -547,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p) + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || +- (wakeup_rt && !dl_task(p) && !rt_task(p)) || ++ (wakeup_rt && !rt_or_dl_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) + return; + +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 3bd08b60a9b3..9bd709077621 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -426,7 +426,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; + + tsk = current; +- if (rt_task(tsk)) { ++ if (rt_or_dl_task(tsk)) { + bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; + thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; + } +@@ -485,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) + else + dirty = vm_dirty_ratio * node_memory / 100; + +- if (rt_task(tsk)) ++ if (rt_or_dl_task(tsk)) + dirty += dirty / 4; + + /* +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 6040ed48da3e..ba29c5f5ef64 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -4009,7 +4009,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) + */ + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; +- } else if (unlikely(rt_task(current)) && in_task()) ++ } else if (unlikely(rt_or_dl_task(current)) && in_task()) + alloc_flags |= ALLOC_MIN_RESERVE; + + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); +-- +2.47.0.rc0 + diff --git a/sys-kernel/gentoo-sources-6.11.3+/0003-bbr3.patch b/sys-kernel/gentoo-sources-6.11.3+/0003-bbr3.patch new file mode 100644 index 0000000..9abc2a6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11.3+/0003-bbr3.patch @@ -0,0 +1,3386 @@ +From 9dff1ca88508fbe0bf6044ecc4423640382a4d57 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 10 Oct 2024 12:36:51 +0200 +Subject: [PATCH 03/12] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 9 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2230 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 16 files changed, 1940 insertions(+), 553 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 6a5e08b937b3..27aab715490e 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -369,7 +369,9 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? 
*/ ++ unused:2; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c0deaafebfdc..d53f042d936e 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 196c148fce8a..f37256b8abfd 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -779,6 +781,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -884,6 +895,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -973,9 +989,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1087,6 +1108,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1109,7 +1131,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1129,10 +1155,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1143,7 +1172,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1167,8 +1198,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1234,6 +1268,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1253,6 +1295,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1265,6 +1308,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2416,7 +2474,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 50655de04c9b..82f8bd8f0d16 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index 3b687d20c9ed..a7c30c243b54 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -507,12 +507,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dbf896f3146c..4702cd2f1ffc 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 8e94ed7c56a0..50dc9970cad2 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -668,15 +668,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 3f88d0961e5b..4273cac333f6 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } + ++static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++} ++ + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) + { +@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, ++ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 831a18dc7aa6..d9faa8fef55e 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -3849,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..a180fa648d5e 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. 
++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. 
++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... 
*/ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... */ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. 
*/ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). ++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. 
This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. 
If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. 
If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; 
++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index 0306d257fa64..28f581c0dab7 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index e37488d3453f..62eef7d067c2 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1501,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3799,7 +3815,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3816,6 +3833,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3826,6 +3844,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3934,6 +3957,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4008,7 +4032,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4032,6 +4056,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4051,7 +4076,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5718,13 +5743,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index a19a9dbd3409..e0ef8406a326 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 16c48df8df4c..6c3a1895238e 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -388,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1601,7 +1604,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1676,6 +1679,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2033,13 +2060,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2767,6 +2793,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2981,6 +3008,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 4d40615dc8fc..f27941201ef2 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.47.0.rc0 + diff --git a/sys-kernel/gentoo-sources-6.11.3+/0007-ksm.patch b/sys-kernel/gentoo-sources-6.11.3+/0007-ksm.patch new file mode 100644 index 0000000..cfe58be --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11.3+/0007-ksm.patch @@ -0,0 +1,433 @@ +From 92797c0423c1c2ffc1276ca82f17d01852adbe34 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 10 Oct 2024 12:37:57 +0200 +Subject: [PATCH 07/12] ksm + +Signed-off-by: Peter Jung +--- + arch/alpha/kernel/syscalls/syscall.tbl | 3 + + arch/arm/tools/syscall.tbl | 3 + + arch/m68k/kernel/syscalls/syscall.tbl | 3 + + arch/microblaze/kernel/syscalls/syscall.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + + arch/parisc/kernel/syscalls/syscall.tbl | 3 + + arch/powerpc/kernel/syscalls/syscall.tbl | 3 + + arch/s390/kernel/syscalls/syscall.tbl | 3 + + arch/sh/kernel/syscalls/syscall.tbl | 3 + + arch/sparc/kernel/syscalls/syscall.tbl | 3 + + arch/x86/entry/syscalls/syscall_32.tbl | 3 + + arch/x86/entry/syscalls/syscall_64.tbl | 3 + + arch/xtensa/kernel/syscalls/syscall.tbl | 3 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 9 +- + kernel/sys.c | 138 ++++++++++++++++++ + kernel/sys_ni.c | 3 + + scripts/syscall.tbl | 3 + + .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + + .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + + 22 files changed, 206 
insertions(+), 1 deletion(-) + +diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl +index 74720667fe09..e6a11f3c0a2e 100644 +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -502,3 +502,6 @@ + 570 common lsm_set_self_attr sys_lsm_set_self_attr + 571 common lsm_list_modules sys_lsm_list_modules + 572 common mseal sys_mseal ++573 common process_ksm_enable sys_process_ksm_enable ++574 common process_ksm_disable sys_process_ksm_disable ++575 common process_ksm_status sys_process_ksm_status +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index 23c98203c40f..10a3099decbe 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -477,3 +477,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl +index 22a3cbd4c602..12d2c7594bf0 100644 +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -462,3 +462,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl +index 2b81a6bd78b2..e2a93c856eed 100644 +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -468,3 +468,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl +index 953f5b7dc723..b921fbf56fa6 100644 +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -401,3 +401,6 @@ + 460 n32 lsm_set_self_attr sys_lsm_set_self_attr + 461 n32 lsm_list_modules sys_lsm_list_modules + 462 n32 mseal sys_mseal ++463 n32 process_ksm_enable sys_process_ksm_enable ++464 n32 process_ksm_disable sys_process_ksm_disable ++465 n32 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl +index 1464c6be6eb3..8d7f9ddd66f4 100644 +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -377,3 +377,6 @@ + 460 n64 lsm_set_self_attr sys_lsm_set_self_attr + 461 n64 lsm_list_modules sys_lsm_list_modules + 462 n64 mseal sys_mseal ++463 n64 process_ksm_enable sys_process_ksm_enable ++464 n64 process_ksm_disable sys_process_ksm_disable ++465 n64 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl +index 2439a2491cff..9d6142739954 100644 +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -450,3 +450,6 @@ + 460 o32 
lsm_set_self_attr sys_lsm_set_self_attr + 461 o32 lsm_list_modules sys_lsm_list_modules + 462 o32 mseal sys_mseal ++463 o32 process_ksm_enable sys_process_ksm_enable ++464 o32 process_ksm_disable sys_process_ksm_disable ++465 o32 process_ksm_status sys_process_ksm_status +diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl +index 66dc406b12e4..9d46476fd908 100644 +--- a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -461,3 +461,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl +index ebae8415dfbb..16f71bc2f6f0 100644 +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl +index 01071182763e..7394bad8178e 100644 +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl +index c55fd7696d40..b9fc31221b87 100644 +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -466,3 +466,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl +index cfdfb3707c16..0d79fd772854 100644 +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -508,3 +508,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 534c74b14fab..c546a30575f1 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -468,3 +468,6 @@ + 460 i386 lsm_set_self_attr sys_lsm_set_self_attr + 461 i386 lsm_list_modules sys_lsm_list_modules + 462 i386 mseal sys_mseal ++463 i386 process_ksm_enable 
sys_process_ksm_enable ++464 i386 process_ksm_disable sys_process_ksm_disable ++465 i386 process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index 7093ee21c0d1..0fcd10ba8dfe 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -386,6 +386,9 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl +index 67083fc1b2f5..c1aecee4ad9b 100644 +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -433,3 +433,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 4bcf6754738d..b3ea08e920f7 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 5bf6148cac2b..613e559ad6e0 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) + #define __NR_mseal 462 + __SYSCALL(__NR_mseal, sys_mseal) + ++#define __NR_process_ksm_enable 463 ++__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) ++#define __NR_process_ksm_disable 464 ++__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) ++#define __NR_process_ksm_status 465 ++__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) ++ + #undef __NR_syscalls +-#define __NR_syscalls 463 ++#define __NR_syscalls 466 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/sys.c b/kernel/sys.c +index 3a2df1bd9f64..bc77dc784527 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2789,6 +2789,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + return error; + } + ++#ifdef CONFIG_KSM ++enum pkc_action { ++ PKSM_ENABLE = 0, ++ PKSM_DISABLE, ++ PKSM_STATUS, ++}; ++ ++static long do_process_ksm_control(int pidfd, enum pkc_action action) ++{ ++ long ret; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ ++ task = pidfd_get_task(pidfd, &f_flags); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto 
out; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. */ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ switch (action) { ++ case PKSM_ENABLE: ++ ret = ksm_enable_merge_any(mm); ++ break; ++ case PKSM_DISABLE: ++ ret = ksm_disable_merge_any(mm); ++ break; ++ case PKSM_STATUS: ++ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++out: ++ return ret; ++} ++#endif /* CONFIG_KSM */ ++ ++SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_ENABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_DISABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_STATUS); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t process_ksm_enable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_enable); ++} ++static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); ++ ++static ssize_t process_ksm_disable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_disable); ++} ++static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); ++ ++static ssize_t process_ksm_status_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_status); ++} ++static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); ++ ++static struct attribute *process_ksm_sysfs_attrs[] = { ++ &process_ksm_enable_attr.attr, ++ &process_ksm_disable_attr.attr, ++ &process_ksm_status_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group process_ksm_sysfs_attr_group = { ++ .attrs = process_ksm_sysfs_attrs, ++ .name = "process_ksm", ++}; ++ ++static int __init process_ksm_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); ++} ++subsys_initcall(process_ksm_sysfs_init); ++#endif /* CONFIG_KSM */ ++ + SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) + { +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index c00a86931f8c..d82213d68522 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(process_ksm_enable); ++COND_SYSCALL(process_ksm_disable); ++COND_SYSCALL(process_ksm_status); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); 
+ COND_SYSCALL(get_mempolicy); +diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl +index 845e24eb372e..227d9cc12365 100644 +--- a/scripts/syscall.tbl ++++ b/scripts/syscall.tbl +@@ -403,3 +403,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +index ebae8415dfbb..16f71bc2f6f0 100644 +--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +index 01071182763e..7394bad8178e 100644 +--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +-- +2.47.0.rc0 + diff --git a/sys-kernel/gentoo-sources-6.11.3+/0012-zstd.patch b/sys-kernel/gentoo-sources-6.11.3+/0012-zstd.patch new file mode 100644 index 0000000..552ebb3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11.3+/0012-zstd.patch @@ -0,0 +1,18652 @@ +From 89792579fbd7314abdd8a19d0ee9b510e9bec911 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 10 Oct 2024 12:39:34 +0200 +Subject: [PATCH 12/12] zstd + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 850 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 127 +- + lib/zstd/common/compiler.h | 134 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 34 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 84 +- + lib/zstd/common/fse.h | 94 +- + lib/zstd/common/fse_decompress.c | 130 +- + lib/zstd/common/huf.h | 237 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 28 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 109 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 441 ++-- + lib/zstd/compress/zstd_compress.c | 2111 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 359 ++- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- 
+ lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 376 ++- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 169 +- + lib/zstd/compress/zstd_double_fast.c | 143 +- + lib/zstd/compress/zstd_double_fast.h | 17 +- + lib/zstd/compress/zstd_fast.c | 596 +++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 732 +++--- + lib/zstd/compress/zstd_lazy.h | 138 +- + lib/zstd/compress/zstd_ldm.c | 21 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 497 ++-- + lib/zstd/compress/zstd_opt.h | 41 +- + lib/zstd/decompress/huf_decompress.c | 887 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 358 ++- + lib/zstd/decompress/zstd_decompress_block.c | 708 +++--- + lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 9 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 6577 insertions(+), 3531 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index 113408eef6ec..f109d49f43f8 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..6d5cf55f0bf3 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..6320fedcf8a4 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 6 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). 
++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -183,7 +228,7 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres + /*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +@@ -196,9 +241,9 @@ ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer * + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. + */ +@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +265,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + * using ZSTD_CCtx_set*() functions. 
+ * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +369,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximatively targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,7 +461,6 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer +@@ -412,6 +469,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -421,7 +481,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +490,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +557,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". 
+- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +609,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. ++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +770,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. 
++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be reused multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. 
++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. 
+ * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. 
+ */ +@@ -1071,24 +1180,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + + /* Advanced parameter bounds */ +-#define ZSTD_TARGETCBLOCKSIZE_MIN 64 ++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ + #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX + #define ZSTD_SRCSIZEHINT_MIN 0 + #define ZSTD_SRCSIZEHINT_MAX INT_MAX +@@ -1303,7 +1395,7 @@ typedef enum { + } ZSTD_paramSwitch_e; + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_frameHeader; ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header, or requires larger `srcSize`. 
++ * @return : 0, `zfhPtr` is correctly filled, ++ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. ++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ + } ZSTD_sequenceFormat_e; + ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); ++ + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * WARNING: This function is meant for debugging and informational purposes ONLY! 
++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsSize The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). + */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. 
+ */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. ++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. 
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). 
++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. 
+ * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. 
++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. 
++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
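The deprecation notes above all point to the same replacement pattern. A minimal sketch of that pattern for ZSTD_initCStream_srcSize(), built only from stable advanced-API calls (the helper name is illustrative):

#include <zstd.h>

/* Sketch: advanced-API equivalent of the deprecated
 * ZSTD_initCStream_srcSize(zcs, level, pledgedSrcSize).
 * ZSTD_CStream is the same object as ZSTD_CCtx. */
static size_t init_cstream_modern(ZSTD_CCtx* zcs, int level,
                                  unsigned long long pledgedSrcSize)
{
    size_t ret = ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
    if (ZSTD_isError(ret)) return ret;
    ret = ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, level);
    if (ZSTD_isError(ret)) return ret;
    return ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
}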
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! 
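Likewise on the decompression side, a sketch of the replacement spelled out in the deprecation message for ZSTD_initDStream_usingDDict() (the helper name is illustrative):

#include <zstd.h>

/* Sketch: advanced-API equivalent of ZSTD_initDStream_usingDDict(zds, ddict).
 * ZSTD_DStream is the same object as ZSTD_DCtx; the ddict is only referenced,
 * so it must outlive the decompression session. */
static size_t init_dstream_modern(ZSTD_DCtx* zds, const ZSTD_DDict* ddict)
{
    size_t ret = ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
    if (ZSTD_isError(ret)) return ret;
    return ZSTD_DCtx_refDDict(zds, ddict);
}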
+@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. 
This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. 
++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. 
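A minimal sketch of the registration flow described above, assuming a userspace libzstd that ships the block-level sequence producer API and ZSTD_STATIC_LINKING_ONLY; the producer below simply declines every block, so with fallback enabled the internal parser takes over (a complete producer lives in contrib/externalSequenceProducer upstream):

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Sketch: an external sequence producer that always returns an error,
 * forcing block-by-block fallback to zstd's internal parser. */
static size_t declineAllBlocks(void* state,
                               ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                               const void* src, size_t srcSize,
                               const void* dict, size_t dictSize,
                               int compressionLevel, size_t windowSize)
{
    (void)state; (void)outSeqs; (void)outSeqsCapacity;
    (void)src; (void)srcSize; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    return ZSTD_SEQUENCE_PRODUCER_ERROR;
}

static size_t attach_producer(ZSTD_CCtx* cctx)
{
    ZSTD_registerSequenceProducer(cctx, NULL /* producer state */, declineAllBlocks);
    /* Without this, every block would fail once the producer errors out. */
    return ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
}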
+ * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. ++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). 
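For reference, the "normal streaming API" these deprecation messages point to reduces to a short loop; a minimal userspace sketch of one-shot streaming compression with ZSTD_compressStream2(), assuming dstCapacity >= ZSTD_compressBound(srcSize) so the frame always fits:

#include <zstd.h>

/* Sketch: compress one in-memory buffer with the recommended streaming API.
 * Returns the number of bytes written to dst, or a zstd error code. */
static size_t stream_compress_once(ZSTD_CCtx* cctx,
                                   void* dst, size_t dstCapacity,
                                   const void* src, size_t srcSize)
{
    ZSTD_inBuffer  input  = { src, srcSize, 0 };
    ZSTD_outBuffer output = { dst, dstCapacity, 0 };
    size_t remaining;
    do {
        remaining = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_end);
        if (ZSTD_isError(remaining)) return remaining;
    } while (remaining != 0);   /* 0 means the frame is fully written to output */
    return output.pos;
}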
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. 
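A sketch of the "2-byte header" setup that the block-API deprecation note recommends instead of raw block functions, assuming ZSTD_STATIC_LINKING_ONLY so ZSTD_c_format is visible:

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Sketch: strip the frame down to what the deprecation note describes:
 * magicless format, no content size, no checksum, no dictionary ID. */
static size_t configure_minimal_frame(ZSTD_CCtx* cctx)
{
    size_t ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
    if (ZSTD_isError(ret)) return ret;
    ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
    if (ZSTD_isError(ret)) return ret;
    ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
    if (ZSTD_isError(ret)) return ret;
    return ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, 0);
}

The matching decoder then needs ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless), which is the parameter form of the deprecated ZSTD_DCtx_setFormat() mentioned earlier.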
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..464c410b2768 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..16c3d08e8d1a +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "compiler.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ +diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h +new file mode 100644 +index 000000000000..aa3487ec4b6a +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,149 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
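An illustration only of how the allocation shims above behave (these helpers are internal to lib/zstd; the example assumes a translation unit that already includes common/allocations.h, and my_alloc/my_free are placeholder callbacks):

/* Placeholder callbacks matching the ZSTD_customMem function pointer types. */
static void* my_alloc(void* opaque, size_t size) { (void)opaque; return ZSTD_malloc(size); }
static void  my_free (void* opaque, void* ptr)   { (void)opaque; ZSTD_free(ptr); }

static void custom_mem_example(void)
{
    ZSTD_customMem const cmem = { my_alloc, my_free, NULL /* opaque */ };
    void* const buf = ZSTD_customCalloc(64, cmem);  /* zeroed via customAlloc + ZSTD_memset */
    ZSTD_customFree(buf, cmem);
}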
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
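For orientation, a few concrete values for the bit-scan helpers above (illustration only, assuming a translation unit that includes common/bits.h; the asserts restate the documented behaviour: trailing/leading zero counts, with highbit32 as the index of the highest set bit):

static void bits_helpers_examples(void)
{
    assert(ZSTD_countTrailingZeros32(0x00000008u) == 3);   /* lowest set bit is bit 3 */
    assert(ZSTD_countLeadingZeros32 (0x00000008u) == 28);  /* 32-bit value, top set bit is bit 3 */
    assert(ZSTD_highbit32(0x00000008u) == 3);               /* equals 31 - countLeadingZeros32 */
    assert(ZSTD_countTrailingZeros64(1ULL << 40) == 40);
}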
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..6a13f1f0f1e8 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + /*-******************************************** + * bitStream decoding API (read backward) + **********************************************/ ++typedef size_t BitContainerType; + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); + MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. 
+ * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. +@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. 
+ * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); +@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . +- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. 
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { +- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ ++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ ++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { ++ static const BitContainerType zeroFilled = 0; ++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ ++ /* overflow detected, erroneous scenario or end of stream: no update */ + return BIT_DStream_overflow; ++ } ++ ++ assert(bitD->ptr >= bitD->start); + + if (bitD->ptr >= bitD->limitPtr) { +- return BIT_reloadDStreamFast(bitD); ++ return BIT_reloadDStream_internal(bitD); + } + if (bitD->ptr == bitD->start) { ++ /* reached end of bitStream => no update */ + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } +- /* start < ptr < limitPtr */ ++ /* start < ptr < limitPtr => cautious update */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..508ee25537bb 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,6 +12,8 @@ + #ifndef ZSTD_COMPILER_H + #define ZSTD_COMPILER_H + ++#include ++ + #include "portability_macros.h" + + /*-******************************************************* +@@ -41,12 +44,15 @@ + */ + #define WIN_CDECL + ++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ ++#define UNUSED_ATTR __attribute__((unused)) ++ + /* + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR ++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR + /* + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers +@@ -61,11 +67,21 @@ + #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 + # define HINT_INLINE static INLINE_KEYWORD + #else +-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR ++# define HINT_INLINE FORCE_INLINE_TEMPLATE + #endif + +-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +-#define UNUSED_ATTR __attribute__((unused)) ++/* "soft" inline : ++ * The compiler is free to select if it's a good idea to inline or not. ++ * The main objective is to silence compiler warnings ++ * when a defined function in included but not used. ++ * ++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. ++ * Updating the prefix is probably preferable, but requires a fairly large codemod, ++ * since this name is used everywhere. 
++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,9 +143,9 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ +@@ -179,6 +196,85 @@ + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without trigging ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. 
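The macro changes above (PREFETCH_L1/L2, PREFETCH_AREA, ZSTD_UNREACHABLE, and later RAWLOG/DEBUGLOG) all follow the same pattern: statement-like macros gain a do { ... } while (0) body so each expansion behaves as exactly one statement. A self-contained illustration of the pitfall this avoids, with made-up logging macros:

#include <stdio.h>

/* Hypothetical macros, only to show the idiom the patch applies. */
#define LOG_BRACES(msg)   { printf("%s", msg); fflush(stdout); }
#define LOG_DOWHILE(msg)  do { printf("%s", msg); fflush(stdout); } while (0)

static void example(int err)
{
    /* "if (err) LOG_BRACES(\"boom\"); else ..." fails to compile: the ';'
     * after the closing brace ends the if, leaving the else unattached. */
    if (err)
        LOG_DOWHILE("boom");   /* expands to a single statement */
    else
        printf("ok\n");
}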
++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..8eb6aa9a3b20 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..226ba3c57ec3 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable is only declared, + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) 
{ \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) do { } while (0) /* disabled */ + #endif + + +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..a4062d30d170 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..0410ca415b54 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +90,12 @@ void _force_has_format_string(const char *format, ...) { + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +106,50 @@ void _force_has_format_string(const char *format, ...) { + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) \ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); ++#define FORWARD_IF_ERROR(err, ...) 
\ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + + #endif /* ERROR_H_MODULE */ +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..2185a578617d 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. 
+-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -286,6 +227,7 @@ If there is an error, the function will return an error code, which can be teste + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY + +@@ -317,16 +259,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. 
++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..3a17e84f27bf 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..57462466e188 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). 
+- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ +- ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + +-/* *** Advanced function *** */ + +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif +- +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index 1d9cc03924ca..2e91e7780c1f 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..f08638cced6c 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. 
It MUST not contain any C code. + * +@@ -45,6 +46,8 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif +@@ -65,7 +68,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS, other platforms may +@@ -90,4 +93,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..44b95b25344a 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 2c34e8a33a1c..f931f7d0e294 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. ++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..11da1233e890 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,7 +29,6 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 +@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 + typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 +@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -289,11 +285,11 @@ typedef enum { + typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + +@@ -301,8 +297,8 @@ typedef struct { + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + + /* ZSTD_invalidateRepCodes() : +@@ -420,13 +357,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..44a3c10becf2 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream 
flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. 
+- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). 
++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) 
return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart 
= (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new 
huffman table */ +@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index f620cafca633..0d139727cd39 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +28,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +57,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. 
++ * @param params Validated zstd parameters. + */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame 
header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return 
(size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; + + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); 
++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't reset parameters only when not in init stage."); ++ "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } +@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) + static ZSTD_compressionParameters + ZSTD_clampCParams(ZSTD_compressionParameters cParams) + { +-# define CLAMP_TYPE(cParam, val, type) { \ +- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ +- if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ +- else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ +- } ++# define CLAMP_TYPE(cParam, val, type) \ ++ do { \ ++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ ++ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ ++ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ ++ } while (0) + # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); +@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_cParamMode_e mode, ++ ZSTD_paramSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + 
assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. 
So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((1<strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace +@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 
3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize + : 0; +@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + } + +@@ -1637,6 +1879,19 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t + ZSTD_reset_matchState(ZSTD_matchState_t* ms, +@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp_clean_tables(ws); + } + +- /* opt parser space */ +- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { +- DEBUGLOG(4, "reserving optimal parser space"); +- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); +- ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); +- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); +- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); +- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); +- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); +- } +- + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + } + ++ /* opt parser space */ ++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { ++ DEBUGLOG(4, "reserving optimal parser space"); ++ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); ++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws,
(MaxLL+1) * sizeof(unsigned)); ++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ } ++ + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); +- int resizeWorkspace; ++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + +@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); +- resizeWorkspace = workspaceTooSmall || workspaceWasteful; ++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + +@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm 
== ZSTD_ps_enable) { ++ /* TODO: avoid memset? */ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (ZSTD_hasExtSeqProd(params)) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->extSeqBufCapacity = maxNbExternalSeq; ++ zc->extSeqBuf = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? 
*/ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. ++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ 
params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2647,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2658,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- 
const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* 
seqStorePtr, + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. + */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. 
++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, +@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- 
ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); +@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } +@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; ++ const seqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = seqStore->sequences - inSeqs; ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; + +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. ++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. 
*/ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + +@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); 
++ } ++ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? 
don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, 
sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const 
ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move 
longLengthPos into the correct position if necessary */ +@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). ++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3481,45 +4027,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. 
+ * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). 
++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. 
+@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap 
ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. 
*/ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); + } + ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } ++ } ++ ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. 
*/ ++#endif + break; + + default: +@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + /* We only set the loaded table as valid if it contains all non-zero + * weights. Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + +@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! 
ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, 
cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4811,7 +5450,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4906,6 +5545,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4985,12 +5625,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. 
+ */ +@@ -5000,7 +5645,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5197,30 +5842,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5229,8 +5885,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5245,7 +5903,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5262,8 +5920,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5274,6 +5931,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5281,9 +5952,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5291,9 +5961,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5306,19 +5976,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5388,8 +6055,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5408,22 +6077,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the 
local dict if present. */ +@@ -5437,9 +6106,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5451,6 +6120,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5477,6 +6149,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5491,8 +6165,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5510,13 +6203,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6239,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5549,64 +6250,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5615,25 +6313,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5642,26 +6370,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5673,6 +6390,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5680,7 +6400,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5688,7 +6408,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5702,7 +6422,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. 
So, we have to split the sequence */ +@@ -5742,21 +6461,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5779,7 +6500,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5793,6 +6514,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. 
++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. + * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5805,9 +6577,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5827,22 +6596,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. 
Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5851,6 +6627,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5859,11 +6636,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
+ * This is only an issue for zstd <= v1.4.3 +@@ -5874,12 +6651,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5891,11 +6668,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5906,12 +6682,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5921,7 +6700,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5949,26 +6728,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6090,7 +6877,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6125,3 +6912,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..53cb582a8d2b 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. 
++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. + This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,26 +145,33 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -212,8 +222,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +240,18 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { +@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. */ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +490,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. 
+ * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. 
+ */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; +@@ -673,11 +723,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t 
ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. 
+ */ +@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize, + int forceNonContiguous) + { +@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. 
*/ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. ++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..3e9ea46a670a 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..5c028c78d889 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..7fe6f4ff5cf2 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..41f6521b27cd 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? 
HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) 
return 0; + op += cLitSize; +@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, +- op, oend-op, ++ op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ +- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; ++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } +- return op-ostart; ++ return (size_t)(op-ostart); + } + + static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + +-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, ++typedef struct { ++ size_t estLitSize; ++ size_t estBlockSize; ++} EstimatedBlockSize; ++static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, +@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { +- size_t cSizeEstimate = 0; +- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); +- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, ++ int writeLitEntropy, int writeSeqEntropy) ++{ ++ EstimatedBlockSize ebs; ++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); ++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); +- return cSizeEstimate + ZSTD_blockHeaderSize; ++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; ++ return ebs; + } + + static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe + return 0; + } + ++static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount) ++{ ++ size_t n, total = 0; ++ assert(sp != NULL); ++ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); ++ return total; ++} ++ ++#define BYTESCALE 256 ++ ++static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs, ++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, ++ int firstSubBlock) ++{ ++ size_t n, budget = 0, inSize=0; ++ /* entropy headers */ ++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ ++ assert(firstSubBlock==0 || firstSubBlock==1); ++ budget += headerSize; ++ ++ /* first sequence => at least one sequence*/ ++ budget += sp[0].litLength * avgLitCost + avgSeqCost; ++ if (budget > targetBudget) return 1; ++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); ++ ++ /* loop over sequences */ ++ for (n=1; 
n<nbSeqs; n++) { ++ budget += sp[n].litLength * avgLitCost + avgSeqCost; ++ inSize += sp[n].litLength + (sp[n].mlBase+MINMATCH); ++ if ( (budget > targetBudget) ++ /* though continue to expand until the sub-block is deemed compressible */ ++ && (budget < inSize * BYTESCALE) ) ++ break; ++ } ++ ++ return n; ++} ++ + /* ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. +- * Entropy will be written to the first block. +- * The following blocks will use repeat mode to compress. +- * All sub-blocks are compressed blocks (no raw or rle blocks). +- * @return : compressed size of the super block (which is multiple ZSTD blocks) +- * Or 0 if it failed to compress. */ ++ * Entropy will be written into the first block. ++ * The following blocks use repeat_mode to compress. ++ * Sub-blocks are all compressed, except the last one when beneficial. ++ * @return : compressed size of the super block (which features multiple ZSTD blocks) ++ * or 0 if it failed to compress. */ + static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, +@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + { + const seqDef* const sstart = seqStorePtr->sequencesStart; + const seqDef* const send = seqStorePtr->sequences; +- const seqDef* sp = sstart; ++ const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; ++ size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; +@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; +- size_t targetCBlockSize = cctxParams->targetCBlockSize; +- size_t litSize, seqCount; +- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; ++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ ++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); ++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; +- int lastSequence = 0; +- +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", +- (unsigned)(lend-lp), (unsigned)(send-sstart)); +- +- litSize = 0; +- seqCount = 0; +- do { +- size_t cBlockSizeEstimate = 0; +- if (sstart == send) { +- lastSequence = 1; +- } else { +- const seqDef* const sequence = sp + seqCount; +- lastSequence = sequence == send - 1; +- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; +- seqCount++; +- } +- if (lastSequence) { +- assert(lp <= lend); +- assert(litSize <= (size_t)(lend - lp)); +- litSize = (size_t)(lend - lp); ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", ++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); ++ ++ /* let's start by a general estimation for the full block */ ++ if (nbSeqs > 0) { ++ EstimatedBlockSize const ebs = ++ ZSTD_estimateSubBlockSize(lp, nbLiterals, ++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, ++ &nextCBlock->entropy, entropyMetadata, ++ workspace, wkspSize, ++ writeLitEntropy, writeSeqEntropy); ++ /* quick estimation */ ++ size_t const avgLitCost = nbLiterals ? 
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; ++ const seqDef* seq; + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..86bc3c2c23c7 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. ++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. 
*/ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + } + ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. 
*/ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ +@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + + /* + * Aligned on 64 bytes. These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). 
+ * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. +- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..5ff54f17d92f 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. ++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + 
/* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +276,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +302,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -254,6 +313,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +462,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +490,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +519,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +532,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..b7ddc714f13e 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -15,8 +16,12 @@ + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + + #endif /* ZSTD_DOUBLE_FAST_H */ +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..b7a63ba4ce56 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
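The CDict table fills above write entries through ZSTD_writeTaggedIndex, and the dictMatchState search loops compare them with ZSTD_comparePackedTags, so most dictionary probes are rejected before any cold dictionary memory is read. Roughly, each slot packs the match index together with a short fingerprint of the hash; the sketch below assumes an 8-bit tag and uses its own simplified names, not the kernel definitions:

/* Illustrative packing/probing of tagged hash-table entries; the 8-bit tag
 * width and helper names here are assumptions, not the patched code. */
#include <stddef.h>
#include <stdint.h>
#include <assert.h>

#define TAG_BITS 8u
#define TAG_MASK ((1u << TAG_BITS) - 1u)

/* hashAndTag comes from hashing with (hashLog + TAG_BITS) bits: the high
 * bits select the bucket, the low TAG_BITS bits become the fingerprint. */
void writeTaggedIndex(uint32_t *table, size_t hashAndTag, uint32_t index)
{
    size_t const bucket = hashAndTag >> TAG_BITS;
    uint32_t const tag = (uint32_t)(hashAndTag & TAG_MASK);
    assert((index >> (32 - TAG_BITS)) == 0); /* index must leave room for the tag */
    table[bucket] = (index << TAG_BITS) | tag;
}

/* A probe compares fingerprints first; only on a tag hit does the caller
 * unpack the index (entry >> TAG_BITS) and touch the dictionary bytes,
 * which is what the dictTagsMatchL/S checks above gate on. */
int tagsMatch(uint32_t entry, size_t hashAndTag)
{
    return (entry & TAG_MASK) == (uint32_t)(hashAndTag & TAG_MASK);
}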
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + * + * This is also the work we do at the beginning to enter the loop initially. 
+ */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls, U32 const hasStep) +@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +236,14 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +257,12 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +292,21 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +337,24 @@ ZSTD_compressBlock_fast_noDict_generic( + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. 
++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +365,7 @@ ZSTD_compressBlock_fast_noDict_generic( + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +385,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +399,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) +@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
*/ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); 
/* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { +@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd 
= 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? 
dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. */ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..e64d9e1b2d39 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..3e88d8a1a136 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
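The dictMatchState variants above also consult ms->prefetchCDictTables and, when set, walk the attached dictionary's hash and chain tables with PREFETCH_AREA before entering the search loop, so those tables are likely cache-resident by the time they are probed. A rough equivalent using compiler builtins (the helper below is an assumption for illustration, not the kernel's PREFETCH_AREA):

/* Sketch of prefetching a table region ahead of a search loop; assumes a
 * GCC/Clang-style __builtin_prefetch and a 64-byte cache line. */
#include <stddef.h>

#define CACHE_LINE 64

void prefetchArea(const void *p, size_t len)
{
    const char *cur = (const char *)p;
    const char *const end = cur + len;
    for (; cur < end; cur += CACHE_LINE)
        __builtin_prefetch(cur, 0 /* read */, 3 /* keep in all cache levels */);
}

/* e.g. before the loop, mirroring the hunks above:
 *   prefetchArea(dictHashTable, ((size_t)1 << dictCParams->hashLog) * sizeof(uint32_t));
 */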
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( 
(4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 
ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. +- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. 
++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. 
++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = 
vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every contex reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by intorducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. 
+- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. + */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, +@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> 
kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = 
STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1745,8 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,166 +1760,181 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, 
srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, 
mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2120,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2045,8 +2136,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict( + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..22c9201f4e63 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,98 +23,175 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + + void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define 
ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* 
ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif ++ + + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..07f3bc6437ce 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { +@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..c540731abde7 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..a87b66ac8d24 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,11 +13,14 @@ + #include "hist.h" + #include "zstd_opt.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +30,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s> shift); +- sum += table[s]; ++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics 
*/ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, +@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal( + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, +@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* 
nbMatches, + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltID) + + #endif + +-FORCE_INLINE_TEMPLATE size_t ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t + ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip)); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. 
*/ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { ++ ZSTD_optimal_t const prevMatch = opt[cur]; + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", ++ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
+ */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + +@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", ++ inr-istart, cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,11 +1422,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,21 +1437,27 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return 
ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt( + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ++ seqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. 
+ * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState( + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDict( + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..ac1b743d27cd 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,30 +15,40 @@ + + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ + void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( ++size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDict( + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++ ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + + #endif /* ZSTD_OPT_H */ +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..ac8b87f48f84 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. ++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE 
const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. 
*/ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. +- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). +- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. 
*/ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
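As an illustrative aside on the X1 table-building code touched above (the HUF_DEltX1_set4 hunk): the 2-byte entry is replicated across a 64-bit word so four adjacent DTable cells can be written with a single store. The following is a minimal standalone C sketch of that trick, not part of the vendored patch; the helper name replicate_entry_le is hypothetical.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Mirror of the HUF_DEltX1_set4() idea: build one 16-bit (nbBits, byte) cell,
 * then multiply by 0x0001000100010001 to copy it into all four 16-bit lanes. */
static uint64_t replicate_entry_le(uint8_t symbol, uint8_t nbBits)
{
    uint64_t d4 = (uint64_t)(((unsigned)symbol << 8) + nbBits); /* little-endian cell layout */
    assert(d4 < (1u << 16));                                    /* the value must fit one cell */
    return d4 * 0x0001000100010001ULL;                          /* replicate into four cells */
}

int main(void)
{
    uint16_t cells[4];
    uint64_t const d4 = replicate_entry_le(/*symbol=*/0x41, /*nbBits=*/5);
    memcpy(cells, &d4, sizeof(cells));          /* one 8-byte store fills 4 table cells */
    assert(cells[0] == cells[3] && cells[0] == ((0x41u << 8) | 5u));
    return 0;
}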
+ */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void 
HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. 
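The reload macro above relies on a sentinel 1 bit instead of a separate bits-consumed counter: the container is loaded with `| 1`, symbols shift it left, and counting trailing zeros later recovers exactly how much was consumed. A minimal standalone sketch of that invariant follows (not part of the vendored patch; it assumes a GCC/Clang __builtin_ctzll for the trailing-zero count).

#include <assert.h>
#include <stdint.h>

static int ctz64(uint64_t x) { return __builtin_ctzll(x); }  /* assumes GCC/Clang */

int main(void)
{
    uint64_t bits = 0x0123456789ABCDEFULL | 1;  /* freshly loaded container, sentinel bit set */
    int consumed = 0;

    /* pretend to decode three symbols of 11, 7 and 3 bits */
    int const widths[3] = { 11, 7, 3 };
    for (int i = 0; i < 3; i++) { bits <<= widths[i]; consumed += widths[i]; }

    /* refill step: trailing zeros equal the total bits shifted out so far */
    {   int const ctz     = ctz64(bits);
        int const nbBits  = ctz & 7;    /* leftover bits to re-apply after the byte rewind */
        int const nbBytes = ctz >> 3;   /* whole bytes to step the input pointer back by */
        assert(ctz == consumed);
        assert(nbBytes * 8 + nbBits == consumed);
    }
    return 0;
}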
++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS ++_out: ++ ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. + */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. 
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, 
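For orientation, the 4-stream framing that the finishing loop above walks (segmentSize = (dstSize+3)/4, jump table of three LE16 lengths with the fourth length implied) can be summarised in a small standalone sketch. It is illustrative only, with hypothetical helper names, and applies the same strict minima the patch enforces (srcSize >= 10, dstSize >= 6).

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uint16_t read_le16(const uint8_t *p) { return (uint16_t)(p[0] | (p[1] << 8)); }

/* Split a 4X-compressed block into its four input streams and four output segments. */
static int split_4x(const uint8_t *src, size_t srcSize, size_t dstSize,
                    size_t streamSize[4], size_t segmentSize[4])
{
    size_t const seg = (dstSize + 3) / 4;        /* first three output segments */
    size_t l1, l2, l3, sum;
    if (srcSize < 10 || dstSize < 6) return -1;  /* jump table + 1 byte per stream; 4-way split */
    l1 = read_le16(src); l2 = read_le16(src + 2); l3 = read_le16(src + 4);
    sum = 6 + l1 + l2 + l3;
    if (sum > srcSize) return -1;                /* corrupt jump table */
    streamSize[0] = l1; streamSize[1] = l2; streamSize[2] = l3;
    streamSize[3] = srcSize - sum;               /* last stream size is implied */
    segmentSize[0] = segmentSize[1] = segmentSize[2] = seg;
    segmentSize[3] = dstSize - 3 * seg;          /* last segment takes the remainder */
    return 0;
}

int main(void)
{
    uint8_t const hdr[16] = { 3, 0, 3, 0, 3, 0 }; /* three LE16 stream lengths of 3 bytes each */
    size_t ss[4], gs[4];
    assert(split_4x(hdr, sizeof(hdr), 14, ss, gs) == 0);
    assert(ss[3] == sizeof(hdr) - 6 - 9 && gs[3] == 14 - 3 * 4);
    return 0;
}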
++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* 
DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
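The olimit computation above hoists all bounds checks out of the unrolled body: per iteration each stream performs 5 lookups, emitting at most 10 output bytes (X2 entries are up to 2 bytes) and consuming fewer than 7 input bytes, so one comparison of op[3] against a precomputed limit covers every per-symbol bound. A minimal standalone sketch of that accounting follows; it is illustrative only and the helper name is hypothetical.

#include <assert.h>
#include <stddef.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* How many unrolled iterations are safe before bounds must be rechecked:
 * input side uses the distance from ip[0] to the input floor (every stream
 * sits above ip[0]); output side uses each stream's remaining segment. */
static size_t safe_iterations(size_t inAvail, const size_t outAvail[4])
{
    size_t iters = inAvail / 7;                   /* < 7 bytes consumed per stream per iteration */
    for (int s = 0; s < 4; s++)
        iters = MIN(iters, outAvail[s] / 10);     /* <= 10 bytes produced per stream per iteration */
    return iters;
}

int main(void)
{
    size_t const out[4] = { 1000, 1000, 50, 1000 };
    assert(safe_iterations(700, out) == 5);       /* limited by the third output segment */
    return 0;
}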
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..c9cbc45f6ed9 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* 
note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -540,61 +570,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
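The skippable-frame handling above (readSkippableFrameSize and ZSTD_readSkippableFrame) follows the frame layout from the zstd format: a 4-byte LE magic in the 0x184D2A50..0x184D2A5F range, a 4-byte LE content size, then that many payload bytes, for a total of content size plus an 8-byte header. A minimal standalone sketch of that parse follows; it is illustrative only and is not the zstd API.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t rd_le32(const uint8_t *p)
{ return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24); }

/* Return the total size of a skippable frame at src, or 0 if src does not
 * start with a valid skippable frame that fits inside srcSize. */
static size_t skippable_frame_size(const uint8_t *src, size_t srcSize)
{
    uint32_t magic, contentSize;
    if (srcSize < 8) return 0;                              /* need the full 8-byte header */
    magic = rd_le32(src);
    if ((magic & 0xFFFFFFF0u) != 0x184D2A50u) return 0;     /* not a skippable-frame magic */
    contentSize = rd_le32(src + 4);
    if ((size_t)contentSize + 8 < contentSize) return 0;    /* 32-bit size_t overflow guard */
    if ((size_t)contentSize + 8 > srcSize) return 0;        /* frame overruns the buffer */
    return (size_t)contentSize + 8;
}

int main(void)
{
    uint8_t const frame[12] = { 0x53, 0x2A, 0x4D, 0x18, 4, 0, 0, 0, 'd', 'a', 't', 'a' };
    assert(skippable_frame_size(frame, sizeof(frame)) == 12);
    return 0;
}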
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (U64_MAX - totalDstSize < fcs) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ 
totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : +@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ +@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return 
dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. 
+- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1871,7 @@ size_t 
ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. ++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize 
= ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. 
*/ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..9fe9a12c8a2c 100644 +--- 
a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. 
We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. ++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ + { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; +@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t
const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +-
const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. ++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + 
+ static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequences_t)( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); ++ const ZSTD_longOffset_e isLongOffset); + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- 
const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1982,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. 
++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); +- +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); ++ ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..becffbd89364 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..0f02526be774 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; ++ int isFrameDecompression; + #if DYNAMIC_BMI2 != 0 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index 04e1b5c01d9b..8ecf43226af2 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index f4ed952ed485..7d31518e9d5a 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.47.0.rc0 + diff --git a/sys-kernel/gentoo-sources-6.11/0001-eevdf-next.patch b/sys-kernel/gentoo-sources-6.11/0001-eevdf-next.patch new file mode 100644 index 0000000..392f8fc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11/0001-eevdf-next.patch @@ -0,0 +1,4374 @@ +From 7c2f0545fa986157158c76300a43ab48802d25d3 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 4 Oct 2024 18:04:35 +0200 +Subject: [PATCH] eevdf-next + +Signed-off-by: Peter Jung +--- + Documentation/scheduler/sched-deadline.rst | 14 +- + drivers/cpufreq/cppc_cpufreq.c | 6 +- + fs/bcachefs/six.c | 2 +- + fs/select.c | 2 +- + include/linux/ioprio.h | 2 +- + include/linux/sched.h | 28 +- + include/linux/sched/deadline.h | 14 +- + include/linux/sched/prio.h | 1 + + include/linux/sched/rt.h | 33 +- + include/uapi/linux/sched/types.h | 6 +- + kernel/freezer.c | 2 +- + kernel/locking/rtmutex.c | 4 +- + kernel/locking/rwsem.c | 4 +- + kernel/locking/ww_mutex.h | 2 +- + kernel/sched/core.c | 248 ++++--- + kernel/sched/cpufreq_schedutil.c | 6 +- + kernel/sched/deadline.c | 465 ++++++++++--- + kernel/sched/debug.c | 198 +++++- + kernel/sched/fair.c | 750 ++++++++++++++++----- + kernel/sched/features.h | 30 +- + kernel/sched/idle.c | 23 +- + kernel/sched/rt.c | 261 +++---- + kernel/sched/sched.h | 101 ++- + kernel/sched/stop_task.c | 18 +- + kernel/sched/syscalls.c | 132 +--- + kernel/sched/topology.c | 8 + + kernel/time/hrtimer.c | 6 +- + kernel/trace/trace_sched_wakeup.c | 2 +- + mm/page-writeback.c | 4 +- + mm/page_alloc.c | 2 +- + 30 files changed, 1663 insertions(+), 711 deletions(-) + +diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst +index 9fe4846079bb..22838ed8e13a 100644 +--- a/Documentation/scheduler/sched-deadline.rst ++++ b/Documentation/scheduler/sched-deadline.rst +@@ -749,21 +749,19 @@ Appendix A. Test suite + of the command line options. Please refer to rt-app documentation for more + details (`/doc/*.json`). + +- The second testing application is a modification of schedtool, called +- schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a +- certain pid/application. schedtool-dl is available at: +- https://github.com/scheduler-tools/schedtool-dl.git. ++ The second testing application is done using chrt which has support ++ for SCHED_DEADLINE. + + The usage is straightforward:: + +- # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app ++ # chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app + + With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation +- of 10ms every 100ms (note that parameters are expressed in microseconds). +- You can also use schedtool to create a reservation for an already running ++ of 10ms every 100ms (note that parameters are expressed in nanoseconds). ++ You can also use chrt to create a reservation for an already running + application, given that you know its pid:: + +- # schedtool -E -t 10000000:100000000 my_app_pid ++ # chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid + + Appendix B. 
Minimal main() + ========================== +diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c +index bafa32dd375d..1a5ad184d28f 100644 +--- a/drivers/cpufreq/cppc_cpufreq.c ++++ b/drivers/cpufreq/cppc_cpufreq.c +@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void) + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ +- .sched_runtime = 1000000, +- .sched_deadline = 10000000, +- .sched_period = 10000000, ++ .sched_runtime = NSEC_PER_MSEC, ++ .sched_deadline = 10 * NSEC_PER_MSEC, ++ .sched_period = 10 * NSEC_PER_MSEC, + }; + int ret; + +diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c +index 3a494c5d1247..9cbd3c14c94f 100644 +--- a/fs/bcachefs/six.c ++++ b/fs/bcachefs/six.c +@@ -335,7 +335,7 @@ static inline bool six_owner_running(struct six_lock *lock) + */ + rcu_read_lock(); + struct task_struct *owner = READ_ONCE(lock->owner); +- bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); ++ bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); + rcu_read_unlock(); + + return ret; +diff --git a/fs/select.c b/fs/select.c +index bc185d111436..bc5762b03945 100644 +--- a/fs/select.c ++++ b/fs/select.c +@@ -82,7 +82,7 @@ u64 select_estimate_accuracy(struct timespec64 *tv) + * Realtime tasks get a slack of 0 for obvious reasons. + */ + +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + return 0; + + ktime_get_ts64(&now); +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index db1249cd9692..b25377b6ea98 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task) + { + if (task->policy == SCHED_IDLE) + return IOPRIO_CLASS_IDLE; +- else if (task_is_realtime(task)) ++ else if (rt_or_dl_task_policy(task)) + return IOPRIO_CLASS_RT; + else + return IOPRIO_CLASS_BE; +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f8d150343d42..57cf27a3045c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -149,8 +149,9 @@ struct user_event_mm; + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). + */ +-#define is_special_task_state(state) \ +- ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) ++#define is_special_task_state(state) \ ++ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ ++ TASK_DEAD | TASK_FROZEN)) + + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + # define debug_normal_state_change(state_value) \ +@@ -541,9 +542,14 @@ struct sched_entity { + struct rb_node run_node; + u64 deadline; + u64 min_vruntime; ++ u64 min_slice; + + struct list_head group_node; +- unsigned int on_rq; ++ unsigned char on_rq; ++ unsigned char sched_delayed; ++ unsigned char rel_deadline; ++ unsigned char custom_slice; ++ /* hole */ + + u64 exec_start; + u64 sum_exec_runtime; +@@ -639,12 +645,26 @@ struct sched_dl_entity { + * + * @dl_overrun tells if the task asked to be informed about runtime + * overruns. ++ * ++ * @dl_server tells if this is a server entity. ++ * ++ * @dl_defer tells if this is a deferred or regular server. For ++ * now only defer server exists. ++ * ++ * @dl_defer_armed tells if the deferrable server is waiting ++ * for the replenishment timer to activate it. ++ * ++ * @dl_defer_running tells if the deferrable server is actually ++ * running, skipping the defer phase. 
+ */ + unsigned int dl_throttled : 1; + unsigned int dl_yielded : 1; + unsigned int dl_non_contending : 1; + unsigned int dl_overrun : 1; + unsigned int dl_server : 1; ++ unsigned int dl_defer : 1; ++ unsigned int dl_defer_armed : 1; ++ unsigned int dl_defer_running : 1; + + /* + * Bandwidth enforcement timer. Each -deadline task has its +@@ -672,7 +692,7 @@ struct sched_dl_entity { + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; +- dl_server_pick_f server_pick; ++ dl_server_pick_f server_pick_task; + + #ifdef CONFIG_RT_MUTEXES + /* +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index df3aca89d4f5..3a912ab42bb5 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -10,16 +10,16 @@ + + #include + +-#define MAX_DL_PRIO 0 +- +-static inline int dl_prio(int prio) ++static inline bool dl_prio(int prio) + { +- if (unlikely(prio < MAX_DL_PRIO)) +- return 1; +- return 0; ++ return unlikely(prio < MAX_DL_PRIO); + } + +-static inline int dl_task(struct task_struct *p) ++/* ++ * Returns true if a task has a priority that belongs to DL class. PI-boosted ++ * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. ++ */ ++static inline bool dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..6ab43b4f72f9 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -14,6 +14,7 @@ + */ + + #define MAX_RT_PRIO 100 ++#define MAX_DL_PRIO 0 + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index b2b9e6eb9683..4e3338103654 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -6,19 +6,40 @@ + + struct task_struct; + +-static inline int rt_prio(int prio) ++static inline bool rt_prio(int prio) + { +- if (unlikely(prio < MAX_RT_PRIO)) +- return 1; +- return 0; ++ return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); + } + +-static inline int rt_task(struct task_struct *p) ++static inline bool rt_or_dl_prio(int prio) ++{ ++ return unlikely(prio < MAX_RT_PRIO); ++} ++ ++/* ++ * Returns true if a task has a priority that belongs to RT class. PI-boosted ++ * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. ++ */ ++static inline bool rt_task(struct task_struct *p) + { + return rt_prio(p->prio); + } + +-static inline bool task_is_realtime(struct task_struct *tsk) ++/* ++ * Returns true if a task has a priority that belongs to RT or DL classes. ++ * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore ++ * PI-boosted tasks. ++ */ ++static inline bool rt_or_dl_task(struct task_struct *p) ++{ ++ return rt_or_dl_prio(p->prio); ++} ++ ++/* ++ * Returns true if a task has a policy that belongs to RT or DL classes. ++ * PI-boosted tasks will return false. 
++ */ ++static inline bool rt_or_dl_task_policy(struct task_struct *tsk) + { + int policy = tsk->policy; + +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index 90662385689b..bf6e9ae031c1 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -58,9 +58,9 @@ + * + * This is reflected by the following fields of the sched_attr structure: + * +- * @sched_deadline representative of the task's deadline +- * @sched_runtime representative of the task's runtime +- * @sched_period representative of the task's period ++ * @sched_deadline representative of the task's deadline in nanoseconds ++ * @sched_runtime representative of the task's runtime in nanoseconds ++ * @sched_period representative of the task's period in nanoseconds + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their +diff --git a/kernel/freezer.c b/kernel/freezer.c +index f57aaf96b829..44bbd7dbd2c8 100644 +--- a/kernel/freezer.c ++++ b/kernel/freezer.c +@@ -72,7 +72,7 @@ bool __refrigerator(bool check_kthr_stop) + bool freeze; + + raw_spin_lock_irq(¤t->pi_lock); +- set_current_state(TASK_FROZEN); ++ WRITE_ONCE(current->__state, TASK_FROZEN); + /* unstale saved_state so that __thaw_task() will wake us up */ + current->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(¤t->pi_lock); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index fba1229f1de6..ebebd0eec7f6 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -347,7 +347,7 @@ static __always_inline int __waiter_prio(struct task_struct *task) + { + int prio = task->prio; + +- if (!rt_prio(prio)) ++ if (!rt_or_dl_prio(prio)) + return DEFAULT_PRIO; + + return prio; +@@ -435,7 +435,7 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, + * Note that RT tasks are excluded from same priority (lateral) + * steals to prevent the introduction of an unbounded latency. + */ +- if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) ++ if (rt_or_dl_prio(waiter->tree.prio)) + return false; + + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 3277df47ab3c..299b793d55e1 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -631,7 +631,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + * if it is an RT task or wait in the wait queue + * for too long. 
+ */ +- if (has_handoff || (!rt_task(waiter->task) && ++ if (has_handoff || (!rt_or_dl_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) + return false; + +@@ -916,7 +916,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; +- if (rt_task(current) && ++ if (rt_or_dl_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } +diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h +index 3ad2cc4823e5..76d204b7d29c 100644 +--- a/kernel/locking/ww_mutex.h ++++ b/kernel/locking/ww_mutex.h +@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) + int a_prio = a->task->prio; + int b_prio = b->task->prio; + +- if (rt_prio(a_prio) || rt_prio(b_prio)) { ++ if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) { + + if (a_prio > b_prio) + return true; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index f3951e4a55e5..b4c5d83e54d4 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -163,7 +163,10 @@ static inline int __task_prio(const struct task_struct *p) + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + +- if (rt_prio(p->prio)) /* includes deadline */ ++ if (p->dl_server) ++ return -1; /* deadline */ ++ ++ if (rt_or_dl_prio(p->prio)) + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) +@@ -192,8 +195,24 @@ static inline bool prio_less(const struct task_struct *a, + if (-pb < -pa) + return false; + +- if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ +- return !dl_time_before(a->dl.deadline, b->dl.deadline); ++ if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ ++ const struct sched_dl_entity *a_dl, *b_dl; ++ ++ a_dl = &a->dl; ++ /* ++ * Since,'a' and 'b' can be CFS tasks served by DL server, ++ * __task_prio() can return -1 (for DL) even for those. In that ++ * case, get to the dl_server's DL entity. ++ */ ++ if (a->dl_server) ++ a_dl = a->dl_server; ++ ++ b_dl = &b->dl; ++ if (b->dl_server) ++ b_dl = b->dl_server; ++ ++ return !dl_time_before(a_dl->deadline, b_dl->deadline); ++ } + + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); +@@ -240,6 +259,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) + + void sched_core_enqueue(struct rq *rq, struct task_struct *p) + { ++ if (p->se.sched_delayed) ++ return; ++ + rq->core->core_task_seq++; + + if (!p->core_cookie) +@@ -250,6 +272,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) + + void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) + { ++ if (p->se.sched_delayed) ++ return; ++ + rq->core->core_task_seq++; + + if (sched_core_enqueued(p)) { +@@ -1269,7 +1294,7 @@ bool sched_can_stop_tick(struct rq *rq) + * dequeued by migrating while the constrained task continues to run. + * E.g. going from 2->1 without going through pick_next_task(). 
+ */ +- if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { ++ if (__need_bw_check(rq, rq->curr)) { + if (cfs_task_bw_constrained(rq->curr)) + return false; + } +@@ -1672,6 +1697,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + ++ if (p->se.sched_delayed) ++ return; ++ + for_each_clamp_id(clamp_id) + uclamp_rq_inc_id(rq, p, clamp_id); + +@@ -1696,6 +1724,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + ++ if (p->se.sched_delayed) ++ return; ++ + for_each_clamp_id(clamp_id) + uclamp_rq_dec_id(rq, p, clamp_id); + } +@@ -1975,14 +2006,21 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) + psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); + } + +- uclamp_rq_inc(rq, p); + p->sched_class->enqueue_task(rq, p, flags); ++ /* ++ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear ++ * ->sched_delayed. ++ */ ++ uclamp_rq_inc(rq, p); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); + } + +-void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++/* ++ * Must only return false when DEQUEUE_SLEEP. ++ */ ++inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) + { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p, flags); +@@ -1995,8 +2033,12 @@ void dequeue_task(struct rq *rq, struct task_struct *p, int flags) + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } + ++ /* ++ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' ++ * and mark the task ->sched_delayed. ++ */ + uclamp_rq_dec(rq, p); +- p->sched_class->dequeue_task(rq, p, flags); ++ return p->sched_class->dequeue_task(rq, p, flags); + } + + void activate_task(struct rq *rq, struct task_struct *p, int flags) +@@ -2014,12 +2056,25 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) + + void deactivate_task(struct rq *rq, struct task_struct *p, int flags) + { +- WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); ++ SCHED_WARN_ON(flags & DEQUEUE_SLEEP); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + ++ /* ++ * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* ++ * dequeue_task() and cleared *after* enqueue_task(). ++ */ ++ + dequeue_task(rq, p, flags); + } + ++static void block_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) ++ __block_task(rq, p); ++} ++ + /** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. +@@ -2233,6 +2288,12 @@ void migrate_disable(void) + struct task_struct *p = current; + + if (p->migration_disabled) { ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ *Warn about overflow half-way through the range. ++ */ ++ WARN_ON_ONCE((s16)p->migration_disabled < 0); ++#endif + p->migration_disabled++; + return; + } +@@ -2251,14 +2312,20 @@ void migrate_enable(void) + .flags = SCA_MIGRATE_ENABLE, + }; + ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Check both overflow from migrate_disable() and superfluous ++ * migrate_enable(). 
++ */ ++ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) ++ return; ++#endif ++ + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + +- if (WARN_ON_ONCE(!p->migration_disabled)) +- return; +- + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). +@@ -3607,8 +3674,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + rq->idle_stamp = 0; + } + #endif +- +- p->dl_server = NULL; + } + + /* +@@ -3644,12 +3709,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + + rq = __task_rq_lock(p, &rf); + if (task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ if (p->se.sched_delayed) ++ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ +- update_rq_clock(rq); + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); +@@ -4029,11 +4096,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + * case the whole 'p->on_rq && ttwu_runnable()' case below + * without taking any locks. + * ++ * Specifically, given current runs ttwu() we must be before ++ * schedule()'s block_task(), as such this must not observe ++ * sched_delayed. ++ * + * In particular: + * - we rely on Program-Order guarantees for all the ordering, + * - we're serialized against set_special_state() by virtue of + * it disabling IRQs (this allows not taking ->pi_lock). + */ ++ SCHED_WARN_ON(p->se.sched_delayed); + if (!ttwu_state_match(p, state, &success)) + goto out; + +@@ -4322,9 +4394,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.nr_migrations = 0; + p->se.vruntime = 0; + p->se.vlag = 0; +- p->se.slice = sysctl_sched_base_slice; + INIT_LIST_HEAD(&p->se.group_node); + ++ /* A delayed task cannot be in clone(). */ ++ SCHED_WARN_ON(p->se.sched_delayed); ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; + #endif +@@ -4572,6 +4646,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; + + /* + * We don't need the reset flag anymore after the fork. 
It has +@@ -4686,7 +4762,7 @@ void wake_up_new_task(struct task_struct *p) + update_rq_clock(rq); + post_init_entity_util_avg(p); + +- activate_task(rq, p, ENQUEUE_NOCLOCK); ++ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); + trace_sched_wakeup_new(p); + wakeup_preempt(rq, p, WF_FORK); + #ifdef CONFIG_SMP +@@ -5769,8 +5845,8 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) + schedstat_inc(this_rq()->sched_count); + } + +-static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, +- struct rq_flags *rf) ++static void prev_balance(struct rq *rq, struct task_struct *prev, ++ struct rq_flags *rf) + { + #ifdef CONFIG_SMP + const struct sched_class *class; +@@ -5787,8 +5863,6 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + break; + } + #endif +- +- put_prev_task(rq, prev); + } + + /* +@@ -5800,6 +5874,8 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + const struct sched_class *class; + struct task_struct *p; + ++ rq->dl_server = NULL; ++ + /* + * Optimization: we know that if all tasks are in the fair class we can + * call that function directly, but only if the @prev task wasn't of a +@@ -5815,35 +5891,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + /* Assume the next prioritized class is idle_sched_class */ + if (!p) { +- put_prev_task(rq, prev); +- p = pick_next_task_idle(rq); ++ p = pick_task_idle(rq); ++ put_prev_set_next_task(rq, prev, p); + } + +- /* +- * This is the fast path; it cannot be a DL server pick; +- * therefore even if @p == @prev, ->dl_server must be NULL. +- */ +- if (p->dl_server) +- p->dl_server = NULL; +- + return p; + } + + restart: +- put_prev_task_balance(rq, prev, rf); +- +- /* +- * We've updated @prev and no longer need the server link, clear it. +- * Must be done before ->pick_next_task() because that can (re)set +- * ->dl_server. +- */ +- if (prev->dl_server) +- prev->dl_server = NULL; ++ prev_balance(rq, prev, rf); + + for_each_class(class) { +- p = class->pick_next_task(rq); +- if (p) +- return p; ++ if (class->pick_next_task) { ++ p = class->pick_next_task(rq, prev); ++ if (p) ++ return p; ++ } else { ++ p = class->pick_task(rq); ++ if (p) { ++ put_prev_set_next_task(rq, prev, p); ++ return p; ++ } ++ } + } + + BUG(); /* The idle class should always have a runnable task. */ +@@ -5873,6 +5942,8 @@ static inline struct task_struct *pick_task(struct rq *rq) + const struct sched_class *class; + struct task_struct *p; + ++ rq->dl_server = NULL; ++ + for_each_class(class) { + p = class->pick_task(rq); + if (p) +@@ -5911,6 +5982,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * another cpu during offline. 
+ */ + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + return __pick_next_task(rq, prev, rf); + } + +@@ -5929,16 +6001,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; +- if (next != prev) { +- put_prev_task(rq, prev); +- set_next_task(rq, next); +- } +- ++ rq->dl_server = rq->core_dl_server; + rq->core_pick = NULL; +- goto out; ++ rq->core_dl_server = NULL; ++ goto out_set_next; + } + +- put_prev_task_balance(rq, prev, rf); ++ prev_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; +@@ -5979,6 +6048,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + next = pick_task(rq); + if (!next->core_cookie) { + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. +@@ -6006,7 +6076,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) + update_rq_clock(rq_i); + +- p = rq_i->core_pick = pick_task(rq_i); ++ rq_i->core_pick = p = pick_task(rq_i); ++ rq_i->core_dl_server = rq_i->dl_server; ++ + if (!max || prio_less(max, p, fi_before)) + max = p; + } +@@ -6030,6 +6102,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + + rq_i->core_pick = p; ++ rq_i->core_dl_server = NULL; + + if (p == rq_i->idle) { + if (rq_i->nr_running) { +@@ -6090,6 +6163,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + if (i == cpu) { + rq_i->core_pick = NULL; ++ rq_i->core_dl_server = NULL; + continue; + } + +@@ -6098,6 +6172,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; ++ rq_i->core_dl_server = NULL; + continue; + } + +@@ -6105,8 +6180,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + + out_set_next: +- set_next_task(rq, next); +-out: ++ put_prev_set_next_task(rq, prev, next); + if (rq->core->core_forceidle_count && next == rq->idle) + queue_core_balance(rq); + +@@ -6342,19 +6416,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a +- * preemption from blocking on an 'sleeping' spin/rwlock. Note that +- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to +- * optimize the AND operation out and just check for zero. ++ * preemption from blocking on an 'sleeping' spin/rwlock. + */ +-#define SM_NONE 0x0 +-#define SM_PREEMPT 0x1 +-#define SM_RTLOCK_WAIT 0x2 +- +-#ifndef CONFIG_PREEMPT_RT +-# define SM_MASK_PREEMPT (~0U) +-#else +-# define SM_MASK_PREEMPT SM_PREEMPT +-#endif ++#define SM_IDLE (-1) ++#define SM_NONE 0 ++#define SM_PREEMPT 1 ++#define SM_RTLOCK_WAIT 2 + + /* + * __schedule() is the main scheduler function. +@@ -6395,9 +6462,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * + * WARNING: must be called with preemption disabled! + */ +-static void __sched notrace __schedule(unsigned int sched_mode) ++static void __sched notrace __schedule(int sched_mode) + { + struct task_struct *prev, *next; ++ /* ++ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted ++ * as a preemption by schedule_debug() and RCU. 
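
With SM_MASK_PREEMPT gone, the mode constants above become plain integers and two separate questions replace the old bitmask test: RCU and schedule_debug() treat anything above SM_NONE as a preemption (so SM_RTLOCK_WAIT counts on PREEMPT_RT), while the task-state handling further down only counts SM_PREEMPT. A standalone table of that mapping, as a sketch:

    #include <stdio.h>

    #define SM_IDLE        (-1)
    #define SM_NONE        0
    #define SM_PREEMPT     1
    #define SM_RTLOCK_WAIT 2

    int main(void)
    {
        const int modes[] = { SM_IDLE, SM_NONE, SM_PREEMPT, SM_RTLOCK_WAIT };
        const char *names[] = { "SM_IDLE", "SM_NONE", "SM_PREEMPT", "SM_RTLOCK_WAIT" };

        for (int i = 0; i < 4; i++) {
            int mode = modes[i];
            int rcu_preempt   = mode > SM_NONE;       /* schedule_debug()/RCU view */
            int state_preempt = mode == SM_PREEMPT;   /* task-state handling view  */
            printf("%-14s rcu=%d state=%d\n", names[i], rcu_preempt, state_preempt);
        }
        return 0;
    }
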
++ */ ++ bool preempt = sched_mode > SM_NONE; + unsigned long *switch_count; + unsigned long prev_state; + struct rq_flags rf; +@@ -6408,13 +6480,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) + rq = cpu_rq(cpu); + prev = rq->curr; + +- schedule_debug(prev, !!sched_mode); ++ schedule_debug(prev, preempt); + + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) + hrtick_clear(rq); + + local_irq_disable(); +- rcu_note_context_switch(!!sched_mode); ++ rcu_note_context_switch(preempt); + + /* + * Make sure that signal_pending_state()->signal_pending() below +@@ -6443,22 +6515,32 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + switch_count = &prev->nivcsw; + ++ /* Task state changes only considers SM_PREEMPT as preemption */ ++ preempt = sched_mode == SM_PREEMPT; ++ + /* + * We must load prev->state once (task_struct::state is volatile), such + * that we form a control dependency vs deactivate_task() below. + */ + prev_state = READ_ONCE(prev->__state); +- if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { ++ if (sched_mode == SM_IDLE) { ++ if (!rq->nr_running) { ++ next = prev; ++ goto picked; ++ } ++ } else if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { + WRITE_ONCE(prev->__state, TASK_RUNNING); + } else { ++ int flags = DEQUEUE_NOCLOCK; ++ + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev_state & TASK_FROZEN); + +- if (prev->sched_contributes_to_load) +- rq->nr_uninterruptible++; ++ if (unlikely(is_special_task_state(prev_state))) ++ flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() +@@ -6471,17 +6553,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) + * + * After this, schedule() must not care about p->state any more. 
+ */ +- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); +- +- if (prev->in_iowait) { +- atomic_inc(&rq->nr_iowait); +- delayacct_blkio_start(); +- } ++ block_task(rq, prev, flags); + } + switch_count = &prev->nvcsw; + } + + next = pick_next_task(rq, prev, &rf); ++picked: + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG +@@ -6523,7 +6601,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) + psi_account_irqtime(rq, prev, next); + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + +- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); ++ trace_sched_switch(preempt, prev, next, prev_state); + + /* Also unlocks the rq: */ + rq = context_switch(rq, prev, next, &rf); +@@ -6599,7 +6677,7 @@ static void sched_update_worker(struct task_struct *tsk) + } + } + +-static __always_inline void __schedule_loop(unsigned int sched_mode) ++static __always_inline void __schedule_loop(int sched_mode) + { + do { + preempt_disable(); +@@ -6644,7 +6722,7 @@ void __sched schedule_idle(void) + */ + WARN_ON_ONCE(current->__state); + do { +- __schedule(SM_NONE); ++ __schedule(SM_IDLE); + } while (need_resched()); + } + +@@ -8228,8 +8306,6 @@ void __init sched_init(void) + #endif /* CONFIG_RT_GROUP_SCHED */ + } + +- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); +- + #ifdef CONFIG_SMP + init_defrootdomain(); + #endif +@@ -8284,8 +8360,13 @@ void __init sched_init(void) + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); + #endif /* CONFIG_FAIR_GROUP_SCHED */ + +- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; + #ifdef CONFIG_RT_GROUP_SCHED ++ /* ++ * This is required for init cpu because rt.c:__enable_runtime() ++ * starts working after scheduler_running, which is not the case ++ * yet. ++ */ ++ rq->rt.rt_runtime = global_rt_runtime(); + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); + #endif + #ifdef CONFIG_SMP +@@ -8317,10 +8398,12 @@ void __init sched_init(void) + #endif /* CONFIG_SMP */ + hrtick_rq_init(rq); + atomic_set(&rq->nr_iowait, 0); ++ fair_server_init(rq); + + #ifdef CONFIG_SCHED_CORE + rq->core = rq; + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle_count = 0; +@@ -8333,6 +8416,7 @@ void __init sched_init(void) + } + + set_load_weight(&init_task, false); ++ init_task.se.slice = sysctl_sched_base_slice, + + /* + * The boot idle thread does lazy MMU switching as well: +@@ -8548,7 +8632,7 @@ void normalize_rt_tasks(void) + schedstat_set(p->stats.sleep_start, 0); + schedstat_set(p->stats.block_start, 0); + +- if (!dl_task(p) && !rt_task(p)) { ++ if (!rt_or_dl_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index eece6244f9d2..43111a515a28 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -654,9 +654,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. 
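
The schedutil change here only respells the sugov kthread's fake reservation as 1 ms of runtime every 10 ms using NSEC_PER_MSEC; the numbers are unchanged. For comparison, this is roughly how the same three parameters look from userspace through sched_setattr(2). A hedged sketch: it needs root (or CAP_SYS_NICE) and SCHED_DEADLINE support, and the struct layout follows the man page rather than anything in this patch:

    /* gcc -O2 -o dl_demo dl_demo.c */
    #define _GNU_SOURCE
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #define NSEC_PER_MSEC 1000000ULL
    #define SCHED_DEADLINE 6

    struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
    };

    int main(void)
    {
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size           = sizeof(attr);
        attr.sched_policy   = SCHED_DEADLINE;
        attr.sched_runtime  = 1 * NSEC_PER_MSEC;    /* same values as the hunk */
        attr.sched_deadline = 10 * NSEC_PER_MSEC;
        attr.sched_period   = 10 * NSEC_PER_MSEC;

        if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
            perror("sched_setattr");
            return 1;
        }
        puts("running with a 1ms/10ms deadline reservation");
        return 0;
    }
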
+ */ +- .sched_runtime = 1000000, +- .sched_deadline = 10000000, +- .sched_period = 10000000, ++ .sched_runtime = NSEC_PER_MSEC, ++ .sched_deadline = 10 * NSEC_PER_MSEC, ++ .sched_period = 10 * NSEC_PER_MSEC, + }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index c5a3691ba6cc..9ce93d0bf452 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -320,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) + __sub_running_bw(dl_se->dl_bw, dl_rq); + } + +-static void dl_change_utilization(struct task_struct *p, u64 new_bw) ++static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) + { +- struct rq *rq; +- +- WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); +- +- if (task_on_rq_queued(p)) +- return; ++ if (dl_se->dl_non_contending) { ++ sub_running_bw(dl_se, &rq->dl); ++ dl_se->dl_non_contending = 0; + +- rq = task_rq(p); +- if (p->dl.dl_non_contending) { +- sub_running_bw(&p->dl, &rq->dl); +- p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be canceled, inactive_task_timer() +@@ -340,13 +333,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) + * will not touch the rq's active utilization, + * so we are still safe. + */ +- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) +- put_task_struct(p); ++ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { ++ if (!dl_server(dl_se)) ++ put_task_struct(dl_task_of(dl_se)); ++ } + } +- __sub_rq_bw(p->dl.dl_bw, &rq->dl); ++ __sub_rq_bw(dl_se->dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); + } + ++static void dl_change_utilization(struct task_struct *p, u64 new_bw) ++{ ++ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); ++ ++ if (task_on_rq_queued(p)) ++ return; ++ ++ dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); ++} ++ + static void __dl_clear_params(struct sched_dl_entity *dl_se); + + /* +@@ -771,6 +776,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; ++ ++ /* ++ * If it is a deferred reservation, and the server ++ * is not handling an starvation case, defer it. ++ */ ++ if (dl_se->dl_defer & !dl_se->dl_defer_running) { ++ dl_se->dl_throttled = 1; ++ dl_se->dl_defer_armed = 1; ++ } + } + + /* +@@ -809,6 +823,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) + replenish_dl_new_period(dl_se, rq); + } + ++static int start_dl_timer(struct sched_dl_entity *dl_se); ++static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); ++ + /* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus +@@ -837,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. ++ * ++ * Or, it could be the case of a deferred reservation that ++ * was not able to consume its runtime in background and ++ * reached this point with current u > U. ++ * ++ * In both cases, set a new period. 
+ */ +- if (dl_se->dl_deadline == 0) +- replenish_dl_new_period(dl_se, rq); ++ if (dl_se->dl_deadline == 0 || ++ (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { ++ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; ++ dl_se->runtime = pi_of(dl_se)->dl_runtime; ++ } + + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; +@@ -873,6 +899,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; ++ ++ /* ++ * If this is the replenishment of a deferred reservation, ++ * clear the flag and return. ++ */ ++ if (dl_se->dl_defer_armed) { ++ dl_se->dl_defer_armed = 0; ++ return; ++ } ++ ++ /* ++ * A this point, if the deferred server is not armed, and the deadline ++ * is in the future, if it is not running already, throttle the server ++ * and arm the defer timer. ++ */ ++ if (dl_se->dl_defer && !dl_se->dl_defer_running && ++ dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { ++ if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { ++ ++ /* ++ * Set dl_se->dl_defer_armed and dl_throttled variables to ++ * inform the start_dl_timer() that this is a deferred ++ * activation. ++ */ ++ dl_se->dl_defer_armed = 1; ++ dl_se->dl_throttled = 1; ++ if (!start_dl_timer(dl_se)) { ++ /* ++ * If for whatever reason (delays), a previous timer was ++ * queued but not serviced, cancel it and clean the ++ * deferrable server variables intended for start_dl_timer(). ++ */ ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; ++ } ++ } ++ } + } + + /* +@@ -1023,6 +1087,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) + } + + replenish_dl_new_period(dl_se, rq); ++ } else if (dl_server(dl_se) && dl_se->dl_defer) { ++ /* ++ * The server can still use its previous deadline, so check if ++ * it left the dl_defer_running state. ++ */ ++ if (!dl_se->dl_defer_running) { ++ dl_se->dl_defer_armed = 1; ++ dl_se->dl_throttled = 1; ++ } + } + } + +@@ -1055,8 +1128,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. ++ * ++ * The deferred reservation will have its timer set to ++ * (deadline - runtime). At that point, the CBS rule will decide ++ * if the current deadline can be used, or if a replenishment is ++ * required to avoid add too much pressure on the system ++ * (current u > U). 
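
The "(current u > U)" condition referenced in these comments is the CBS overflow test: would the remaining runtime, spread over the time left until the current deadline, exceed the reserved bandwidth dl_runtime/dl_deadline? A standalone sketch of that inequality using plain cross-multiplication (the kernel's dl_entity_overflow() additionally scales its operands down to avoid 64-bit overflow):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdbool.h>

    static bool cbs_overflow(uint64_t runtime_left, uint64_t deadline,
                             uint64_t now, uint64_t dl_runtime, uint64_t dl_deadline)
    {
        if (deadline <= now)
            return true;    /* deadline already passed */
        /* runtime_left / (deadline - now)  >  dl_runtime / dl_deadline ? */
        return runtime_left * dl_deadline > dl_runtime * (deadline - now);
    }

    int main(void)
    {
        /* 5 ms left to run, 20 ms to the deadline, 10 ms / 100 ms reservation */
        printf("%d\n", cbs_overflow(5000000, 120000000, 100000000,
                                    10000000, 100000000));   /* 1: 25 > 10 percent */
        /* 1 ms left over the same window fits inside the 10 percent budget */
        printf("%d\n", cbs_overflow(1000000, 120000000, 100000000,
                                    10000000, 100000000));   /* 0 */
        return 0;
    }
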
+ */ +- act = ns_to_ktime(dl_next_period(dl_se)); ++ if (dl_se->dl_defer_armed) { ++ WARN_ON_ONCE(!dl_se->dl_throttled); ++ act = ns_to_ktime(dl_se->deadline - dl_se->runtime); ++ } else { ++ /* act = deadline - rel-deadline + period */ ++ act = ns_to_ktime(dl_next_period(dl_se)); ++ } ++ + now = hrtimer_cb_get_time(timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); +@@ -1106,6 +1192,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) + #endif + } + ++/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ ++static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; ++ ++static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) ++{ ++ struct rq *rq = rq_of_dl_se(dl_se); ++ u64 fw; ++ ++ scoped_guard (rq_lock, rq) { ++ struct rq_flags *rf = &scope.rf; ++ ++ if (!dl_se->dl_throttled || !dl_se->dl_runtime) ++ return HRTIMER_NORESTART; ++ ++ sched_clock_tick(); ++ update_rq_clock(rq); ++ ++ if (!dl_se->dl_runtime) ++ return HRTIMER_NORESTART; ++ ++ if (!dl_se->server_has_tasks(dl_se)) { ++ replenish_dl_entity(dl_se); ++ return HRTIMER_NORESTART; ++ } ++ ++ if (dl_se->dl_defer_armed) { ++ /* ++ * First check if the server could consume runtime in background. ++ * If so, it is possible to push the defer timer for this amount ++ * of time. The dl_server_min_res serves as a limit to avoid ++ * forwarding the timer for a too small amount of time. ++ */ ++ if (dl_time_before(rq_clock(dl_se->rq), ++ (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { ++ ++ /* reset the defer timer */ ++ fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; ++ ++ hrtimer_forward_now(timer, ns_to_ktime(fw)); ++ return HRTIMER_RESTART; ++ } ++ ++ dl_se->dl_defer_running = 1; ++ } ++ ++ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); ++ ++ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) ++ resched_curr(rq); ++ ++ __push_dl_task(rq, rf); ++ } ++ ++ return HRTIMER_NORESTART; ++} ++ + /* + * This is the bandwidth enforcement timer callback. 
If here, we know + * a task is not on its dl_rq, since the fact that the timer was running +@@ -1128,28 +1270,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) + struct rq_flags rf; + struct rq *rq; + +- if (dl_server(dl_se)) { +- struct rq *rq = rq_of_dl_se(dl_se); +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- if (dl_se->dl_throttled) { +- sched_clock_tick(); +- update_rq_clock(rq); +- +- if (dl_se->server_has_tasks(dl_se)) { +- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); +- resched_curr(rq); +- __push_dl_task(rq, &rf); +- } else { +- replenish_dl_entity(dl_se); +- } +- +- } +- rq_unlock(rq, &rf); +- +- return HRTIMER_NORESTART; +- } ++ if (dl_server(dl_se)) ++ return dl_server_timer(timer, dl_se); + + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); +@@ -1319,22 +1441,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) + return (delta * u_act) >> BW_SHIFT; + } + +-static inline void +-update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, +- int flags); +-static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) ++s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) + { + s64 scaled_delta_exec; + +- if (unlikely(delta_exec <= 0)) { +- if (unlikely(dl_se->dl_yielded)) +- goto throttle; +- return; +- } +- +- if (dl_entity_is_special(dl_se)) +- return; +- + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. +@@ -1353,8 +1463,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + ++ return scaled_delta_exec; ++} ++ ++static inline void ++update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, ++ int flags); ++static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) ++{ ++ s64 scaled_delta_exec; ++ ++ if (unlikely(delta_exec <= 0)) { ++ if (unlikely(dl_se->dl_yielded)) ++ goto throttle; ++ return; ++ } ++ ++ if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) ++ return; ++ ++ if (dl_entity_is_special(dl_se)) ++ return; ++ ++ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); ++ + dl_se->runtime -= scaled_delta_exec; + ++ /* ++ * The fair server can consume its runtime while throttled (not queued/ ++ * running as regular CFS). ++ * ++ * If the server consumes its entire runtime in this state. The server ++ * is not required for the current period. Thus, reset the server by ++ * starting a new period, pushing the activation. ++ */ ++ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { ++ /* ++ * If the server was previously activated - the starving condition ++ * took place, it this point it went away because the fair scheduler ++ * was able to get runtime in background. So return to the initial ++ * state. ++ */ ++ dl_se->dl_defer_running = 0; ++ ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ ++ replenish_dl_new_period(dl_se, dl_se->rq); ++ ++ /* ++ * Not being able to start the timer seems problematic. If it could not ++ * be started for whatever reason, we need to "unthrottle" the DL server ++ * and queue right away. Otherwise nothing might queue it. That's similar ++ * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. 
++ */ ++ WARN_ON_ONCE(!start_dl_timer(dl_se)); ++ ++ return; ++ } ++ + throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { + dl_se->dl_throttled = 1; +@@ -1381,6 +1547,14 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + resched_curr(rq); + } + ++ /* ++ * The fair server (sole dl_server) does not account for real-time ++ * workload because it is running fair work. ++ */ ++ if (dl_se == &rq->fair_server) ++ return; ++ ++#ifdef CONFIG_RT_GROUP_SCHED + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks +@@ -1405,34 +1579,155 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + rt_rq->rt_time += delta_exec; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } ++#endif ++} ++ ++/* ++ * In the non-defer mode, the idle time is not accounted, as the ++ * server provides a guarantee. ++ * ++ * If the dl_server is in defer mode, the idle time is also considered ++ * as time available for the fair server, avoiding a penalty for the ++ * rt scheduler that did not consumed that time. ++ */ ++void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) ++{ ++ s64 delta_exec, scaled_delta_exec; ++ ++ if (!rq->fair_server.dl_defer) ++ return; ++ ++ /* no need to discount more */ ++ if (rq->fair_server.runtime < 0) ++ return; ++ ++ delta_exec = rq_clock_task(rq) - p->se.exec_start; ++ if (delta_exec < 0) ++ return; ++ ++ scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); ++ ++ rq->fair_server.runtime -= scaled_delta_exec; ++ ++ if (rq->fair_server.runtime < 0) { ++ rq->fair_server.dl_defer_running = 0; ++ rq->fair_server.runtime = 0; ++ } ++ ++ p->se.exec_start = rq_clock_task(rq); + } + + void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) + { +- update_curr_dl_se(dl_se->rq, dl_se, delta_exec); ++ /* 0 runtime = fair server disabled */ ++ if (dl_se->dl_runtime) ++ update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + } + + void dl_server_start(struct sched_dl_entity *dl_se) + { ++ struct rq *rq = dl_se->rq; ++ ++ /* ++ * XXX: the apply do not work fine at the init phase for the ++ * fair server because things are not yet set. We need to improve ++ * this before getting generic. 
++ */ + if (!dl_server(dl_se)) { ++ u64 runtime = 50 * NSEC_PER_MSEC; ++ u64 period = 1000 * NSEC_PER_MSEC; ++ ++ dl_server_apply_params(dl_se, runtime, period, 1); ++ + dl_se->dl_server = 1; ++ dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } ++ ++ if (!dl_se->dl_runtime) ++ return; ++ + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); ++ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) ++ resched_curr(dl_se->rq); + } + + void dl_server_stop(struct sched_dl_entity *dl_se) + { ++ if (!dl_se->dl_runtime) ++ return; ++ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; + } + + void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, +- dl_server_pick_f pick) ++ dl_server_pick_f pick_task) + { + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; +- dl_se->server_pick = pick; ++ dl_se->server_pick_task = pick_task; ++} ++ ++void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) ++{ ++ u64 new_bw = dl_se->dl_bw; ++ int cpu = cpu_of(rq); ++ struct dl_bw *dl_b; ++ ++ dl_b = dl_bw_of(cpu_of(rq)); ++ guard(raw_spinlock)(&dl_b->lock); ++ ++ if (!dl_bw_cpus(cpu)) ++ return; ++ ++ __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); ++} ++ ++int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) ++{ ++ u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); ++ u64 new_bw = to_ratio(period, runtime); ++ struct rq *rq = dl_se->rq; ++ int cpu = cpu_of(rq); ++ struct dl_bw *dl_b; ++ unsigned long cap; ++ int retval = 0; ++ int cpus; ++ ++ dl_b = dl_bw_of(cpu); ++ guard(raw_spinlock)(&dl_b->lock); ++ ++ cpus = dl_bw_cpus(cpu); ++ cap = dl_bw_capacity(cpu); ++ ++ if (__dl_overflow(dl_b, cap, old_bw, new_bw)) ++ return -EBUSY; ++ ++ if (init) { ++ __add_rq_bw(new_bw, &rq->dl); ++ __dl_add(dl_b, new_bw, cpus); ++ } else { ++ __dl_sub(dl_b, dl_se->dl_bw, cpus); ++ __dl_add(dl_b, new_bw, cpus); ++ ++ dl_rq_change_utilization(rq, dl_se, new_bw); ++ } ++ ++ dl_se->dl_runtime = runtime; ++ dl_se->dl_deadline = period; ++ dl_se->dl_period = period; ++ ++ dl_se->runtime = 0; ++ dl_se->deadline = 0; ++ ++ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); ++ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); ++ ++ return retval; + } + + /* +@@ -1729,7 +2024,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ +- if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { ++ if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + +@@ -1751,6 +2046,25 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) + setup_new_dl_entity(dl_se); + } + ++ /* ++ * If the reservation is still throttled, e.g., it got replenished but is a ++ * deferred task and still got to wait, don't enqueue. ++ */ ++ if (dl_se->dl_throttled && start_dl_timer(dl_se)) ++ return; ++ ++ /* ++ * We're about to enqueue, make sure we're not ->dl_throttled! ++ * In case the timer was not started, say because the defer time ++ * has passed, mark as not throttled and mark unarmed. ++ * Also cancel earlier timers, since letting those run is pointless. 
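
dl_server_start() above seeds the fair server with 50 ms of runtime per 1000 ms period before dl_server_apply_params() admits that bandwidth against the DL accounting, and the dl_bw/dl_density values it stores are fixed-point ratios. A small sketch of the arithmetic, assuming the 20-bit BW_SHIFT fixed point mainline's to_ratio() uses:

    #include <stdint.h>
    #include <stdio.h>

    #define BW_SHIFT 20                 /* assumption: matches the kernel fixed point */
    #define BW_UNIT  (1ULL << BW_SHIFT)
    #define NSEC_PER_MSEC 1000000ULL

    /* to_ratio()-style bandwidth: runtime / period in units of 1/2^20 */
    static uint64_t to_ratio(uint64_t period, uint64_t runtime)
    {
        return (runtime << BW_SHIFT) / period;
    }

    int main(void)
    {
        uint64_t runtime = 50 * NSEC_PER_MSEC;     /* dl_server_start() default */
        uint64_t period  = 1000 * NSEC_PER_MSEC;
        uint64_t bw = to_ratio(period, runtime);

        printf("fair server bandwidth: %llu/%llu (~%.1f%% of a CPU)\n",
               (unsigned long long)bw, (unsigned long long)BW_UNIT,
               100.0 * bw / BW_UNIT);
        return 0;
    }
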
++ */ ++ if (dl_se->dl_throttled) { ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; ++ } ++ + __enqueue_dl_entity(dl_se); + } + +@@ -1840,7 +2154,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) + enqueue_pushable_dl_task(rq, p); + } + +-static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) ++static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) + { + update_curr_dl(rq); + +@@ -1850,6 +2164,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); ++ ++ return true; + } + + /* +@@ -2068,6 +2384,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); ++ ++ if (hrtick_enabled(rq)) ++ start_hrtick_dl(rq, &p->dl); + } + + static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) +@@ -2080,7 +2399,11 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) + return __node_2_dle(left); + } + +-static struct task_struct *pick_task_dl(struct rq *rq) ++/* ++ * __pick_next_task_dl - Helper to pick the next -deadline task to run. ++ * @rq: The runqueue to pick the next task from. ++ */ ++static struct task_struct *__pick_task_dl(struct rq *rq) + { + struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; +@@ -2094,14 +2417,13 @@ static struct task_struct *pick_task_dl(struct rq *rq) + WARN_ON_ONCE(!dl_se); + + if (dl_server(dl_se)) { +- p = dl_se->server_pick(dl_se); ++ p = dl_se->server_pick_task(dl_se); + if (!p) { +- WARN_ON_ONCE(1); + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } +- p->dl_server = dl_se; ++ rq->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } +@@ -2109,24 +2431,12 @@ static struct task_struct *pick_task_dl(struct rq *rq) + return p; + } + +-static struct task_struct *pick_next_task_dl(struct rq *rq) ++static struct task_struct *pick_task_dl(struct rq *rq) + { +- struct task_struct *p; +- +- p = pick_task_dl(rq); +- if (!p) +- return p; +- +- if (!p->dl_server) +- set_next_task_dl(rq, p, true); +- +- if (hrtick_enabled(rq)) +- start_hrtick_dl(rq, &p->dl); +- +- return p; ++ return __pick_task_dl(rq); + } + +-static void put_prev_task_dl(struct rq *rq, struct task_struct *p) ++static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) + { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; +@@ -2818,13 +3128,12 @@ DEFINE_SCHED_CLASS(dl) = { + + .wakeup_preempt = wakeup_preempt_dl, + +- .pick_next_task = pick_next_task_dl, ++ .pick_task = pick_task_dl, + .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, + + #ifdef CONFIG_SMP + .balance = balance_dl, +- .pick_task = pick_task_dl, + .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index c1eb9a1afd13..de1dc5264b3f 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = { + .release = seq_release, + }; + ++enum dl_param { ++ DL_RUNTIME = 0, ++ DL_PERIOD, ++}; ++ ++static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ ++static unsigned long 
fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ ++ ++static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos, enum dl_param param) ++{ ++ long cpu = (long) ((struct seq_file *) filp->private_data)->private; ++ struct rq *rq = cpu_rq(cpu); ++ u64 runtime, period; ++ size_t err; ++ int retval; ++ u64 value; ++ ++ err = kstrtoull_from_user(ubuf, cnt, 10, &value); ++ if (err) ++ return err; ++ ++ scoped_guard (rq_lock_irqsave, rq) { ++ runtime = rq->fair_server.dl_runtime; ++ period = rq->fair_server.dl_period; ++ ++ switch (param) { ++ case DL_RUNTIME: ++ if (runtime == value) ++ break; ++ runtime = value; ++ break; ++ case DL_PERIOD: ++ if (value == period) ++ break; ++ period = value; ++ break; ++ } ++ ++ if (runtime > period || ++ period > fair_server_period_max || ++ period < fair_server_period_min) { ++ return -EINVAL; ++ } ++ ++ if (rq->cfs.h_nr_running) { ++ update_rq_clock(rq); ++ dl_server_stop(&rq->fair_server); ++ } ++ ++ retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); ++ if (retval) ++ cnt = retval; ++ ++ if (!runtime) ++ printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", ++ cpu_of(rq)); ++ ++ if (rq->cfs.h_nr_running) ++ dl_server_start(&rq->fair_server); ++ } ++ ++ *ppos += cnt; ++ return cnt; ++} ++ ++static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) ++{ ++ unsigned long cpu = (unsigned long) m->private; ++ struct rq *rq = cpu_rq(cpu); ++ u64 value; ++ ++ switch (param) { ++ case DL_RUNTIME: ++ value = rq->fair_server.dl_runtime; ++ break; ++ case DL_PERIOD: ++ value = rq->fair_server.dl_period; ++ break; ++ } ++ ++ seq_printf(m, "%llu\n", value); ++ return 0; ++ ++} ++ ++static ssize_t ++sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); ++} ++ ++static int sched_fair_server_runtime_show(struct seq_file *m, void *v) ++{ ++ return sched_fair_server_show(m, v, DL_RUNTIME); ++} ++ ++static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_fair_server_runtime_show, inode->i_private); ++} ++ ++static const struct file_operations fair_server_runtime_fops = { ++ .open = sched_fair_server_runtime_open, ++ .write = sched_fair_server_runtime_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++static ssize_t ++sched_fair_server_period_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); ++} ++ ++static int sched_fair_server_period_show(struct seq_file *m, void *v) ++{ ++ return sched_fair_server_show(m, v, DL_PERIOD); ++} ++ ++static int sched_fair_server_period_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_fair_server_period_show, inode->i_private); ++} ++ ++static const struct file_operations fair_server_period_fops = { ++ .open = sched_fair_server_period_open, ++ .write = sched_fair_server_period_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ + static struct dentry *debugfs_sched; + ++static void debugfs_fair_server_init(void) ++{ ++ struct dentry *d_fair; ++ unsigned long cpu; ++ ++ d_fair = debugfs_create_dir("fair_server", debugfs_sched); ++ if (!d_fair) ++ return; ++ ++ for_each_possible_cpu(cpu) { ++ struct dentry 
*d_cpu; ++ char buf[32]; ++ ++ snprintf(buf, sizeof(buf), "cpu%lu", cpu); ++ d_cpu = debugfs_create_dir(buf, d_fair); ++ ++ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); ++ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); ++ } ++} ++ + static __init int sched_init_debug(void) + { + struct dentry __maybe_unused *numa; +@@ -374,6 +531,8 @@ static __init int sched_init_debug(void) + + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + ++ debugfs_fair_server_init(); ++ + return 0; + } + late_initcall(sched_init_debug); +@@ -580,27 +739,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', + SPLIT_NS(p->se.deadline), ++ p->se.custom_slice ? 'S' : ' ', + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); + +- SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", ++ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", + SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), +- SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + + #ifdef CONFIG_NUMA_BALANCING +- SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); ++ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif + #ifdef CONFIG_CGROUP_SCHED +- SEQ_printf_task_group_path(m, task_group(p), " %s") ++ SEQ_printf_task_group_path(m, task_group(p), " %s") + #endif + + SEQ_printf(m, "\n"); +@@ -612,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); +- SEQ_printf(m, " S task PID tree-key switches prio" +- " wait-time sum-exec sum-sleep\n"); ++ SEQ_printf(m, " S task PID vruntime eligible " ++ "deadline slice sum-exec switches " ++ "prio wait-time sum-sleep sum-block" ++#ifdef CONFIG_NUMA_BALANCING ++ " node group-id" ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ " group-path" ++#endif ++ "\n"); + SEQ_printf(m, "-------------------------------------------------------" +- "------------------------------------------------------\n"); ++ "------------------------------------------------------" ++ "------------------------------------------------------" ++#ifdef CONFIG_NUMA_BALANCING ++ "--------------" ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ "--------------" ++#endif ++ "\n"); + + rcu_read_lock(); + for_each_process_thread(g, p) { +@@ -641,8 +816,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); + #endif +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", +- SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_rq_lock_irqsave(rq, flags); + root = __pick_root_entity(cfs_rq); +@@ -669,8 +842,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SPLIT_NS(right_vruntime)); + spread = right_vruntime - left_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); +- SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", +- cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", 
cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", +@@ -730,9 +901,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + PU(rt_nr_running); ++ ++#ifdef CONFIG_RT_GROUP_SCHED + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); ++#endif + + #undef PN + #undef PU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 91b242e47db7..c89e7f1693d4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -792,8 +792,22 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + } + + /* ensure we never gain time by being placed backwards. */ +- u64_u32_store(cfs_rq->min_vruntime, +- __update_min_vruntime(cfs_rq, vruntime)); ++ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); ++} ++ ++static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) ++{ ++ struct sched_entity *root = __pick_root_entity(cfs_rq); ++ struct sched_entity *curr = cfs_rq->curr; ++ u64 min_slice = ~0ULL; ++ ++ if (curr && curr->on_rq) ++ min_slice = curr->slice; ++ ++ if (root) ++ min_slice = min(min_slice, root->min_slice); ++ ++ return min_slice; + } + + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -812,19 +826,34 @@ static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node + } + } + ++static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) ++{ ++ if (node) { ++ struct sched_entity *rse = __node_2_se(node); ++ if (rse->min_slice < se->min_slice) ++ se->min_slice = rse->min_slice; ++ } ++} ++ + /* + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) + */ + static inline bool min_vruntime_update(struct sched_entity *se, bool exit) + { + u64 old_min_vruntime = se->min_vruntime; ++ u64 old_min_slice = se->min_slice; + struct rb_node *node = &se->run_node; + + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); + +- return se->min_vruntime == old_min_vruntime; ++ se->min_slice = se->slice; ++ __min_slice_update(se, node->rb_right); ++ __min_slice_update(se, node->rb_left); ++ ++ return se->min_vruntime == old_min_vruntime && ++ se->min_slice == old_min_slice; + } + + RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, +@@ -837,6 +866,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + avg_vruntime_add(cfs_rq, se); + se->min_vruntime = se->vruntime; ++ se->min_slice = se->slice; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_vruntime_cb); + } +@@ -987,17 +1017,18 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i + * this is probably good enough. + */ +-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + if ((s64)(se->vruntime - se->deadline) < 0) +- return; ++ return false; + + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. 
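
The per-CPU fair_server debugfs files introduced a bit earlier in this patch (runtime and period, in nanoseconds) are plain text files. A hypothetical userspace reader, assuming debugfs is mounted at /sys/kernel/debug and the scheduler debug directory is named sched as in current mainline; adjust the path if your setup differs:

    #include <stdio.h>

    /* Print the fair-server runtime/period (ns) for one CPU; the path layout
     * is taken from the debugfs_fair_server_init() hunk.  Needs root. */
    static int read_param(int cpu, const char *name, unsigned long long *val)
    {
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/kernel/debug/sched/fair_server/cpu%d/%s", cpu, name);
        f = fopen(path, "r");
        if (!f)
            return -1;
        if (fscanf(f, "%llu", val) != 1) {
            fclose(f);
            return -1;
        }
        fclose(f);
        return 0;
    }

    int main(void)
    {
        unsigned long long runtime, period;

        if (read_param(0, "runtime", &runtime) || read_param(0, "period", &period)) {
            perror("fair_server");
            return 1;
        }
        printf("cpu0 fair server: %llu ns every %llu ns\n", runtime, period);
        return 0;
    }
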
+ */ +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + + /* + * EEVDF: vd_i = ve_i + r_i / w_i +@@ -1007,10 +1038,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + /* + * The task has consumed its request, reschedule. + */ +- if (cfs_rq->nr_running > 1) { +- resched_curr(rq_of(cfs_rq)); +- clear_buddies(cfs_rq, se); +- } ++ return true; + } + + #include "pelt.h" +@@ -1148,6 +1176,38 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) + dl_server_update(p->dl_server, delta_exec); + } + ++static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ if (!sched_feat(PREEMPT_SHORT)) ++ return false; ++ ++ if (curr->vlag == curr->deadline) ++ return false; ++ ++ return !entity_eligible(cfs_rq, curr); ++} ++ ++static inline bool do_preempt_short(struct cfs_rq *cfs_rq, ++ struct sched_entity *pse, struct sched_entity *se) ++{ ++ if (!sched_feat(PREEMPT_SHORT)) ++ return false; ++ ++ if (pse->slice >= se->slice) ++ return false; ++ ++ if (!entity_eligible(cfs_rq, pse)) ++ return false; ++ ++ if (entity_before(pse, se)) ++ return true; ++ ++ if (!entity_eligible(cfs_rq, se)) ++ return true; ++ ++ return false; ++} ++ + /* + * Used by other classes to account runtime. + */ +@@ -1169,23 +1229,44 @@ s64 update_curr_common(struct rq *rq) + static void update_curr(struct cfs_rq *cfs_rq) + { + struct sched_entity *curr = cfs_rq->curr; ++ struct rq *rq = rq_of(cfs_rq); + s64 delta_exec; ++ bool resched; + + if (unlikely(!curr)) + return; + +- delta_exec = update_curr_se(rq_of(cfs_rq), curr); ++ delta_exec = update_curr_se(rq, curr); + if (unlikely(delta_exec <= 0)) + return; + + curr->vruntime += calc_delta_fair(delta_exec, curr); +- update_deadline(cfs_rq, curr); ++ resched = update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); + +- if (entity_is_task(curr)) +- update_curr_task(task_of(curr), delta_exec); ++ if (entity_is_task(curr)) { ++ struct task_struct *p = task_of(curr); ++ ++ update_curr_task(p, delta_exec); ++ ++ /* ++ * Any fair task that runs outside of fair_server should ++ * account against fair_server such that it can account for ++ * this time and possibly avoid running this period. 
++ */ ++ if (p->dl_server != &rq->fair_server) ++ dl_server_update(&rq->fair_server, delta_exec); ++ } + + account_cfs_rq_runtime(cfs_rq, delta_exec); ++ ++ if (cfs_rq->nr_running == 1) ++ return; ++ ++ if (resched || did_preempt_short(cfs_rq, curr)) { ++ resched_curr(rq); ++ clear_buddies(cfs_rq, curr); ++ } + } + + static void update_curr_fair(struct rq *rq) +@@ -5200,7 +5281,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); + + /* +@@ -5281,6 +5363,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + se->vruntime = vruntime - lag; + ++ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { ++ se->deadline += se->vruntime; ++ se->rel_deadline = 0; ++ return; ++ } ++ + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -5300,6 +5388,9 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + + static inline bool cfs_bandwidth_used(void); + ++static void ++requeue_delayed_entity(struct sched_entity *se); ++ + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +@@ -5387,19 +5478,47 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +-static void ++static inline void finish_delayed_dequeue_entity(struct sched_entity *se) ++{ ++ se->sched_delayed = 0; ++ if (sched_feat(DELAY_ZERO) && se->vlag > 0) ++ se->vlag = 0; ++} ++ ++static bool + dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- int action = UPDATE_TG; ++ bool sleep = flags & DEQUEUE_SLEEP; ++ ++ update_curr(cfs_rq); ++ ++ if (flags & DEQUEUE_DELAYED) { ++ SCHED_WARN_ON(!se->sched_delayed); ++ } else { ++ bool delay = sleep; ++ /* ++ * DELAY_DEQUEUE relies on spurious wakeups, special task ++ * states must not suffer spurious wakeups, excempt them. ++ */ ++ if (flags & DEQUEUE_SPECIAL) ++ delay = false; ++ ++ SCHED_WARN_ON(delay && se->sched_delayed); + ++ if (sched_feat(DELAY_DEQUEUE) && delay && ++ !entity_eligible(cfs_rq, se)) { ++ if (cfs_rq->next == se) ++ cfs_rq->next = NULL; ++ update_load_avg(cfs_rq, se, 0); ++ se->sched_delayed = 1; ++ return false; ++ } ++ } ++ ++ int action = UPDATE_TG; + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + +- /* +- * Update run-time statistics of the 'current'. +- */ +- update_curr(cfs_rq); +- + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. 
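
The PLACE_REL_DEADLINE handling split across dequeue_entity() and place_entity() below preserves a task's remaining request across a non-sleep dequeue: the deadline is stored relative to vruntime and re-anchored at the next placement. A toy save/restore with plain integers, not the kernel structures:

    #include <stdint.h>
    #include <stdio.h>

    struct toy_se {
        uint64_t vruntime;
        uint64_t deadline;
        int      rel_deadline;
    };

    /* non-sleep dequeue: remember the deadline relative to vruntime */
    static void toy_dequeue(struct toy_se *se)
    {
        se->deadline -= se->vruntime;
        se->rel_deadline = 1;
    }

    /* placement on (possibly another) cfs_rq: re-anchor to the new vruntime */
    static void toy_place(struct toy_se *se, uint64_t new_vruntime)
    {
        se->vruntime = new_vruntime;
        if (se->rel_deadline) {
            se->deadline += se->vruntime;
            se->rel_deadline = 0;
        }
    }

    int main(void)
    {
        struct toy_se se = { .vruntime = 1000, .deadline = 1300 };

        toy_dequeue(&se);       /* deadline becomes a relative +300 */
        toy_place(&se, 5000);   /* re-anchored: deadline is now 5300 */
        printf("vruntime=%llu deadline=%llu\n",
               (unsigned long long)se.vruntime, (unsigned long long)se.deadline);
        return 0;
    }
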
+@@ -5417,6 +5536,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + clear_buddies(cfs_rq, se); + + update_entity_lag(cfs_rq, se); ++ if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { ++ se->deadline -= se->vruntime; ++ se->rel_deadline = 1; ++ } ++ + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; +@@ -5436,8 +5560,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); + ++ if (flags & DEQUEUE_DELAYED) ++ finish_delayed_dequeue_entity(se); ++ + if (cfs_rq->nr_running == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); ++ ++ return true; + } + + static void +@@ -5463,6 +5592,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + } + + update_stats_curr_start(cfs_rq, se); ++ SCHED_WARN_ON(cfs_rq->curr); + cfs_rq->curr = se; + + /* +@@ -5483,6 +5613,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + ++static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); ++ + /* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups +@@ -5491,16 +5623,26 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + * 4) do not run the "skip" process, if something else is available + */ + static struct sched_entity * +-pick_next_entity(struct cfs_rq *cfs_rq) ++pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) + { + /* + * Enabling NEXT_BUDDY will affect latency but not fairness. + */ + if (sched_feat(NEXT_BUDDY) && +- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { ++ /* ->next will never be delayed */ ++ SCHED_WARN_ON(cfs_rq->next->sched_delayed); + return cfs_rq->next; ++ } + +- return pick_eevdf(cfs_rq); ++ struct sched_entity *se = pick_eevdf(cfs_rq); ++ if (se->sched_delayed) { ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ SCHED_WARN_ON(se->sched_delayed); ++ SCHED_WARN_ON(se->on_rq); ++ return NULL; ++ } ++ return se; + } + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -5524,6 +5666,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + /* in !on_rq case, update occurred at dequeue */ + update_load_avg(cfs_rq, prev, 0); + } ++ SCHED_WARN_ON(cfs_rq->curr != prev); + cfs_rq->curr = NULL; + } + +@@ -5787,6 +5930,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, idle_task_delta, dequeue = 1; ++ long rq_h_nr_running = rq->cfs.h_nr_running; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ +@@ -5820,11 +5964,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); ++ int flags; ++ + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + +- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); ++ /* ++ * Abuse SPECIAL to avoid delayed dequeue in this instance. ++ * This avoids teaching dequeue_entities() about throttled ++ * entities and keeps things relatively simple. 
++ */ ++ flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; ++ if (se->sched_delayed) ++ flags |= DEQUEUE_DELAYED; ++ dequeue_entity(qcfs_rq, se, flags); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; +@@ -5858,6 +6012,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + ++ /* Stop the fair server if throttling resulted in no runnable tasks */ ++ if (rq_h_nr_running && !rq->cfs.h_nr_running) ++ dl_server_stop(&rq->fair_server); + done: + /* + * Note: distribution will already see us throttled via the +@@ -5876,6 +6033,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, idle_task_delta; ++ long rq_h_nr_running = rq->cfs.h_nr_running; + + se = cfs_rq->tg->se[cpu_of(rq)]; + +@@ -5913,7 +6071,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + +- if (se->on_rq) ++ /* Handle any unfinished DELAY_DEQUEUE business first. */ ++ if (se->sched_delayed) { ++ int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; ++ ++ dequeue_entity(qcfs_rq, se, flags); ++ } else if (se->on_rq) + break; + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + +@@ -5945,6 +6108,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + goto unthrottle_throttle; + } + ++ /* Start the fair server if un-throttling resulted in new runnable tasks */ ++ if (!rq_h_nr_running && rq->cfs.h_nr_running) ++ dl_server_start(&rq->fair_server); ++ + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); + +@@ -6577,7 +6744,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) + { + int cpu = cpu_of(rq); + +- if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) ++ if (!cfs_bandwidth_used()) + return; + + if (!tick_nohz_full_cpu(cpu)) +@@ -6760,6 +6927,37 @@ static int sched_idle_cpu(int cpu) + } + #endif + ++static void ++requeue_delayed_entity(struct sched_entity *se) ++{ ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ ++ /* ++ * se->sched_delayed should imply: se->on_rq == 1. ++ * Because a delayed entity is one that is still on ++ * the runqueue competing until elegibility. ++ */ ++ SCHED_WARN_ON(!se->sched_delayed); ++ SCHED_WARN_ON(!se->on_rq); ++ ++ if (sched_feat(DELAY_ZERO)) { ++ update_entity_lag(cfs_rq, se); ++ if (se->vlag > 0) { ++ cfs_rq->nr_running--; ++ if (se != cfs_rq->curr) ++ __dequeue_entity(cfs_rq, se); ++ se->vlag = 0; ++ place_entity(cfs_rq, se, 0); ++ if (se != cfs_rq->curr) ++ __enqueue_entity(cfs_rq, se); ++ cfs_rq->nr_running++; ++ } ++ } ++ ++ update_load_avg(cfs_rq, se, 0); ++ se->sched_delayed = 0; ++} ++ + /* + * The enqueue_task method is called before nr_running is + * increased. Here we update the fair scheduling stats and +@@ -6772,6 +6970,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); ++ int rq_h_nr_running = rq->cfs.h_nr_running; ++ u64 slice = 0; + + /* + * The code below (indirectly) updates schedutil which looks at +@@ -6779,7 +6979,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. 
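
requeue_delayed_entity() above handles a delayed entity that gets woken again: under the DELAY_ZERO feature, if its lag has turned positive it is briefly taken out of the tree, the lag is dropped, and it is re-placed; in every case the sched_delayed mark is cleared. A toy mirror of that decision:

    #include <stdio.h>

    struct toy_se { long vlag; int sched_delayed; };

    /* Returns 1 when the entity would have been re-placed (positive lag). */
    static int requeue_delayed(struct toy_se *se)
    {
        int replaced = 0;

        if (se->vlag > 0) {   /* would: __dequeue, place_entity, __enqueue */
            se->vlag = 0;
            replaced = 1;
        }
        se->sched_delayed = 0;
        return replaced;
    }

    int main(void)
    {
        struct toy_se a = { .vlag = 300, .sched_delayed = 1 };
        struct toy_se b = { .vlag = -50, .sched_delayed = 1 };
        int ra = requeue_delayed(&a);
        int rb = requeue_delayed(&b);

        printf("a: replaced=%d vlag=%ld\n", ra, a.vlag);   /* 1, 0   */
        printf("b: replaced=%d vlag=%ld\n", rb, b.vlag);   /* 0, -50 */
        return 0;
    }
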
+ */ +- util_est_enqueue(&rq->cfs, p); ++ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) ++ util_est_enqueue(&rq->cfs, p); ++ ++ if (flags & ENQUEUE_DELAYED) { ++ requeue_delayed_entity(se); ++ return; ++ } + + /* + * If in_iowait is set, the code below may not trigger any cpufreq +@@ -6790,10 +6996,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + + for_each_sched_entity(se) { +- if (se->on_rq) ++ if (se->on_rq) { ++ if (se->sched_delayed) ++ requeue_delayed_entity(se); + break; ++ } + cfs_rq = cfs_rq_of(se); ++ ++ /* ++ * Basically set the slice of group entries to the min_slice of ++ * their respective cfs_rq. This ensures the group can service ++ * its entities in the desired time-frame. ++ */ ++ if (slice) { ++ se->slice = slice; ++ se->custom_slice = 1; ++ } + enqueue_entity(cfs_rq, se, flags); ++ slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; +@@ -6815,6 +7035,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se_update_runnable(se); + update_cfs_group(se); + ++ se->slice = slice; ++ slice = cfs_rq_min_slice(cfs_rq); ++ + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + +@@ -6826,6 +7049,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + goto enqueue_throttle; + } + ++ if (!rq_h_nr_running && rq->cfs.h_nr_running) { ++ /* Account for idle runtime */ ++ if (!rq->nr_running) ++ dl_server_update_idle_time(rq, rq->curr); ++ dl_server_start(&rq->fair_server); ++ } ++ + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); + +@@ -6855,36 +7085,59 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + static void set_next_buddy(struct sched_entity *se); + + /* +- * The dequeue_task method is called before nr_running is +- * decreased. We remove the task from the rbtree and +- * update the fair scheduling stats: ++ * Basically dequeue_task_fair(), except it can deal with dequeue_entity() ++ * failing half-way through and resume the dequeue later. 
++ * ++ * Returns: ++ * -1 - dequeue delayed ++ * 0 - dequeue throttled ++ * 1 - dequeue complete + */ +-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ++static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) + { +- struct cfs_rq *cfs_rq; +- struct sched_entity *se = &p->se; +- int task_sleep = flags & DEQUEUE_SLEEP; +- int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); ++ int rq_h_nr_running = rq->cfs.h_nr_running; ++ bool task_sleep = flags & DEQUEUE_SLEEP; ++ bool task_delayed = flags & DEQUEUE_DELAYED; ++ struct task_struct *p = NULL; ++ int idle_h_nr_running = 0; ++ int h_nr_running = 0; ++ struct cfs_rq *cfs_rq; ++ u64 slice = 0; + +- util_est_dequeue(&rq->cfs, p); ++ if (entity_is_task(se)) { ++ p = task_of(se); ++ h_nr_running = 1; ++ idle_h_nr_running = task_has_idle_policy(p); ++ } else { ++ cfs_rq = group_cfs_rq(se); ++ slice = cfs_rq_min_slice(cfs_rq); ++ } + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +- dequeue_entity(cfs_rq, se, flags); + +- cfs_rq->h_nr_running--; ++ if (!dequeue_entity(cfs_rq, se, flags)) { ++ if (p && &p->se == se) ++ return -1; ++ ++ break; ++ } ++ ++ cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + if (cfs_rq_is_idle(cfs_rq)) +- idle_h_nr_running = 1; ++ idle_h_nr_running = h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) +- goto dequeue_throttle; ++ return 0; + + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) { ++ slice = cfs_rq_min_slice(cfs_rq); ++ + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + /* +@@ -6896,6 +7149,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + break; + } + flags |= DEQUEUE_SLEEP; ++ flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); + } + + for_each_sched_entity(se) { +@@ -6905,28 +7159,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se_update_runnable(se); + update_cfs_group(se); + +- cfs_rq->h_nr_running--; ++ se->slice = slice; ++ slice = cfs_rq_min_slice(cfs_rq); ++ ++ cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + if (cfs_rq_is_idle(cfs_rq)) +- idle_h_nr_running = 1; ++ idle_h_nr_running = h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) +- goto dequeue_throttle; +- ++ return 0; + } + +- /* At this point se is NULL and we are at root level*/ +- sub_nr_running(rq, 1); ++ sub_nr_running(rq, h_nr_running); ++ ++ if (rq_h_nr_running && !rq->cfs.h_nr_running) ++ dl_server_stop(&rq->fair_server); + + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + +-dequeue_throttle: +- util_est_update(&rq->cfs, p, task_sleep); ++ if (p && task_delayed) { ++ SCHED_WARN_ON(!task_sleep); ++ SCHED_WARN_ON(p->on_rq != 1); ++ ++ /* Fix-up what dequeue_task_fair() skipped */ ++ hrtick_update(rq); ++ ++ /* Fix-up what block_task() skipped. */ ++ __block_task(rq, p); ++ } ++ ++ return 1; ++} ++ ++/* ++ * The dequeue_task method is called before nr_running is ++ * decreased. 
We remove the task from the rbtree and ++ * update the fair scheduling stats: ++ */ ++static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ++{ ++ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) ++ util_est_dequeue(&rq->cfs, p); ++ ++ if (dequeue_entities(rq, &p->se, flags) < 0) { ++ util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); ++ return false; ++ } ++ ++ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + hrtick_update(rq); ++ return true; + } + + #ifdef CONFIG_SMP +@@ -7824,6 +8111,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) + return cpu_util(cpu, p, -1, 0); + } + ++/* ++ * This function computes an effective utilization for the given CPU, to be ++ * used for frequency selection given the linear relation: f = u * f_max. ++ * ++ * The scheduler tracks the following metrics: ++ * ++ * cpu_util_{cfs,rt,dl,irq}() ++ * cpu_bw_dl() ++ * ++ * Where the cfs,rt and dl util numbers are tracked with the same metric and ++ * synchronized windows and are thus directly comparable. ++ * ++ * The cfs,rt,dl utilization are the running times measured with rq->clock_task ++ * which excludes things like IRQ and steal-time. These latter are then accrued ++ * in the IRQ utilization. ++ * ++ * The DL bandwidth number OTOH is not a measured metric but a value computed ++ * based on the task model parameters and gives the minimal utilization ++ * required to meet deadlines. ++ */ ++unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long *min, ++ unsigned long *max) ++{ ++ unsigned long util, irq, scale; ++ struct rq *rq = cpu_rq(cpu); ++ ++ scale = arch_scale_cpu_capacity(cpu); ++ ++ /* ++ * Early check to see if IRQ/steal time saturates the CPU, can be ++ * because of inaccuracies in how we track these -- see ++ * update_irq_load_avg(). ++ */ ++ irq = cpu_util_irq(rq); ++ if (unlikely(irq >= scale)) { ++ if (min) ++ *min = scale; ++ if (max) ++ *max = scale; ++ return scale; ++ } ++ ++ if (min) { ++ /* ++ * The minimum utilization returns the highest level between: ++ * - the computed DL bandwidth needed with the IRQ pressure which ++ * steals time to the deadline task. ++ * - The minimum performance requirement for CFS and/or RT. ++ */ ++ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); ++ ++ /* ++ * When an RT task is runnable and uclamp is not used, we must ++ * ensure that the task will run at maximum compute capacity. ++ */ ++ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) ++ *min = max(*min, scale); ++ } ++ ++ /* ++ * Because the time spend on RT/DL tasks is visible as 'lost' time to ++ * CFS tasks and we use the same metric to track the effective ++ * utilization (PELT windows are synchronized) we can directly add them ++ * to obtain the CPU's actual utilization. ++ */ ++ util = util_cfs + cpu_util_rt(rq); ++ util += cpu_util_dl(rq); ++ ++ /* ++ * The maximum hint is a soft bandwidth requirement, which can be lower ++ * than the actual utilization because of uclamp_max requirements. ++ */ ++ if (max) ++ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); ++ ++ if (util >= scale) ++ return scale; ++ ++ /* ++ * There is still idle time; further improve the number by using the ++ * IRQ metric. 
Because IRQ/steal time is hidden from the task clock we ++ * need to scale the task numbers: ++ * ++ * max - irq ++ * U' = irq + --------- * U ++ * max ++ */ ++ util = scale_irq_capacity(util, irq, scale); ++ util += irq; ++ ++ return min(scale, util); ++} ++ ++unsigned long sched_cpu_util(int cpu) ++{ ++ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); ++} ++ + /* + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the +@@ -8308,7 +8694,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + + static void task_dead_fair(struct task_struct *p) + { +- remove_entity_load_avg(&p->se); ++ struct sched_entity *se = &p->se; ++ ++ if (se->sched_delayed) { ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(p, &rf); ++ if (se->sched_delayed) { ++ update_rq_clock(rq); ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ } ++ task_rq_unlock(rq, p, &rf); ++ } ++ ++ remove_entity_load_avg(se); + } + + /* +@@ -8344,7 +8744,7 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context + static int + balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- if (rq->nr_running) ++ if (sched_fair_runnable(rq)) + return 1; + + return sched_balance_newidle(rq, rf) != 0; +@@ -8430,7 +8830,17 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + /* +- * XXX pick_eevdf(cfs_rq) != se ? ++ * If @p has a shorter slice than current and @p is eligible, override ++ * current's slice protection in order to allow preemption. ++ * ++ * Note that even if @p does not turn out to be the most eligible ++ * task at this moment, current's slice protection will be lost. ++ */ ++ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) ++ se->vlag = se->deadline + 1; ++ ++ /* ++ * If @p has become the most eligible task, force preemption. 
+ */ + if (pick_eevdf(cfs_rq) == pse) + goto preempt; +@@ -8441,7 +8851,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + resched_curr(rq); + } + +-#ifdef CONFIG_SMP + static struct task_struct *pick_task_fair(struct rq *rq) + { + struct sched_entity *se; +@@ -8453,95 +8862,58 @@ static struct task_struct *pick_task_fair(struct rq *rq) + return NULL; + + do { +- struct sched_entity *curr = cfs_rq->curr; ++ /* Might not have done put_prev_entity() */ ++ if (cfs_rq->curr && cfs_rq->curr->on_rq) ++ update_curr(cfs_rq); + +- /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; ++ if (unlikely(check_cfs_rq_runtime(cfs_rq))) ++ goto again; + +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) +- goto again; +- } +- +- se = pick_next_entity(cfs_rq); ++ se = pick_next_entity(rq, cfs_rq); ++ if (!se) ++ goto again; + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); + } +-#endif ++ ++static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); ++static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); + + struct task_struct * + pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + struct task_struct *p; + int new_tasks; + + again: +- if (!sched_fair_runnable(rq)) ++ p = pick_task_fair(rq); ++ if (!p) + goto idle; ++ se = &p->se; + + #ifdef CONFIG_FAIR_GROUP_SCHED +- if (!prev || prev->sched_class != &fair_sched_class) ++ if (prev->sched_class != &fair_sched_class) + goto simple; + ++ __put_prev_set_next_dl_server(rq, prev, p); ++ + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. +- */ +- +- do { +- struct sched_entity *curr = cfs_rq->curr; +- +- /* +- * Since we got here without doing put_prev_entity() we also +- * have to consider cfs_rq->curr. If it is still a runnable +- * entity, update_curr() will update its vruntime, otherwise +- * forget we've ever seen it. +- */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; +- +- /* +- * This call to check_cfs_rq_runtime() will do the +- * throttle and dequeue its entity in the parent(s). +- * Therefore the nr_running test will indeed +- * be correct. +- */ +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) { +- cfs_rq = &rq->cfs; +- +- if (!cfs_rq->nr_running) +- goto idle; +- +- goto simple; +- } +- } +- +- se = pick_next_entity(cfs_rq); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); +- +- p = task_of(se); +- +- /* ++ * + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. 
+ */ + if (prev != p) { + struct sched_entity *pse = &prev->se; ++ struct cfs_rq *cfs_rq; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; +@@ -8559,38 +8931,15 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); +- } +- +- goto done; +-simple: +-#endif +- if (prev) +- put_prev_task(rq, prev); + +- do { +- se = pick_next_entity(cfs_rq); +- set_next_entity(cfs_rq, se); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); ++ __set_next_task_fair(rq, p, true); ++ } + +- p = task_of(se); ++ return p; + +-done: __maybe_unused; +-#ifdef CONFIG_SMP +- /* +- * Move the next running task to the front of +- * the list, so our cfs_tasks list becomes MRU +- * one. +- */ +- list_move(&p->se.group_node, &rq->cfs_tasks); ++simple: + #endif +- +- if (hrtick_enabled_fair(rq)) +- hrtick_start_fair(rq, p); +- +- update_misfit_status(p, rq); +- sched_fair_update_stop_tick(rq, p); +- ++ put_prev_set_next_task(rq, prev, p); + return p; + + idle: +@@ -8619,15 +8968,34 @@ done: __maybe_unused; + return NULL; + } + +-static struct task_struct *__pick_next_task_fair(struct rq *rq) ++static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) ++{ ++ return pick_next_task_fair(rq, prev, NULL); ++} ++ ++static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) ++{ ++ return !!dl_se->rq->cfs.nr_running; ++} ++ ++static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) + { +- return pick_next_task_fair(rq, NULL, NULL); ++ return pick_task_fair(dl_se->rq); ++} ++ ++void fair_server_init(struct rq *rq) ++{ ++ struct sched_dl_entity *dl_se = &rq->fair_server; ++ ++ init_dl_entity(dl_se); ++ ++ dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + } + + /* + * Account for a descheduled task: + */ +-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { + struct sched_entity *se = &prev->se; + struct cfs_rq *cfs_rq; +@@ -12721,22 +13089,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + */ + static void task_fork_fair(struct task_struct *p) + { +- struct sched_entity *se = &p->se, *curr; +- struct cfs_rq *cfs_rq; +- struct rq *rq = this_rq(); +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- update_rq_clock(rq); +- + set_task_max_allowed_capacity(p); +- +- cfs_rq = task_cfs_rq(current); +- curr = cfs_rq->curr; +- if (curr) +- update_curr(cfs_rq); +- place_entity(cfs_rq, se, ENQUEUE_INITIAL); +- rq_unlock(rq, &rf); + } + + /* +@@ -12848,10 +13201,28 @@ static void attach_task_cfs_rq(struct task_struct *p) + static void switched_from_fair(struct rq *rq, struct task_struct *p) + { + detach_task_cfs_rq(p); ++ /* ++ * Since this is called after changing class, this is a little weird ++ * and we cannot use DEQUEUE_DELAYED. ++ */ ++ if (p->se.sched_delayed) { ++ /* First, dequeue it from its new class' structures */ ++ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); ++ /* ++ * Now, clean up the fair_sched_class side of things ++ * related to sched_delayed being true and that wasn't done ++ * due to the generic dequeue not using DEQUEUE_DELAYED. 
++ */ ++ finish_delayed_dequeue_entity(&p->se); ++ p->se.rel_deadline = 0; ++ __block_task(rq, p); ++ } + } + + static void switched_to_fair(struct rq *rq, struct task_struct *p) + { ++ SCHED_WARN_ON(p->se.sched_delayed); ++ + attach_task_cfs_rq(p); + + set_task_max_allowed_capacity(p); +@@ -12869,12 +13240,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + } + } + +-/* Account for a task changing its policy or group. +- * +- * This routine is mostly called to set cfs_rq->curr field when a task +- * migrates between groups/classes. +- */ +-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) ++static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + { + struct sched_entity *se = &p->se; + +@@ -12887,6 +13253,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + list_move(&se->group_node, &rq->cfs_tasks); + } + #endif ++ if (!first) ++ return; ++ ++ SCHED_WARN_ON(se->sched_delayed); ++ ++ if (hrtick_enabled_fair(rq)) ++ hrtick_start_fair(rq, p); ++ ++ update_misfit_status(p, rq); ++ sched_fair_update_stop_tick(rq, p); ++} ++ ++/* ++ * Account for a task changing its policy or group. ++ * ++ * This routine is mostly called to set cfs_rq->curr field when a task ++ * migrates between groups/classes. ++ */ ++static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) ++{ ++ struct sched_entity *se = &p->se; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); +@@ -12895,12 +13282,14 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } ++ ++ __set_next_task_fair(rq, p, first); + } + + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; +- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); ++ cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif +@@ -13002,28 +13391,35 @@ void online_fair_sched_group(struct task_group *tg) + + void unregister_fair_sched_group(struct task_group *tg) + { +- unsigned long flags; +- struct rq *rq; + int cpu; + + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(cpu) { +- if (tg->se[cpu]) +- remove_entity_load_avg(tg->se[cpu]); ++ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; ++ struct sched_entity *se = tg->se[cpu]; ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (se) { ++ if (se->sched_delayed) { ++ guard(rq_lock_irqsave)(rq); ++ if (se->sched_delayed) { ++ update_rq_clock(rq); ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ } ++ list_del_leaf_cfs_rq(cfs_rq); ++ } ++ remove_entity_load_avg(se); ++ } + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ +- if (!tg->cfs_rq[cpu]->on_list) +- continue; +- +- rq = cpu_rq(cpu); +- +- raw_spin_rq_lock_irqsave(rq, flags); +- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); +- raw_spin_rq_unlock_irqrestore(rq, flags); ++ if (cfs_rq->on_list) { ++ guard(rq_lock_irqsave)(rq); ++ list_del_leaf_cfs_rq(cfs_rq); ++ } + } + } + +@@ -13213,13 +13609,13 @@ DEFINE_SCHED_CLASS(fair) = { + + .wakeup_preempt = check_preempt_wakeup_fair, + ++ .pick_task = pick_task_fair, + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, + + #ifdef CONFIG_SMP + .balance = balance_fair, +- .pick_task = pick_task_fair, + .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 143f55df890b..290874079f60 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -5,8 +5,24 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ + SCHED_FEAT(PLACE_LAG, true) ++/* ++ * Give new tasks half a slice to ease into the competition. ++ */ + SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++/* ++ * Preserve relative virtual deadline on 'migration'. ++ */ ++SCHED_FEAT(PLACE_REL_DEADLINE, true) ++/* ++ * Inhibit (wakeup) preemption until the current task has either matched the ++ * 0-lag point or until is has exhausted it's slice. ++ */ + SCHED_FEAT(RUN_TO_PARITY, true) ++/* ++ * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for ++ * current. ++ */ ++SCHED_FEAT(PREEMPT_SHORT, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +@@ -21,6 +37,18 @@ SCHED_FEAT(NEXT_BUDDY, false) + */ + SCHED_FEAT(CACHE_HOT_BUDDY, true) + ++/* ++ * Delay dequeueing tasks until they get selected or woken. ++ * ++ * By delaying the dequeue for non-eligible tasks, they remain in the ++ * competition and can burn off their negative lag. When they get selected ++ * they'll have positive lag by definition. ++ * ++ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. 
++ */ ++SCHED_FEAT(DELAY_DEQUEUE, true) ++SCHED_FEAT(DELAY_ZERO, true) ++ + /* + * Allow wakeup-time preemption of the current task: + */ +@@ -85,5 +113,3 @@ SCHED_FEAT(WA_BIAS, true) + SCHED_FEAT(UTIL_EST, true) + + SCHED_FEAT(LATENCY_WARN, false) +- +-SCHED_FEAT(HZ_BW, true) +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 6e78d071beb5..7a105a0123aa 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -450,43 +450,35 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) + resched_curr(rq); + } + +-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { ++ dl_server_update_idle_time(rq, prev); + } + + static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) + { + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); ++ next->se.exec_start = rq_clock_task(rq); + } + +-#ifdef CONFIG_SMP +-static struct task_struct *pick_task_idle(struct rq *rq) ++struct task_struct *pick_task_idle(struct rq *rq) + { + return rq->idle; + } +-#endif +- +-struct task_struct *pick_next_task_idle(struct rq *rq) +-{ +- struct task_struct *next = rq->idle; +- +- set_next_task_idle(rq, next, true); +- +- return next; +-} + + /* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +-static void ++static bool + dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) + { + raw_spin_rq_unlock_irq(rq); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_rq_lock_irq(rq); ++ return true; + } + + /* +@@ -528,13 +520,12 @@ DEFINE_SCHED_CLASS(idle) = { + + .wakeup_preempt = wakeup_preempt_idle, + +- .pick_next_task = pick_next_task_idle, ++ .pick_task = pick_task_idle, + .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, + + #ifdef CONFIG_SMP + .balance = balance_idle, +- .pick_task = pick_task_idle, + .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, + #endif +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index 310523c1b9e3..172c588de542 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE; + /* More than 4 hours if BW_SHIFT equals 20. */ + static const u64 max_rt_runtime = MAX_BW; + +-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); +- +-struct rt_bandwidth def_rt_bandwidth; +- + /* + * period over which we measure -rt task CPU usage in us. 
+ * default: 1s +@@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void) + late_initcall(sched_rt_sysctl_init); + #endif + ++void init_rt_rq(struct rt_rq *rt_rq) ++{ ++ struct rt_prio_array *array; ++ int i; ++ ++ array = &rt_rq->active; ++ for (i = 0; i < MAX_RT_PRIO; i++) { ++ INIT_LIST_HEAD(array->queue + i); ++ __clear_bit(i, array->bitmap); ++ } ++ /* delimiter for bitsearch: */ ++ __set_bit(MAX_RT_PRIO, array->bitmap); ++ ++#if defined CONFIG_SMP ++ rt_rq->highest_prio.curr = MAX_RT_PRIO-1; ++ rt_rq->highest_prio.next = MAX_RT_PRIO-1; ++ rt_rq->overloaded = 0; ++ plist_head_init(&rt_rq->pushable_tasks); ++#endif /* CONFIG_SMP */ ++ /* We start is dequeued state, because no RT tasks are queued */ ++ rt_rq->rt_queued = 0; ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ rt_rq->rt_time = 0; ++ rt_rq->rt_throttled = 0; ++ rt_rq->rt_runtime = 0; ++ raw_spin_lock_init(&rt_rq->rt_runtime_lock); ++#endif ++} ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ ++static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); ++ + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) + { + struct rt_bandwidth *rt_b = +@@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) + do_start_rt_bandwidth(rt_b); + } + +-void init_rt_rq(struct rt_rq *rt_rq) +-{ +- struct rt_prio_array *array; +- int i; +- +- array = &rt_rq->active; +- for (i = 0; i < MAX_RT_PRIO; i++) { +- INIT_LIST_HEAD(array->queue + i); +- __clear_bit(i, array->bitmap); +- } +- /* delimiter for bit-search: */ +- __set_bit(MAX_RT_PRIO, array->bitmap); +- +-#if defined CONFIG_SMP +- rt_rq->highest_prio.curr = MAX_RT_PRIO-1; +- rt_rq->highest_prio.next = MAX_RT_PRIO-1; +- rt_rq->overloaded = 0; +- plist_head_init(&rt_rq->pushable_tasks); +-#endif /* CONFIG_SMP */ +- /* We start is dequeued state, because no RT tasks are queued */ +- rt_rq->rt_queued = 0; +- +- rt_rq->rt_time = 0; +- rt_rq->rt_throttled = 0; +- rt_rq->rt_runtime = 0; +- raw_spin_lock_init(&rt_rq->rt_runtime_lock); +-} +- +-#ifdef CONFIG_RT_GROUP_SCHED + static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) + { + hrtimer_cancel(&rt_b->rt_period_timer); +@@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg) + { + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); +- + } + + void free_rt_sched_group(struct task_group *tg) +@@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) + if (!tg->rt_se) + goto err; + +- init_rt_bandwidth(&tg->rt_bandwidth, +- ktime_to_ns(def_rt_bandwidth.rt_period), 0); ++ init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), +@@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) + return &rt_rq->tg->rt_bandwidth; + } + +-#else /* !CONFIG_RT_GROUP_SCHED */ +- +-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +-{ +- return rt_rq->rt_runtime; +-} +- +-static inline u64 sched_rt_period(struct rt_rq *rt_rq) +-{ +- return ktime_to_ns(def_rt_bandwidth.rt_period); +-} +- +-typedef struct rt_rq *rt_rq_iter_t; +- +-#define for_each_rt_rq(rt_rq, iter, rq) \ +- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) +- +-#define for_each_sched_rt_entity(rt_se) \ +- for (; rt_se; rt_se = NULL) +- +-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +-{ +- return NULL; +-} +- +-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +-{ +- struct rq *rq = rq_of_rt_rq(rt_rq); +- +- if 
(!rt_rq->rt_nr_running) +- return; +- +- enqueue_top_rt_rq(rt_rq); +- resched_curr(rq); +-} +- +-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +-{ +- dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +-} +- +-static inline int rt_rq_throttled(struct rt_rq *rt_rq) +-{ +- return rt_rq->rt_throttled; +-} +- +-static inline const struct cpumask *sched_rt_period_mask(void) +-{ +- return cpu_online_mask; +-} +- +-static inline +-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +-{ +- return &cpu_rq(cpu)->rt; +-} +- +-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +-{ +- return &def_rt_bandwidth; +-} +- +-#endif /* CONFIG_RT_GROUP_SCHED */ +- + bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) + { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); +@@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + const struct cpumask *span; + + span = sched_rt_period_mask(); +-#ifdef CONFIG_RT_GROUP_SCHED ++ + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest +@@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +-#endif ++ + for_each_cpu(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); +@@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + return idle; + } + +-static inline int rt_se_prio(struct sched_rt_entity *rt_se) +-{ +-#ifdef CONFIG_RT_GROUP_SCHED +- struct rt_rq *rt_rq = group_rt_rq(rt_se); +- +- if (rt_rq) +- return rt_rq->highest_prio.curr; +-#endif +- +- return rt_task_of(rt_se)->prio; +-} +- + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + { + u64 runtime = sched_rt_runtime(rt_rq); +@@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + return 0; + } + ++#else /* !CONFIG_RT_GROUP_SCHED */ ++ ++typedef struct rt_rq *rt_rq_iter_t; ++ ++#define for_each_rt_rq(rt_rq, iter, rq) \ ++ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) ++ ++#define for_each_sched_rt_entity(rt_se) \ ++ for (; rt_se; rt_se = NULL) ++ ++static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) ++{ ++ return NULL; ++} ++ ++static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) ++{ ++ struct rq *rq = rq_of_rt_rq(rt_rq); ++ ++ if (!rt_rq->rt_nr_running) ++ return; ++ ++ enqueue_top_rt_rq(rt_rq); ++ resched_curr(rq); ++} ++ ++static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) ++{ ++ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); ++} ++ ++static inline int rt_rq_throttled(struct rt_rq *rt_rq) ++{ ++ return false; ++} ++ ++static inline const struct cpumask *sched_rt_period_mask(void) ++{ ++ return cpu_online_mask; ++} ++ ++static inline ++struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) ++{ ++ return &cpu_rq(cpu)->rt; ++} ++ ++#ifdef CONFIG_SMP ++static void __enable_runtime(struct rq *rq) { } ++static void __disable_runtime(struct rq *rq) { } ++#endif ++ ++#endif /* CONFIG_RT_GROUP_SCHED */ ++ ++static inline int rt_se_prio(struct sched_rt_entity *rt_se) ++{ ++#ifdef CONFIG_RT_GROUP_SCHED ++ struct rt_rq *rt_rq = group_rt_rq(rt_se); ++ ++ if (rt_rq) ++ return rt_rq->highest_prio.curr; ++#endif ++ ++ return rt_task_of(rt_se)->prio; ++} ++ + /* + * Update the current task's runtime statistics. 
Skip current tasks that + * are not in our scheduling class. +@@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + static void update_curr_rt(struct rq *rq) + { + struct task_struct *curr = rq->curr; +- struct sched_rt_entity *rt_se = &curr->rt; + s64 delta_exec; + + if (curr->sched_class != &rt_sched_class) +@@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq) + if (unlikely(delta_exec <= 0)) + return; + ++#ifdef CONFIG_RT_GROUP_SCHED ++ struct sched_rt_entity *rt_se = &curr->rt; ++ + if (!rt_bandwidth_enabled()) + return; + +@@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); + } + } ++#endif + } + + static void +@@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) + static void + inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) + { +- start_rt_bandwidth(&def_rt_bandwidth); + } + + static inline +@@ -1492,7 +1483,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) + enqueue_pushable_task(rq, p); + } + +-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) ++static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) + { + struct sched_rt_entity *rt_se = &p->rt; + +@@ -1500,6 +1491,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) + dequeue_rt_entity(rt_se, flags); + + dequeue_pushable_task(rq, p); ++ ++ return true; + } + + /* +@@ -1755,17 +1748,7 @@ static struct task_struct *pick_task_rt(struct rq *rq) + return p; + } + +-static struct task_struct *pick_next_task_rt(struct rq *rq) +-{ +- struct task_struct *p = pick_task_rt(rq); +- +- if (p) +- set_next_task_rt(rq, p, true); +- +- return p; +-} +- +-static void put_prev_task_rt(struct rq *rq, struct task_struct *p) ++static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) + { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; +@@ -2652,13 +2635,12 @@ DEFINE_SCHED_CLASS(rt) = { + + .wakeup_preempt = wakeup_preempt_rt, + +- .pick_next_task = pick_next_task_rt, ++ .pick_task = pick_task_rt, + .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, + + #ifdef CONFIG_SMP + .balance = balance_rt, +- .pick_task = pick_task_rt, + .select_task_rq = select_task_rq_rt, + .set_cpus_allowed = set_cpus_allowed_common, + .rq_online = rq_online_rt, +@@ -2912,19 +2894,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) + #ifdef CONFIG_SYSCTL + static int sched_rt_global_constraints(void) + { +- unsigned long flags; +- int i; +- +- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +- for_each_possible_cpu(i) { +- struct rt_rq *rt_rq = &cpu_rq(i)->rt; +- +- raw_spin_lock(&rt_rq->rt_runtime_lock); +- rt_rq->rt_runtime = global_rt_runtime(); +- raw_spin_unlock(&rt_rq->rt_runtime_lock); +- } +- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); +- + return 0; + } + #endif /* CONFIG_SYSCTL */ +@@ -2944,12 +2913,6 @@ static int sched_rt_global_validate(void) + + static void sched_rt_do_global(void) + { +- unsigned long flags; +- +- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +- def_rt_bandwidth.rt_runtime = global_rt_runtime(); +- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + } + + static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, +diff --git 
a/kernel/sched/sched.h b/kernel/sched/sched.h +index 432b43aa091c..10b72dcb57e4 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -335,7 +336,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr); + extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); + extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); + extern int dl_bw_check_overflow(int cpu); +- ++extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); + /* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: +@@ -361,7 +362,14 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); + extern void dl_server_stop(struct sched_dl_entity *dl_se); + extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, +- dl_server_pick_f pick); ++ dl_server_pick_f pick_task); ++ ++extern void dl_server_update_idle_time(struct rq *rq, ++ struct task_struct *p); ++extern void fair_server_init(struct rq *rq); ++extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); ++extern int dl_server_apply_params(struct sched_dl_entity *dl_se, ++ u64 runtime, u64 period, bool init); + + #ifdef CONFIG_CGROUP_SCHED + +@@ -599,17 +607,12 @@ struct cfs_rq { + s64 avg_vruntime; + u64 avg_load; + +- u64 exec_clock; + u64 min_vruntime; + #ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; + #endif + +-#ifndef CONFIG_64BIT +- u64 min_vruntime_copy; +-#endif +- + struct rb_root_cached tasks_timeline; + + /* +@@ -619,10 +622,6 @@ struct cfs_rq { + struct sched_entity *curr; + struct sched_entity *next; + +-#ifdef CONFIG_SCHED_DEBUG +- unsigned int nr_spread_over; +-#endif +- + #ifdef CONFIG_SMP + /* + * CFS load tracking +@@ -726,13 +725,13 @@ struct rt_rq { + #endif /* CONFIG_SMP */ + int rt_queued; + ++#ifdef CONFIG_RT_GROUP_SCHED + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +-#ifdef CONFIG_RT_GROUP_SCHED + unsigned int rt_nr_boosted; + + struct rq *rq; +@@ -820,6 +819,9 @@ static inline void se_update_runnable(struct sched_entity *se) + + static inline long se_runnable(struct sched_entity *se) + { ++ if (se->sched_delayed) ++ return false; ++ + if (entity_is_task(se)) + return !!se->on_rq; + else +@@ -834,6 +836,9 @@ static inline void se_update_runnable(struct sched_entity *se) { } + + static inline long se_runnable(struct sched_entity *se) + { ++ if (se->sched_delayed) ++ return false; ++ + return !!se->on_rq; + } + +@@ -1044,6 +1049,8 @@ struct rq { + struct rt_rq rt; + struct dl_rq dl; + ++ struct sched_dl_entity fair_server; ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; +@@ -1059,6 +1066,7 @@ struct rq { + unsigned int nr_uninterruptible; + + struct task_struct __rcu *curr; ++ struct sched_dl_entity *dl_server; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; +@@ -1158,7 +1166,6 @@ struct rq { + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; +- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; +@@ -1187,6 +1194,7 @@ struct rq { + /* per rq */ + struct rq *core; + struct task_struct *core_pick; ++ struct sched_dl_entity *core_dl_server; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; +@@ -2247,11 +2255,13 @@ extern const u32 sched_prio_to_wmult[40]; + * + */ + +-#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ + #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ + #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ + #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ ++#define DEQUEUE_SPECIAL 0x10 + #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ ++#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ + + #define ENQUEUE_WAKEUP 0x01 + #define ENQUEUE_RESTORE 0x02 +@@ -2267,6 +2277,7 @@ extern const u32 sched_prio_to_wmult[40]; + #endif + #define ENQUEUE_INITIAL 0x80 + #define ENQUEUE_MIGRATING 0x100 ++#define ENQUEUE_DELAYED 0x200 + + #define RETRY_TASK ((void *)-1UL) + +@@ -2285,23 +2296,31 @@ struct sched_class { + #endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); +- void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); ++ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + +- struct task_struct *(*pick_next_task)(struct rq *rq); ++ struct task_struct *(*pick_task)(struct rq *rq); ++ /* ++ * Optional! When implemented pick_next_task() should be equivalent to: ++ * ++ * next = pick_task(); ++ * if (next) { ++ * put_prev_task(prev); ++ * set_next_task_first(next); ++ * } ++ */ ++ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + +- void (*put_prev_task)(struct rq *rq, struct task_struct *p); ++ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + + #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + +- struct task_struct * (*pick_task)(struct rq *rq); +- + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); +@@ -2345,7 +2364,7 @@ struct sched_class { + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) + { + WARN_ON_ONCE(rq->curr != prev); +- prev->sched_class->put_prev_task(rq, prev); ++ prev->sched_class->put_prev_task(rq, prev, NULL); + } + + static inline void set_next_task(struct rq *rq, struct task_struct *next) +@@ -2353,6 +2372,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) + next->sched_class->set_next_task(rq, next, false); + } + ++static inline void ++__put_prev_set_next_dl_server(struct rq *rq, ++ struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prev->dl_server = NULL; ++ next->dl_server = rq->dl_server; ++ rq->dl_server = NULL; ++} ++ ++static inline void put_prev_set_next_task(struct rq *rq, ++ struct task_struct *prev, ++ struct task_struct *next) ++{ ++ WARN_ON_ONCE(rq->curr != prev); ++ ++ __put_prev_set_next_dl_server(rq, prev, next); ++ ++ if (next == prev) ++ return; ++ ++ prev->sched_class->put_prev_task(rq, prev, next); ++ 
next->sched_class->set_next_task(rq, next, true); ++} + + /* + * Helper to define a sched_class instance; each one is placed in a separate +@@ -2408,7 +2451,7 @@ static inline bool sched_fair_runnable(struct rq *rq) + } + + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +-extern struct task_struct *pick_next_task_idle(struct rq *rq); ++extern struct task_struct *pick_task_idle(struct rq *rq); + + #define SCA_CHECK 0x01 + #define SCA_MIGRATE_DISABLE 0x02 +@@ -2515,7 +2558,6 @@ extern void reweight_task(struct task_struct *p, const struct load_weight *lw); + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); + +-extern struct rt_bandwidth def_rt_bandwidth; + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + +@@ -2586,6 +2628,19 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) + sched_update_tick_dependency(rq); + } + ++static inline void __block_task(struct rq *rq, struct task_struct *p) ++{ ++ WRITE_ONCE(p->on_rq, 0); ++ ASSERT_EXCLUSIVE_WRITER(p->on_rq); ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ if (p->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++} ++ + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); + extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +@@ -3607,7 +3662,7 @@ extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *c + extern void __setscheduler_prio(struct task_struct *p, int prio); + extern void set_load_weight(struct task_struct *p, bool update_load); + extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +-extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); ++extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); + + extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, +diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c +index b1b8fe61c532..058dd42e3d9b 100644 +--- a/kernel/sched/stop_task.c ++++ b/kernel/sched/stop_task.c +@@ -41,26 +41,17 @@ static struct task_struct *pick_task_stop(struct rq *rq) + return rq->stop; + } + +-static struct task_struct *pick_next_task_stop(struct rq *rq) +-{ +- struct task_struct *p = pick_task_stop(rq); +- +- if (p) +- set_next_task_stop(rq, p, true); +- +- return p; +-} +- + static void + enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) + { + add_nr_running(rq, 1); + } + +-static void ++static bool + dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) + { + sub_nr_running(rq, 1); ++ return true; + } + + static void yield_task_stop(struct rq *rq) +@@ -68,7 +59,7 @@ static void yield_task_stop(struct rq *rq) + BUG(); /* the stop task should never yield, its pointless. 
*/ + } + +-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { + update_curr_common(rq); + } +@@ -111,13 +102,12 @@ DEFINE_SCHED_CLASS(stop) = { + + .wakeup_preempt = wakeup_preempt_stop, + +- .pick_next_task = pick_next_task_stop, ++ .pick_task = pick_task_stop, + .put_prev_task = put_prev_task_stop, + .set_next_task = set_next_task_stop, + + #ifdef CONFIG_SMP + .balance = balance_stop, +- .pick_task = pick_task_stop, + .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, + #endif +diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c +index ae1b42775ef9..c62acf509b74 100644 +--- a/kernel/sched/syscalls.c ++++ b/kernel/sched/syscalls.c +@@ -57,7 +57,7 @@ static int effective_prio(struct task_struct *p) + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ +- if (!rt_prio(p->prio)) ++ if (!rt_or_dl_prio(p->prio)) + return p->normal_prio; + return p->prio; + } +@@ -258,107 +258,6 @@ int sched_core_idle_cpu(int cpu) + + #endif + +-#ifdef CONFIG_SMP +-/* +- * This function computes an effective utilization for the given CPU, to be +- * used for frequency selection given the linear relation: f = u * f_max. +- * +- * The scheduler tracks the following metrics: +- * +- * cpu_util_{cfs,rt,dl,irq}() +- * cpu_bw_dl() +- * +- * Where the cfs,rt and dl util numbers are tracked with the same metric and +- * synchronized windows and are thus directly comparable. +- * +- * The cfs,rt,dl utilization are the running times measured with rq->clock_task +- * which excludes things like IRQ and steal-time. These latter are then accrued +- * in the IRQ utilization. +- * +- * The DL bandwidth number OTOH is not a measured metric but a value computed +- * based on the task model parameters and gives the minimal utilization +- * required to meet deadlines. +- */ +-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, +- unsigned long *min, +- unsigned long *max) +-{ +- unsigned long util, irq, scale; +- struct rq *rq = cpu_rq(cpu); +- +- scale = arch_scale_cpu_capacity(cpu); +- +- /* +- * Early check to see if IRQ/steal time saturates the CPU, can be +- * because of inaccuracies in how we track these -- see +- * update_irq_load_avg(). +- */ +- irq = cpu_util_irq(rq); +- if (unlikely(irq >= scale)) { +- if (min) +- *min = scale; +- if (max) +- *max = scale; +- return scale; +- } +- +- if (min) { +- /* +- * The minimum utilization returns the highest level between: +- * - the computed DL bandwidth needed with the IRQ pressure which +- * steals time to the deadline task. +- * - The minimum performance requirement for CFS and/or RT. +- */ +- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); +- +- /* +- * When an RT task is runnable and uclamp is not used, we must +- * ensure that the task will run at maximum compute capacity. +- */ +- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) +- *min = max(*min, scale); +- } +- +- /* +- * Because the time spend on RT/DL tasks is visible as 'lost' time to +- * CFS tasks and we use the same metric to track the effective +- * utilization (PELT windows are synchronized) we can directly add them +- * to obtain the CPU's actual utilization. 
+- */ +- util = util_cfs + cpu_util_rt(rq); +- util += cpu_util_dl(rq); +- +- /* +- * The maximum hint is a soft bandwidth requirement, which can be lower +- * than the actual utilization because of uclamp_max requirements. +- */ +- if (max) +- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); +- +- if (util >= scale) +- return scale; +- +- /* +- * There is still idle time; further improve the number by using the +- * IRQ metric. Because IRQ/steal time is hidden from the task clock we +- * need to scale the task numbers: +- * +- * max - irq +- * U' = irq + --------- * U +- * max +- */ +- util = scale_irq_capacity(util, irq, scale); +- util += irq; +- +- return min(scale, util); +-} +- +-unsigned long sched_cpu_util(int cpu) +-{ +- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); +-} +-#endif /* CONFIG_SMP */ +- + /** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. +@@ -401,10 +300,20 @@ static void __setscheduler_params(struct task_struct *p, + + p->policy = policy; + +- if (dl_policy(policy)) ++ if (dl_policy(policy)) { + __setparam_dl(p, attr); +- else if (fair_policy(policy)) ++ } else if (fair_policy(policy)) { + p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ if (attr->sched_runtime) { ++ p->se.custom_slice = 1; ++ p->se.slice = clamp_t(u64, attr->sched_runtime, ++ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ ++ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ ++ } else { ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; ++ } ++ } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when +@@ -700,7 +609,9 @@ int __sched_setscheduler(struct task_struct *p, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { +- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) ++ if (fair_policy(policy) && ++ (attr->sched_nice != task_nice(p) || ++ (attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; +@@ -846,6 +757,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + ++ if (p->se.custom_slice) ++ attr.sched_runtime = p->se.slice; ++ + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +@@ -1012,12 +926,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + + static void get_params(struct task_struct *p, struct sched_attr *attr) + { +- if (task_has_dl_policy(p)) ++ if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); +- else if (task_has_rt_policy(p)) ++ } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; +- else ++ } else { + attr->sched_nice = task_nice(p); ++ attr->sched_runtime = p->se.slice; ++ } + } + + /** +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 76504b776d03..9748a4c8d668 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++ /* ++ * Because the rq is not a task, dl_add_task_root_domain() did not ++ * move the fair server bw to the rd if it already started. ++ * Add it now. 
++ */ ++ if (rq->fair_server.dl_server) ++ __dl_server_attach_root(&rq->fair_server, rq); ++ + rq_unlock_irqrestore(rq, &rf); + + if (old_rd) +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index b8ee320208d4..f4be3abbb47b 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1975,7 +1975,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + * expiry. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { +- if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) ++ if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) + mode |= HRTIMER_MODE_HARD; + } + +@@ -2075,7 +2075,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + u64 slack; + + slack = current->timer_slack_ns; +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); +@@ -2280,7 +2280,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + * Override any slack passed by the user if under + * rt contraints. + */ +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + delta = 0; + + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); +diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c +index 130ca7e7787e..ae2ace5e515a 100644 +--- a/kernel/trace/trace_sched_wakeup.c ++++ b/kernel/trace/trace_sched_wakeup.c +@@ -547,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p) + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || +- (wakeup_rt && !dl_task(p) && !rt_task(p)) || ++ (wakeup_rt && !rt_or_dl_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) + return; + +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 3bd08b60a9b3..9bd709077621 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -426,7 +426,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; + + tsk = current; +- if (rt_task(tsk)) { ++ if (rt_or_dl_task(tsk)) { + bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; + thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; + } +@@ -485,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) + else + dirty = vm_dirty_ratio * node_memory / 100; + +- if (rt_task(tsk)) ++ if (rt_or_dl_task(tsk)) + dirty += dirty / 4; + + /* +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index f8b4dae35fc3..da29ddf87cd8 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -4008,7 +4008,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) + */ + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; +- } else if (unlikely(rt_task(current)) && in_task()) ++ } else if (unlikely(rt_or_dl_task(current)) && in_task()) + alloc_flags |= ALLOC_MIN_RESERVE; + + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); +-- +2.47.0.rc0 + diff --git a/sys-kernel/git-sources/0003-bbr3.patch b/sys-kernel/gentoo-sources-6.11/0002-bbr3.patch similarity index 98% rename from sys-kernel/git-sources/0003-bbr3.patch rename to sys-kernel/gentoo-sources-6.11/0002-bbr3.patch index f59737d..b106e5e 100644 --- a/sys-kernel/git-sources/0003-bbr3.patch +++ b/sys-kernel/gentoo-sources-6.11/0002-bbr3.patch @@ -1,6 +1,6 @@ -From 76485d8c7c1cc6ab2f9d755ef5bf09ca98a9f81a Mon Sep 17 00:00:00 2001 +From 694e2eec893e51c71b3faa821f561b8c387b3bb7 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 17 Jun 2024 15:16:10 +0200 +Date: Fri, 4 
Oct 2024 17:06:44 +0200 Subject: [PATCH 02/10] bbr3 Signed-off-by: Peter Jung @@ -39,7 +39,7 @@ index 6a5e08b937b3..27aab715490e 100644 fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index 7d6b1254c92d..2ce55f444434 100644 +index c0deaafebfdc..d53f042d936e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -137,8 +137,8 @@ struct inet_connection_sock { @@ -54,7 +54,7 @@ index 7d6b1254c92d..2ce55f444434 100644 #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index 060e95b331a2..953244eefe7d 100644 +index 196c148fce8a..f37256b8abfd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) @@ -66,7 +66,7 @@ index 060e95b331a2..953244eefe7d 100644 enum tcp_tw_status { TCP_TW_SUCCESS = 0, -@@ -778,6 +780,15 @@ static inline void tcp_fast_path_check(struct sock *sk) +@@ -779,6 +781,15 @@ static inline void tcp_fast_path_check(struct sock *sk) u32 tcp_delack_max(const struct sock *sk); @@ -82,7 +82,7 @@ index 060e95b331a2..953244eefe7d 100644 /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(const struct sock *sk) { -@@ -883,6 +894,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +@@ -884,6 +895,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } @@ -94,7 +94,7 @@ index 060e95b331a2..953244eefe7d 100644 /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { -@@ -972,9 +988,14 @@ struct tcp_skb_cb { +@@ -973,9 +989,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -111,7 +111,7 @@ index 060e95b331a2..953244eefe7d 100644 } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; -@@ -1078,6 +1099,7 @@ enum tcp_ca_event { +@@ -1087,6 +1108,7 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ @@ -119,7 +119,7 @@ index 060e95b331a2..953244eefe7d 100644 }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ -@@ -1100,7 +1122,11 @@ enum tcp_ca_ack_event_flags { +@@ -1109,7 +1131,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 @@ -132,7 +132,7 @@ index 060e95b331a2..953244eefe7d 100644 union tcp_cc_info; -@@ -1120,10 +1146,13 @@ struct ack_sample { +@@ -1129,10 +1155,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ @@ -147,7 +147,7 @@ index 060e95b331a2..953244eefe7d 100644 long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ -@@ -1134,7 +1163,9 @@ struct rate_sample { +@@ -1143,7 +1172,9 @@ struct rate_sample { u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? 
*/ @@ -157,7 +157,7 @@ index 060e95b331a2..953244eefe7d 100644 }; struct tcp_congestion_ops { -@@ -1158,8 +1189,11 @@ struct tcp_congestion_ops { +@@ -1167,8 +1198,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); @@ -171,7 +171,7 @@ index 060e95b331a2..953244eefe7d 100644 /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) -@@ -1225,6 +1259,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +@@ -1234,6 +1268,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif @@ -186,7 +186,7 @@ index 060e95b331a2..953244eefe7d 100644 static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1244,6 +1286,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) +@@ -1253,6 +1295,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ @@ -194,7 +194,7 @@ index 060e95b331a2..953244eefe7d 100644 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); -@@ -1256,6 +1299,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +@@ -1265,6 +1308,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } @@ -216,7 +216,7 @@ index 060e95b331a2..953244eefe7d 100644 /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. 
-@@ -2418,7 +2476,7 @@ struct tcp_plb_state { +@@ -2416,7 +2474,7 @@ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ @@ -324,7 +324,7 @@ index 8e94ed7c56a0..50dc9970cad2 100644 choice prompt "Default TCP congestion control" diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index 18227757ec0c..f180befc28bd 100644 +index 3f88d0961e5b..4273cac333f6 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp @@ -355,10 +355,10 @@ index 18227757ec0c..f180befc28bd 100644 .undo_cwnd = bpf_tcp_ca_undo_cwnd, .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index e6790ea74877..b63e27eba536 100644 +index 831a18dc7aa6..d9faa8fef55e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -3120,6 +3120,7 @@ int tcp_disconnect(struct sock *sk, int flags) +@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; @@ -366,7 +366,7 @@ index e6790ea74877..b63e27eba536 100644 /* Clean up fastopen related fields */ -@@ -3846,6 +3847,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) +@@ -3849,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; @@ -3020,7 +3020,7 @@ index 760941e55153..a180fa648d5e 100644 MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 28ffcfbeef14..7b13915ba288 100644 +index 0306d257fa64..28f581c0dab7 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) @@ -3032,10 +3032,10 @@ index 28ffcfbeef14..7b13915ba288 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 9c04a9c8be9d..2c89efbc8ddf 100644 +index e37488d3453f..62eef7d067c2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c -@@ -365,7 +365,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: @@ -3044,7 +3044,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { -@@ -376,7 +376,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: @@ -3053,7 +3053,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; -@@ -1115,7 +1115,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) +@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { @@ -3066,7 +3066,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) -@@ -1496,6 +1501,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, +@@ 
-1501,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); @@ -3084,7 +3084,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep -@@ -3764,7 +3780,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +@@ -3799,7 +3815,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) /* This routine deals with acks during a TLP episode and ends an episode by * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ @@ -3094,7 +3094,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -3781,6 +3798,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +@@ -3816,6 +3833,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) /* ACK advances: there was a loss, so reduce cwnd. Reset * tlp_high_seq in tcp_init_cwnd_reduction() */ @@ -3102,7 +3102,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); -@@ -3791,6 +3809,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +@@ -3826,6 +3844,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; @@ -3114,7 +3114,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 } } -@@ -3899,6 +3922,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3934,6 +3957,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); @@ -3122,7 +3122,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 /* ts_recent update must be made after we are sure that the packet * is in window. 
-@@ -3973,7 +3997,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -4008,7 +4032,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) @@ -3131,7 +3131,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 if (tcp_ack_is_dubious(sk, flag)) { if (!(flag & (FLAG_SND_UNA_ADVANCED | -@@ -3997,6 +4021,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -4032,6 +4056,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); @@ -3139,7 +3139,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); -@@ -4016,7 +4041,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -4051,7 +4076,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_ack_probe(sk); if (tp->tlp_high_seq) @@ -3148,7 +3148,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 return 1; old_ack: -@@ -5671,13 +5696,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -5718,13 +5743,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -3166,10 +3166,10 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index 538c06f95918..e4c861c071ae 100644 +index a19a9dbd3409..e0ef8406a326 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c -@@ -460,6 +460,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) +@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; @@ -3179,7 +3179,7 @@ index 538c06f95918..e4c861c071ae 100644 const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 95618d0e78e4..3f4bdd2b6476 100644 +index 16c48df8df4c..6c3a1895238e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) @@ -3274,7 +3274,7 @@ index 95618d0e78e4..3f4bdd2b6476 100644 } @@ -2767,6 +2793,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); + tcp_set_tx_in_flight(sk, skb); @@ -3370,10 +3370,10 @@ index a8f6d9d06f2e..8737f2134648 100644 rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 5bfd76a31af6..0c63590c5fce 100644 +index 4d40615dc8fc..f27941201ef2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c -@@ -684,6 +684,7 @@ void tcp_write_timer_handler(struct sock *sk) +@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock *sk) return; } @@ -3382,6 +3382,5 @@ index 5bfd76a31af6..0c63590c5fce 100644 event = icsk->icsk_pending; -- -2.45.2 - +2.47.0.rc0 diff --git 
a/sys-kernel/git-sources/0009-ntsync.patch b/sys-kernel/gentoo-sources-6.11/0007-ntsync.patch similarity index 99% rename from sys-kernel/git-sources/0009-ntsync.patch rename to sys-kernel/gentoo-sources-6.11/0007-ntsync.patch index 436dbf5..9092ec2 100644 --- a/sys-kernel/git-sources/0009-ntsync.patch +++ b/sys-kernel/gentoo-sources-6.11/0007-ntsync.patch @@ -1,7 +1,7 @@ -From de83b2d5a68b825a0741a17cac95dd3690a51162 Mon Sep 17 00:00:00 2001 +From 2087698c3f9af692a9e088307a8f25da094bc7a2 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 17 Jun 2024 15:29:00 +0200 -Subject: [PATCH 09/10] ntsync +Date: Fri, 4 Oct 2024 17:08:21 +0200 +Subject: [PATCH 07/10] ntsync Signed-off-by: Peter Jung --- @@ -24,7 +24,7 @@ Signed-off-by: Peter Jung create mode 100644 tools/testing/selftests/drivers/ntsync/ntsync.c diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst -index 5926115ec0ed..1e78586662fb 100644 +index 274cc7546efc..9c1b15cd89ab 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -63,6 +63,7 @@ Everything else @@ -440,10 +440,10 @@ index 000000000000..767844637a7d + ``objs`` and in ``alert``. If this is attempted, the function fails + with ``EINVAL``. diff --git a/MAINTAINERS b/MAINTAINERS -index cf9c9221c388..cf155b1f9480 100644 +index cc40a9d9b8cd..2cd7168dc401 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -15997,6 +15997,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git +@@ -16319,6 +16319,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git F: Documentation/filesystems/ntfs3.rst F: fs/ntfs3/ @@ -460,7 +460,7 @@ index cf9c9221c388..cf155b1f9480 100644 M: Finn Thain L: linux-m68k@lists.linux-m68k.org diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig -index faf983680040..2907b5c23368 100644 +index 41c54051347a..bde398e12696 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -507,7 +507,6 @@ config OPEN_DICE @@ -1633,11 +1633,11 @@ index dcfa38fdc93c..4a8095a3fc34 100644 #endif diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile -index 9039f3709aff..d5aeaa8fe3ca 100644 +index bc8fe9e8f7f2..b1296bd8eb3f 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile -@@ -16,6 +16,7 @@ TARGETS += damon - TARGETS += devices +@@ -17,6 +17,7 @@ TARGETS += devices/error_logs + TARGETS += devices/probe TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf +TARGETS += drivers/ntsync @@ -3085,6 +3085,5 @@ index 000000000000..5fa2c9a0768c + +TEST_HARNESS_MAIN -- -2.45.2 - +2.47.0.rc0 diff --git a/sys-kernel/gentoo-sources-6.10.3/0010-perf-per-core.patch b/sys-kernel/gentoo-sources-6.11/0008-perf-per-core.patch similarity index 99% rename from sys-kernel/gentoo-sources-6.10.3/0010-perf-per-core.patch rename to sys-kernel/gentoo-sources-6.11/0008-perf-per-core.patch index 99c2a35..50b57d7 100644 --- a/sys-kernel/gentoo-sources-6.10.3/0010-perf-per-core.patch +++ b/sys-kernel/gentoo-sources-6.11/0008-perf-per-core.patch @@ -1,7 +1,7 @@ -From a2c8bc637c7a2e45c1189f2e92f3712715d957ba Mon Sep 17 00:00:00 2001 +From f3788bc44e2875141e8cf16b36365cb2bac541a6 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 3 Aug 2024 09:34:27 +0200 -Subject: [PATCH 10/12] perf-per-core +Date: Fri, 4 Oct 2024 17:08:44 +0200 +Subject: [PATCH 08/10] perf-per-core Signed-off-by: Peter Jung --- @@ -29,7 +29,7 @@ index 7352ab89a55a..c12837e61bda 100644 System topology examples diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c 
-index 0c5e7a7c43ac..cd808b699ccc 100644 +index b985ca79cf97..8206038a01ac 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -39,6 +39,10 @@ @@ -856,7 +856,7 @@ index 0c5e7a7c43ac..cd808b699ccc 100644 } module_exit(intel_rapl_exit); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index cb4f6c513c48..1ffe4260bef6 100644 +index a75a07f4931f..5a59713ec62b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -98,6 +98,7 @@ struct cpuinfo_topology { @@ -868,7 +868,7 @@ index cb4f6c513c48..1ffe4260bef6 100644 // AMD Node ID and Nodes per Package info u32 amd_node_id; diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h -index e5b203fe7956..8c2fea7dd065 100644 +index aef70336d624..672fccf9f845 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -137,6 +137,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); @@ -904,5 +904,5 @@ index 9a6069e7133c..23722aa21e2f 100644 /* Package relative core ID */ -- -2.46.0.rc1 +2.47.0.rc0 diff --git a/sys-kernel/git-sources/0010-zstd.patch b/sys-kernel/gentoo-sources-6.11/0010-zstd.patch similarity index 99% rename from sys-kernel/git-sources/0010-zstd.patch rename to sys-kernel/gentoo-sources-6.11/0010-zstd.patch index 5b692da..347041b 100644 --- a/sys-kernel/git-sources/0010-zstd.patch +++ b/sys-kernel/gentoo-sources-6.11/0010-zstd.patch @@ -1,6 +1,6 @@ -From db3817dff7110c38462a1f918adec6a422f75406 Mon Sep 17 00:00:00 2001 +From cf0e4ae5c086f49c71b2a5aad50a589d8aa1799e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 17 Jun 2024 15:29:10 +0200 +Date: Fri, 4 Oct 2024 17:09:19 +0200 Subject: [PATCH 10/10] zstd Signed-off-by: Peter Jung @@ -18648,6 +18648,5 @@ index f4ed952ed485..7d31518e9d5a 100644 EXPORT_SYMBOL(zstd_reset_dstream); -- -2.45.2 - +2.47.0.rc0 diff --git a/sys-kernel/gentoo-sources-6.11/0100-glitched-additional-timer-tick-frequencies.patch b/sys-kernel/gentoo-sources-6.11/0100-glitched-additional-timer-tick-frequencies.patch new file mode 100644 index 0000000..3b8a030 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11/0100-glitched-additional-timer-tick-frequencies.patch @@ -0,0 +1,55 @@ +From b27b06990e40226b04623ee1a863e807cebee48f Mon Sep 17 00:00:00 2001 +From: Andre Ramnitz +Date: Tue, 21 Mar 2023 00:12:08 +0100 +Subject: glitched: additional timer tick frequencies. + +--- + kernel/Kconfig.hz | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..f648df15ef4c 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_600 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -40,6 +40,20 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_600 ++ bool "600 HZ" ++ help ++ 600 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_900 ++ bool "900 HZ" ++ help ++ 900 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness. Like 300HZ on ++ steroids. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -53,6 +67,8 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 600 if HZ_600 ++ default 900 if HZ_900 + default 1000 if HZ_1000 + + config SCHED_HRTICK +-- +2.39.2 + diff --git a/sys-kernel/gentoo-sources-6.12/0001-preempt-lazy.patch b/sys-kernel/gentoo-sources-6.12/0001-preempt-lazy.patch new file mode 100644 index 0000000..7654052 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0001-preempt-lazy.patch @@ -0,0 +1,958 @@ +From 5ddf15cb65a8c14868cdc743474bd0a4fa9b586f Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Fri, 13 Dec 2024 23:03:09 +0800 +Subject: [PATCH] preempt-lazy + +Signed-off-by: Eric Naim +--- + arch/x86/Kconfig | 1 + + arch/x86/include/asm/thread_info.h | 6 +- + include/linux/entry-common.h | 3 +- + include/linux/entry-kvm.h | 5 +- + include/linux/preempt.h | 8 +- + include/linux/rcupdate.h | 2 +- + include/linux/rcutree.h | 2 +- + include/linux/sched.h | 3 +- + include/linux/srcutiny.h | 2 +- + include/linux/thread_info.h | 21 +++++- + include/linux/trace_events.h | 8 +- + kernel/Kconfig.preempt | 25 ++++++- + kernel/entry/common.c | 2 +- + kernel/entry/kvm.c | 4 +- + kernel/rcu/Kconfig | 4 +- + kernel/rcu/srcutiny.c | 14 ++-- + kernel/rcu/tree_plugin.h | 22 ++++-- + kernel/sched/core.c | 116 +++++++++++++++++++++++++---- + kernel/sched/debug.c | 7 +- + kernel/sched/fair.c | 6 +- + kernel/sched/sched.h | 1 + + kernel/trace/trace.c | 2 + + kernel/trace/trace_osnoise.c | 32 ++++---- + kernel/trace/trace_output.c | 16 +++- + 24 files changed, 232 insertions(+), 80 deletions(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index f127d0f1024e..4b28c191ae31 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -93,6 +93,7 @@ config X86 + select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + select ARCH_HAS_PMEM_API if X86_64 ++ select ARCH_HAS_PREEMPT_LAZY + select ARCH_HAS_PTE_DEVMAP if X86_64 + select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_HW_PTE_YOUNG +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index 12da7dfd5ef1..a55c214f3ba6 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -87,8 +87,9 @@ struct thread_info { + #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ + #define TIF_SIGPENDING 2 /* signal pending */ + #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ +-#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +-#define TIF_SSBD 5 /* Speculative store bypass disable */ ++#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ ++#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ ++#define TIF_SSBD 6 /* Speculative store bypass disable */ + #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ + #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ + #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ +@@ -110,6 +111,7 @@ struct thread_info { + #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) + #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) + #define _TIF_SSBD (1 << TIF_SSBD) + #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) +diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h +index 1e50cdb83ae5..fc61d0205c97 100644 +--- a/include/linux/entry-common.h ++++ 
b/include/linux/entry-common.h +@@ -64,7 +64,8 @@ + + #define EXIT_TO_USER_MODE_WORK \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ +- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ++ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + ARCH_EXIT_TO_USER_MODE_WORK) + + /** +diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h +index 6813171afccb..16149f6625e4 100644 +--- a/include/linux/entry-kvm.h ++++ b/include/linux/entry-kvm.h +@@ -17,8 +17,9 @@ + #endif + + #define XFER_TO_GUEST_MODE_WORK \ +- (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ +- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) ++ (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ ++ _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ ++ ARCH_XFER_TO_GUEST_MODE_WORK) + + struct kvm_vcpu; + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index ce76f1a45722..ca86235ac15c 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + extern bool preempt_model_none(void); + extern bool preempt_model_voluntary(void); + extern bool preempt_model_full(void); ++extern bool preempt_model_lazy(void); + + #else + +@@ -502,6 +503,11 @@ static inline bool preempt_model_full(void) + return IS_ENABLED(CONFIG_PREEMPT); + } + ++static inline bool preempt_model_lazy(void) ++{ ++ return IS_ENABLED(CONFIG_PREEMPT_LAZY); ++} ++ + #endif + + static inline bool preempt_model_rt(void) +@@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void) + */ + static inline bool preempt_model_preemptible(void) + { +- return preempt_model_full() || preempt_model_rt(); ++ return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); + } + + #endif /* __LINUX_PREEMPT_H */ +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index 48e5c03df1dd..257e9ae34414 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -95,9 +95,9 @@ static inline void __rcu_read_lock(void) + + static inline void __rcu_read_unlock(void) + { +- preempt_enable(); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + rcu_read_unlock_strict(); ++ preempt_enable(); + } + + static inline int rcu_preempt_depth(void) +diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h +index 90a684f94776..ae8b5cb475a3 100644 +--- a/include/linux/rcutree.h ++++ b/include/linux/rcutree.h +@@ -104,7 +104,7 @@ extern int rcu_scheduler_active; + void rcu_end_inkernel_boot(void); + bool rcu_inkernel_boot_has_ended(void); + bool rcu_is_watching(void); +-#ifndef CONFIG_PREEMPTION ++#ifndef CONFIG_PREEMPT_RCU + void rcu_all_qs(void); + #endif + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index bb343136ddd0..ade641760900 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2002,7 +2002,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) + + static inline void clear_tsk_need_resched(struct task_struct *tsk) + { +- clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); ++ atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, ++ (atomic_long_t *)&task_thread_info(tsk)->flags); + } + + static inline int test_tsk_need_resched(struct task_struct *tsk) +diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h +index 4d96bbdb45f0..1635c5e2662f 100644 +--- a/include/linux/srcutiny.h ++++ b/include/linux/srcutiny.h +@@ -64,7 +64,7 @@ static inline int __srcu_read_lock(struct srcu_struct 
*ssp) + { + int idx; + +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; + WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); + preempt_enable(); +diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h +index 9ea0b28068f4..cf2446c9c30d 100644 +--- a/include/linux/thread_info.h ++++ b/include/linux/thread_info.h +@@ -59,6 +59,14 @@ enum syscall_work_bit { + + #include + ++#ifndef TIF_NEED_RESCHED_LAZY ++#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY ++#error Inconsistent PREEMPT_LAZY ++#endif ++#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED ++#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED ++#endif ++ + #ifdef __KERNEL__ + + #ifndef arch_set_restart_data +@@ -179,22 +187,27 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti + + #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + +-static __always_inline bool tif_need_resched(void) ++static __always_inline bool tif_test_bit(int bit) + { +- return arch_test_bit(TIF_NEED_RESCHED, ++ return arch_test_bit(bit, + (unsigned long *)(¤t_thread_info()->flags)); + } + + #else + +-static __always_inline bool tif_need_resched(void) ++static __always_inline bool tif_test_bit(int bit) + { +- return test_bit(TIF_NEED_RESCHED, ++ return test_bit(bit, + (unsigned long *)(¤t_thread_info()->flags)); + } + + #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ + ++static __always_inline bool tif_need_resched(void) ++{ ++ return tif_test_bit(TIF_NEED_RESCHED); ++} ++ + #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES + static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index 42bedcddd511..4cae6f258137 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -184,8 +184,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); + + enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, +- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, +- TRACE_FLAG_NEED_RESCHED = 0x04, ++ TRACE_FLAG_NEED_RESCHED = 0x02, ++ TRACE_FLAG_NEED_RESCHED_LAZY = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, +@@ -211,11 +211,11 @@ static inline unsigned int tracing_gen_ctx(void) + + static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) + { +- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++ return tracing_gen_ctx_irq_test(0); + } + static inline unsigned int tracing_gen_ctx(void) + { +- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++ return tracing_gen_ctx_irq_test(0); + } + #endif + +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index fe782cd77388..7c1b29a3a491 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -11,12 +11,16 @@ config PREEMPT_BUILD + select PREEMPTION + select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK + ++config ARCH_HAS_PREEMPT_LAZY ++ bool ++ + choice + prompt "Preemption Model" + default PREEMPT_NONE + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" ++ depends on !PREEMPT_RT + select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC + help + This is the traditional Linux preemption model, geared towards +@@ -32,6 +36,7 @@ config PREEMPT_NONE + config PREEMPT_VOLUNTARY + bool "Voluntary Kernel Preemption (Desktop)" + depends on !ARCH_NO_PREEMPT ++ depends on !PREEMPT_RT + select PREEMPT_VOLUNTARY_BUILD if 
!PREEMPT_DYNAMIC + help + This option reduces the latency of the kernel by adding more +@@ -51,7 +56,7 @@ config PREEMPT_VOLUNTARY + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" + depends on !ARCH_NO_PREEMPT +- select PREEMPT_BUILD ++ select PREEMPT_BUILD if !PREEMPT_DYNAMIC + help + This option reduces the latency of the kernel by making + all kernel code (that is not executing in a critical section) +@@ -67,6 +72,20 @@ config PREEMPT + embedded system with latency requirements in the milliseconds + range. + ++config PREEMPT_LAZY ++ bool "Scheduler controlled preemption model" ++ depends on !ARCH_NO_PREEMPT ++ depends on ARCH_HAS_PREEMPT_LAZY ++ select PREEMPT_BUILD if !PREEMPT_DYNAMIC ++ help ++ This option provides a scheduler driven preemption model that ++ is fundamentally similar to full preemption, but is less ++ eager to preempt SCHED_NORMAL tasks in an attempt to ++ reduce lock holder preemption and recover some of the performance ++ gains seen from using Voluntary preemption. ++ ++endchoice ++ + config PREEMPT_RT + bool "Fully Preemptible Kernel (Real-Time)" + depends on EXPERT && ARCH_SUPPORTS_RT +@@ -84,8 +103,6 @@ config PREEMPT_RT + Select this if you are building a kernel for systems which + require real-time guarantees. + +-endchoice +- + config PREEMPT_COUNT + bool + +@@ -95,7 +112,7 @@ config PREEMPTION + + config PREEMPT_DYNAMIC + bool "Preemption behaviour defined on boot" +- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT ++ depends on HAVE_PREEMPT_DYNAMIC + select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY + select PREEMPT_BUILD + default y if HAVE_PREEMPT_DYNAMIC_CALL +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index 5b6934e23c21..e33691d5adf7 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + + local_irq_enable_exit_to_user(ti_work); + +- if (ti_work & _TIF_NEED_RESCHED) ++ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) + schedule(); + + if (ti_work & _TIF_UPROBE) +diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c +index 2e0f75bcb7fd..8485f63863af 100644 +--- a/kernel/entry/kvm.c ++++ b/kernel/entry/kvm.c +@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) + return -EINTR; + } + +- if (ti_work & _TIF_NEED_RESCHED) ++ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) + schedule(); + + if (ti_work & _TIF_NOTIFY_RESUME) +@@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) + return ret; + + ti_work = read_thread_flags(); +- } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); ++ } while (ti_work & XFER_TO_GUEST_MODE_WORK); + return 0; + } + +diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig +index 3e079de0f5b4..9d52f87fac27 100644 +--- a/kernel/rcu/Kconfig ++++ b/kernel/rcu/Kconfig +@@ -18,7 +18,7 @@ config TREE_RCU + + config PREEMPT_RCU + bool +- default y if PREEMPTION ++ default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) + select TREE_RCU + help + This option selects the RCU implementation that is +@@ -91,7 +91,7 @@ config NEED_TASKS_RCU + + config TASKS_RCU + bool +- default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) ++ default NEED_TASKS_RCU && PREEMPTION + select IRQ_WORK + + config FORCE_TASKS_RUDE_RCU +diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c +index 4dcbf8aa80ff..f688bdad293e 100644 +--- a/kernel/rcu/srcutiny.c ++++ b/kernel/rcu/srcutiny.c +@@ -98,7 +98,7 @@ void 
__srcu_read_unlock(struct srcu_struct *ssp, int idx) + { + int newval; + +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; + WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); + preempt_enable(); +@@ -120,7 +120,7 @@ void srcu_drive_gp(struct work_struct *wp) + struct srcu_struct *ssp; + + ssp = container_of(wp, struct srcu_struct, srcu_work); +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { + preempt_enable(); + return; /* Already running or nothing to do. */ +@@ -138,7 +138,7 @@ void srcu_drive_gp(struct work_struct *wp) + WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + preempt_enable(); + swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + preempt_enable(); +@@ -159,7 +159,7 @@ void srcu_drive_gp(struct work_struct *wp) + * at interrupt level, but the ->srcu_gp_running checks will + * straighten that out. + */ +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + WRITE_ONCE(ssp->srcu_gp_running, false); + idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); + preempt_enable(); +@@ -172,7 +172,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) + { + unsigned long cookie; + +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + cookie = get_state_synchronize_srcu(ssp); + if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { + preempt_enable(); +@@ -199,7 +199,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + + rhp->func = func; + rhp->next = NULL; +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + local_irq_save(flags); + *ssp->srcu_cb_tail = rhp; + ssp->srcu_cb_tail = &rhp->next; +@@ -261,7 +261,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) + { + unsigned long ret; + +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + ret = get_state_synchronize_srcu(ssp); + srcu_gp_start_if_needed(ssp); + preempt_enable(); +diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h +index 1c7cbd145d5e..304e3405e6ec 100644 +--- a/kernel/rcu/tree_plugin.h ++++ b/kernel/rcu/tree_plugin.h +@@ -832,8 +832,17 @@ void rcu_read_unlock_strict(void) + { + struct rcu_data *rdp; + +- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) ++ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) + return; ++ ++ /* ++ * rcu_report_qs_rdp() can only be invoked with a stable rdp and ++ * from the local CPU. ++ * ++ * The in_atomic_preempt_off() check ensures that we come here holding ++ * the last preempt_count (which will get dropped once we return to ++ * __rcu_read_unlock(). 
++ */ + rdp = this_cpu_ptr(&rcu_data); + rdp->cpu_no_qs.b.norm = false; + rcu_report_qs_rdp(rdp); +@@ -974,13 +983,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) + */ + static void rcu_flavor_sched_clock_irq(int user) + { +- if (user || rcu_is_cpu_rrupt_from_idle()) { ++ if (user || rcu_is_cpu_rrupt_from_idle() || ++ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && ++ (preempt_count() == HARDIRQ_OFFSET))) { + + /* + * Get here if this CPU took its interrupt from user +- * mode or from the idle loop, and if this is not a +- * nested interrupt. In this case, the CPU is in +- * a quiescent state, so note it. ++ * mode, from the idle loop without this being a nested ++ * interrupt, or while not holding the task preempt count ++ * (with PREEMPT_COUNT=y). In this case, the CPU is in a ++ * quiescent state, so note it. + * + * No memory barrier is required here because rcu_qs() + * references only CPU-local variables that other CPUs +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 76b27b2a9c56..e82948e247c1 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -941,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq) + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +-static inline bool set_nr_and_not_polling(struct task_struct *p) ++static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) + { +- struct thread_info *ti = task_thread_info(p); +- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++ return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); + } + + /* +@@ -969,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p) + } + + #else +-static inline bool set_nr_and_not_polling(struct task_struct *p) ++static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) + { +- set_tsk_need_resched(p); ++ atomic_long_or(1 << tif, (atomic_long_t *)&ti->flags); + return true; + } + +@@ -1076,28 +1075,66 @@ void wake_up_q(struct wake_q_head *head) + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. 
+ */ +-void resched_curr(struct rq *rq) ++static void __resched_curr(struct rq *rq, int tif) + { + struct task_struct *curr = rq->curr; ++ struct thread_info *cti = task_thread_info(curr); + int cpu; + + lockdep_assert_rq_held(rq); + +- if (test_tsk_need_resched(curr)) ++ if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) ++ tif = TIF_NEED_RESCHED; ++ ++ if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) + return; + + cpu = cpu_of(rq); + + if (cpu == smp_processor_id()) { +- set_tsk_need_resched(curr); +- set_preempt_need_resched(); ++ set_ti_thread_flag(cti, tif); ++ if (tif == TIF_NEED_RESCHED) ++ set_preempt_need_resched(); + return; + } + +- if (set_nr_and_not_polling(curr)) +- smp_send_reschedule(cpu); +- else ++ if (set_nr_and_not_polling(cti, tif)) { ++ if (tif == TIF_NEED_RESCHED) ++ smp_send_reschedule(cpu); ++ } else { + trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void resched_curr(struct rq *rq) ++{ ++ __resched_curr(rq, TIF_NEED_RESCHED); ++} ++ ++#ifdef CONFIG_PREEMPT_DYNAMIC ++static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); ++static __always_inline bool dynamic_preempt_lazy(void) ++{ ++ return static_branch_unlikely(&sk_dynamic_preempt_lazy); ++} ++#else ++static __always_inline bool dynamic_preempt_lazy(void) ++{ ++ return IS_ENABLED(CONFIG_PREEMPT_LAZY); ++} ++#endif ++ ++static __always_inline int tif_need_resched_lazy(void) ++{ ++ if (dynamic_preempt_lazy()) ++ return TIF_NEED_RESCHED_LAZY; ++ ++ return TIF_NEED_RESCHED; ++} ++ ++void resched_curr_lazy(struct rq *rq) ++{ ++ __resched_curr(rq, tif_need_resched_lazy()); + } + + void resched_cpu(int cpu) +@@ -1192,7 +1229,7 @@ static void wake_up_idle_cpu(int cpu) + * and testing of the above solutions didn't appear to report + * much benefits. + */ +- if (set_nr_and_not_polling(rq->idle)) ++ if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +@@ -5604,6 +5641,10 @@ void sched_tick(void) + update_rq_clock(rq); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); ++ ++ if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) ++ resched_curr(rq); ++ + curr->sched_class->task_tick(rq, curr, 0); + if (sched_feat(LATENCY_WARN)) + resched_latency = cpu_resched_latency(rq); +@@ -7219,7 +7260,7 @@ int __sched __cond_resched(void) + return 1; + } + /* +- * In preemptible kernels, ->rcu_read_lock_nesting tells the tick ++ * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, +@@ -7228,6 +7269,8 @@ int __sched __cond_resched(void) + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. ++ * A third case, preemptible, but non-PREEMPT_RCU provides for ++ * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). 
+ */ + #ifndef CONFIG_PREEMPT_RCU + rcu_all_qs(); +@@ -7352,6 +7395,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP ++ * dynamic_preempt_lazy <- false + * + * VOLUNTARY: + * cond_resched <- __cond_resched +@@ -7359,6 +7403,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP ++ * dynamic_preempt_lazy <- false + * + * FULL: + * cond_resched <- RET0 +@@ -7366,6 +7411,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched ++ * dynamic_preempt_lazy <- false ++ * ++ * LAZY: ++ * cond_resched <- RET0 ++ * might_resched <- RET0 ++ * preempt_schedule <- preempt_schedule ++ * preempt_schedule_notrace <- preempt_schedule_notrace ++ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched ++ * dynamic_preempt_lazy <- true + */ + + enum { +@@ -7373,30 +7427,41 @@ enum { + preempt_dynamic_none, + preempt_dynamic_voluntary, + preempt_dynamic_full, ++ preempt_dynamic_lazy, + }; + + int preempt_dynamic_mode = preempt_dynamic_undefined; + + int sched_dynamic_mode(const char *str) + { ++#ifndef CONFIG_PREEMPT_RT + if (!strcmp(str, "none")) + return preempt_dynamic_none; + + if (!strcmp(str, "voluntary")) + return preempt_dynamic_voluntary; ++#endif + + if (!strcmp(str, "full")) + return preempt_dynamic_full; + ++#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY ++ if (!strcmp(str, "lazy")) ++ return preempt_dynamic_lazy; ++#endif ++ + return -EINVAL; + } + ++#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) ++#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) ++ + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) + #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) + #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) + #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +-#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) +-#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) ++#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) ++#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) + #else + #error "Unsupported PREEMPT_DYNAMIC mechanism" + #endif +@@ -7416,6 +7481,7 @@ static void __sched_dynamic_update(int mode) + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_disable(preempt_lazy); + + switch (mode) { + case preempt_dynamic_none: +@@ -7425,6 +7491,7 @@ static void __sched_dynamic_update(int mode) + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: none\n"); + break; +@@ -7436,6 +7503,7 @@ static void __sched_dynamic_update(int mode) + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: voluntary\n"); + break; +@@ -7447,9 +7515,22 @@ static void 
__sched_dynamic_update(int mode) + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: full\n"); + break; ++ ++ case preempt_dynamic_lazy: ++ if (!klp_override) ++ preempt_dynamic_disable(cond_resched); ++ preempt_dynamic_disable(might_resched); ++ preempt_dynamic_enable(preempt_schedule); ++ preempt_dynamic_enable(preempt_schedule_notrace); ++ preempt_dynamic_enable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_enable(preempt_lazy); ++ if (mode != preempt_dynamic_mode) ++ pr_info("Dynamic Preempt: lazy\n"); ++ break; + } + + preempt_dynamic_mode = mode; +@@ -7512,6 +7593,8 @@ static void __init preempt_dynamic_init(void) + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { + sched_dynamic_update(preempt_dynamic_voluntary); ++ } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { ++ sched_dynamic_update(preempt_dynamic_lazy); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); +@@ -7532,6 +7615,7 @@ static void __init preempt_dynamic_init(void) + PREEMPT_MODEL_ACCESSOR(none); + PREEMPT_MODEL_ACCESSOR(voluntary); + PREEMPT_MODEL_ACCESSOR(full); ++PREEMPT_MODEL_ACCESSOR(lazy); + + #else /* !CONFIG_PREEMPT_DYNAMIC: */ + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index f4035c7a0fa1..a48b2a701ec2 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + static int sched_dynamic_show(struct seq_file *m, void *v) + { + static const char * preempt_modes[] = { +- "none", "voluntary", "full" ++ "none", "voluntary", "full", "lazy", + }; +- int i; ++ int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); ++ int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + +- for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { ++ for (; i < j; i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); + seq_puts(m, preempt_modes[i]); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 54e7c4c3e2c5..10e9484d1d43 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1264,7 +1264,7 @@ static void update_curr(struct cfs_rq *cfs_rq) + return; + + if (resched || did_preempt_short(cfs_rq, curr)) { +- resched_curr(rq); ++ resched_curr_lazy(rq); + clear_buddies(cfs_rq, curr); + } + } +@@ -5691,7 +5691,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) + * validating it and just reschedule. 
+ */ + if (queued) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + return; + } + /* +@@ -8855,7 +8855,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + return; + + preempt: +- resched_curr(rq); ++ resched_curr_lazy(rq); + } + + static struct task_struct *pick_task_fair(struct rq *rq) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c5d6012794de..b5f3890f3050 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2696,6 +2696,7 @@ extern void init_sched_rt_class(void); + extern void init_sched_fair_class(void); + + extern void resched_curr(struct rq *rq); ++extern void resched_curr_lazy(struct rq *rq); + extern void resched_cpu(int cpu); + + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 6a891e00aa7f..acbed0ffe083 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2563,6 +2563,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; ++ if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY)) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; + return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; + } +diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c +index a50ed23bee77..4a9087112526 100644 +--- a/kernel/trace/trace_osnoise.c ++++ b/kernel/trace/trace_osnoise.c +@@ -1537,27 +1537,25 @@ static int run_osnoise(void) + + /* + * In some cases, notably when running on a nohz_full CPU with +- * a stopped tick PREEMPT_RCU has no way to account for QSs. +- * This will eventually cause unwarranted noise as PREEMPT_RCU +- * will force preemption as the means of ending the current +- * grace period. We avoid this problem by calling +- * rcu_momentary_eqs(), which performs a zero duration +- * EQS allowing PREEMPT_RCU to end the current grace period. +- * This call shouldn't be wrapped inside an RCU critical +- * section. ++ * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to ++ * account for QSs. This will eventually cause unwarranted ++ * noise as RCU forces preemption as the means of ending the ++ * current grace period. We avoid this by calling ++ * rcu_momentary_eqs(), which performs a zero duration EQS ++ * allowing RCU to end the current grace period. This call ++ * shouldn't be wrapped inside an RCU critical section. + * +- * Note that in non PREEMPT_RCU kernels QSs are handled through +- * cond_resched() ++ * Normally QSs for other cases are handled through cond_resched(). ++ * For simplicity, however, we call rcu_momentary_eqs() for all ++ * configurations here. 
+ */ +- if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { +- if (!disable_irq) +- local_irq_disable(); ++ if (!disable_irq) ++ local_irq_disable(); + +- rcu_momentary_eqs(); ++ rcu_momentary_eqs(); + +- if (!disable_irq) +- local_irq_enable(); +- } ++ if (!disable_irq) ++ local_irq_enable(); + + /* + * For the non-preemptive kernel config: let threads runs, if +diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c +index 868f2f912f28..23ca2155306b 100644 +--- a/kernel/trace/trace_output.c ++++ b/kernel/trace/trace_output.c +@@ -460,20 +460,32 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + bh_off ? 'b' : +- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : ++ !IS_ENABLED(CONFIG_TRACE_IRQFLAGS_SUPPORT) ? 'X' : + '.'; + +- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | ++ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | + TRACE_FLAG_PREEMPT_RESCHED)) { ++ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: ++ need_resched = 'B'; ++ break; + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'N'; + break; ++ case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: ++ need_resched = 'L'; ++ break; ++ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY: ++ need_resched = 'b'; ++ break; + case TRACE_FLAG_NEED_RESCHED: + need_resched = 'n'; + break; + case TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'p'; + break; ++ case TRACE_FLAG_NEED_RESCHED_LAZY: ++ need_resched = 'l'; ++ break; + default: + need_resched = '.'; + break; +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0002-amd-pstate.patch b/sys-kernel/gentoo-sources-6.12/0002-amd-pstate.patch new file mode 100644 index 0000000..20e109a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0002-amd-pstate.patch @@ -0,0 +1,902 @@ +From 5b24edbe81299a51cf1694d0e33c33d995e2c04d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:50:07 +0100 +Subject: [PATCH 02/12] amd-pstate + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/pm/amd-pstate.rst | 4 +- + arch/x86/include/asm/cpufeatures.h | 3 +- + arch/x86/include/asm/intel-family.h | 6 + + arch/x86/include/asm/processor.h | 18 ++ + arch/x86/include/asm/topology.h | 9 + + arch/x86/kernel/acpi/cppc.c | 23 ++ + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/scattered.c | 3 +- + arch/x86/kernel/cpu/topology_amd.c | 3 + + arch/x86/kernel/cpu/topology_common.c | 34 +++ + arch/x86/kernel/smpboot.c | 14 +- + arch/x86/mm/init.c | 23 +- + drivers/cpufreq/amd-pstate-ut.c | 6 +- + drivers/cpufreq/amd-pstate.c | 235 +++++++++----------- + tools/arch/x86/include/asm/cpufeatures.h | 2 +- + 15 files changed, 239 insertions(+), 145 deletions(-) + +diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst +index 210a808b74ec..412423c54f25 100644 +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -251,9 +251,7 @@ performance supported in `AMD CPPC Performance Capability `_). + In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` + table, so we need to expose it to sysfs. If boost is not active, but + still supported, this maximum frequency will be larger than the one in +-``cpuinfo``. 
On systems that support preferred core, the driver will have +-different values for some cores than others and this will reflect the values +-advertised by the platform at bootup. ++``cpuinfo``. + This attribute is read-only. + + ``amd_pstate_lowest_nonlinear_freq`` +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 913fd3a7bac6..a7c93191b7c6 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -473,7 +473,8 @@ + #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ + #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ + #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +-#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ ++#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* Fast CPPC */ ++#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ + + /* + * BUG word(s) +diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h +index 1a42f829667a..736764472048 100644 +--- a/arch/x86/include/asm/intel-family.h ++++ b/arch/x86/include/asm/intel-family.h +@@ -183,4 +183,10 @@ + /* Family 19 */ + #define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ + ++/* CPU core types */ ++enum intel_cpu_type { ++ INTEL_CPU_TYPE_ATOM = 0x20, ++ INTEL_CPU_TYPE_CORE = 0x40, ++}; ++ + #endif /* _ASM_X86_INTEL_FAMILY_H */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 2d776635aa53..20e6009381ed 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -105,6 +105,24 @@ struct cpuinfo_topology { + // Cache level topology IDs + u32 llc_id; + u32 l2c_id; ++ ++ // Hardware defined CPU-type ++ union { ++ u32 cpu_type; ++ struct { ++ // CPUID.1A.EAX[23-0] ++ u32 intel_native_model_id :24; ++ // CPUID.1A.EAX[31-24] ++ u32 intel_type :8; ++ }; ++ struct { ++ // CPUID 0x80000026.EBX ++ u32 amd_num_processors :16, ++ amd_power_eff_ranking :8, ++ amd_native_model_id :4, ++ amd_type :4; ++ }; ++ }; + }; + + struct cpuinfo_x86 { +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index 92f3664dd933..fd41103ad342 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -114,6 +114,12 @@ enum x86_topology_domains { + TOPO_MAX_DOMAIN, + }; + ++enum x86_topology_cpu_type { ++ TOPO_CPU_TYPE_PERFORMANCE, ++ TOPO_CPU_TYPE_EFFICIENCY, ++ TOPO_CPU_TYPE_UNKNOWN, ++}; ++ + struct x86_topology_system { + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_size[TOPO_MAX_DOMAIN]; +@@ -149,6 +155,9 @@ extern unsigned int __max_threads_per_core; + extern unsigned int __num_threads_per_package; + extern unsigned int __num_cores_per_package; + ++const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c); ++enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c); ++ + static inline unsigned int topology_max_packages(void) + { + return __max_logical_packages; +diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c +index aab9d0570841..d745dd586303 100644 +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -239,8 +239,10 @@ EXPORT_SYMBOL_GPL(amd_detect_prefcore); + */ + int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + { ++ enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu)); + bool prefcore; + int ret; ++ u32 tmp; + + ret = 
amd_detect_prefcore(&prefcore); + if (ret) +@@ -266,6 +268,27 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + break; + } + } ++ ++ /* detect if running on heterogeneous design */ ++ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) { ++ switch (core_type) { ++ case TOPO_CPU_TYPE_UNKNOWN: ++ pr_warn("Undefined core type found for cpu %d\n", cpu); ++ break; ++ case TOPO_CPU_TYPE_PERFORMANCE: ++ /* use the max scale for performance cores */ ++ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; ++ return 0; ++ case TOPO_CPU_TYPE_EFFICIENCY: ++ /* use the highest perf value for efficiency cores */ ++ ret = amd_get_highest_perf(cpu, &tmp); ++ if (ret) ++ return ret; ++ *numerator = tmp; ++ return 0; ++ } ++ } ++ + *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; +diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c +index 3baf3e435834..10719aba6276 100644 +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -22,6 +22,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) + seq_printf(m, "die_id: %u\n", c->topo.die_id); + seq_printf(m, "cu_id: %u\n", c->topo.cu_id); + seq_printf(m, "core_id: %u\n", c->topo.core_id); ++ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index c84c30188fdf..307a91741534 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -45,13 +45,14 @@ static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, +- { X86_FEATURE_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, ++ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, ++ { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, + { 0, 0, 0, 0, 0 } + }; + +diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c +index 7d476fa697ca..03b3c9c3a45e 100644 +--- a/arch/x86/kernel/cpu/topology_amd.c ++++ b/arch/x86/kernel/cpu/topology_amd.c +@@ -182,6 +182,9 @@ static void parse_topology_amd(struct topo_scan *tscan) + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) + has_topoext = cpu_parse_topology_ext(tscan); + ++ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) ++ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); ++ + if (!has_topoext && !parse_8000_0008(tscan)) + return; + +diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c +index 9a6069e7133c..8277c64f88db 100644 +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -3,6 +3,7 @@ + + #include + ++#include + #include + #include + #include +@@ -27,6 +28,36 @@ void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + } + } + ++enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c) ++{ ++ if (c->x86_vendor == 
X86_VENDOR_INTEL) { ++ switch (c->topo.intel_type) { ++ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY; ++ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE; ++ } ++ } ++ if (c->x86_vendor == X86_VENDOR_AMD) { ++ switch (c->topo.amd_type) { ++ case 0: return TOPO_CPU_TYPE_PERFORMANCE; ++ case 1: return TOPO_CPU_TYPE_EFFICIENCY; ++ } ++ } ++ ++ return TOPO_CPU_TYPE_UNKNOWN; ++} ++ ++const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c) ++{ ++ switch (get_topology_cpu_type(c)) { ++ case TOPO_CPU_TYPE_PERFORMANCE: ++ return "performance"; ++ case TOPO_CPU_TYPE_EFFICIENCY: ++ return "efficiency"; ++ default: ++ return "unknown"; ++ } ++} ++ + static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) + { + struct { +@@ -87,6 +118,7 @@ static void parse_topology(struct topo_scan *tscan, bool early) + .cu_id = 0xff, + .llc_id = BAD_APICID, + .l2c_id = BAD_APICID, ++ .cpu_type = TOPO_CPU_TYPE_UNKNOWN, + }; + struct cpuinfo_x86 *c = tscan->c; + struct { +@@ -132,6 +164,8 @@ static void parse_topology(struct topo_scan *tscan, bool early) + case X86_VENDOR_INTEL: + if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) + parse_legacy(tscan); ++ if (c->cpuid_level >= 0x1a) ++ c->topo.cpu_type = cpuid_eax(0x1a); + break; + case X86_VENDOR_HYGON: + if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 766f092dab80..419e7ae09639 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -62,6 +62,8 @@ + #include + #include + ++#include ++ + #include + #include + #include +@@ -498,7 +500,17 @@ static int x86_cluster_flags(void) + static int x86_die_flags(void) + { + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) +- return x86_sched_itmt_flags(); ++ return x86_sched_itmt_flags(); ++ ++ switch (boot_cpu_data.x86_vendor) { ++ case X86_VENDOR_AMD: ++ case X86_VENDOR_HYGON: ++ bool prefcore = false; ++ ++ amd_detect_prefcore(&prefcore); ++ if (prefcore || cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) ++ return x86_sched_itmt_flags(); ++ }; + + return 0; + } +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index eb503f53c319..101725c149c4 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -263,28 +263,33 @@ static void __init probe_page_size_mask(void) + } + + /* +- * INVLPG may not properly flush Global entries +- * on these CPUs when PCIDs are enabled. ++ * INVLPG may not properly flush Global entries on ++ * these CPUs. New microcode fixes the issue. 
+ */ + static const struct x86_cpu_id invlpg_miss_ids[] = { +- X86_MATCH_VFM(INTEL_ALDERLAKE, 0), +- X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0), +- X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0), ++ X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e), ++ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c), ++ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e), + {} + }; + + static void setup_pcid(void) + { ++ const struct x86_cpu_id *invlpg_miss_match; ++ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + +- if (x86_match_cpu(invlpg_miss_ids)) { ++ invlpg_miss_match = x86_match_cpu(invlpg_miss_ids); ++ ++ if (invlpg_miss_match && ++ boot_cpu_data.microcode < invlpg_miss_match->driver_data) { + pr_info("Incomplete global flushes, disabling PCID"); + setup_clear_cpu_cap(X86_FEATURE_PCID); + return; +diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c +index f66701514d90..a261d7300951 100644 +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -227,10 +227,10 @@ static void amd_pstate_ut_check_freq(u32 index) + goto skip_test; + } + +- if (cpudata->min_freq != policy->min) { ++ if (cpudata->lowest_nonlinear_freq != policy->min) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; +- pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n", +- __func__, cpu, cpudata->min_freq, policy->min); ++ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n", ++ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min); + goto skip_test; + } + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 91d3c3b1c2d3..66e5dfc711c0 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -233,7 +233,7 @@ static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) + return index; + } + +-static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + u32 des_perf, u32 max_perf, bool fast_switch) + { + if (fast_switch) +@@ -243,7 +243,7 @@ static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + READ_ONCE(cpudata->cppc_req_cached)); + } + +-DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf); ++DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); + + static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, +@@ -306,11 +306,17 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, + return ret; + } + +-static inline int pstate_enable(bool enable) ++static inline int msr_cppc_enable(bool enable) + { + int ret, cpu; + unsigned long logical_proc_id_mask = 0; + ++ /* ++ * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared. 
++ */ ++ if (!enable) ++ return 0; ++ + if (enable == cppc_enabled) + return 0; + +@@ -332,7 +338,7 @@ static inline int pstate_enable(bool enable) + return 0; + } + +-static int cppc_enable(bool enable) ++static int shmem_cppc_enable(bool enable) + { + int cpu, ret = 0; + struct cppc_perf_ctrls perf_ctrls; +@@ -359,24 +365,28 @@ static int cppc_enable(bool enable) + return ret; + } + +-DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable); ++DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable); + +-static inline int amd_pstate_enable(bool enable) ++static inline int amd_pstate_cppc_enable(bool enable) + { +- return static_call(amd_pstate_enable)(enable); ++ return static_call(amd_pstate_cppc_enable)(enable); + } + +-static int pstate_init_perf(struct amd_cpudata *cpudata) ++static int msr_init_perf(struct amd_cpudata *cpudata) + { +- u64 cap1; ++ u64 cap1, numerator; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); + if (ret) + return ret; + +- WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); +- WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); ++ if (ret) ++ return ret; ++ ++ WRITE_ONCE(cpudata->highest_perf, numerator); ++ WRITE_ONCE(cpudata->max_limit_perf, numerator); + WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); +@@ -385,16 +395,21 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) + return 0; + } + +-static int cppc_init_perf(struct amd_cpudata *cpudata) ++static int shmem_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; ++ u64 numerator; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + +- WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); +- WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); ++ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); ++ if (ret) ++ return ret; ++ ++ WRITE_ONCE(cpudata->highest_perf, numerator); ++ WRITE_ONCE(cpudata->max_limit_perf, numerator); + WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, + cppc_perf.lowest_nonlinear_perf); +@@ -420,14 +435,14 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + return ret; + } + +-DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); ++DEFINE_STATIC_CALL(amd_pstate_init_perf, msr_init_perf); + + static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) + { + return static_call(amd_pstate_init_perf)(cpudata); + } + +-static void cppc_update_perf(struct amd_cpudata *cpudata, ++static void shmem_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, + u32 max_perf, bool fast_switch) + { +@@ -527,25 +542,41 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + cpufreq_cpu_put(policy); + } + +-static int amd_pstate_verify(struct cpufreq_policy_data *policy) ++static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) + { +- cpufreq_verify_within_cpu_limits(policy); ++ /* ++ * Initialize lower frequency limit (i.e.policy->min) with ++ * lowest_nonlinear_frequency which is the most energy efficient ++ * frequency. Override the initial value set by cpufreq core and ++ * amd-pstate qos_requests. 
++ */ ++ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { ++ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu); ++ struct amd_cpudata *cpudata; ++ ++ if (!policy) ++ return -EINVAL; ++ ++ cpudata = policy->driver_data; ++ policy_data->min = cpudata->lowest_nonlinear_freq; ++ cpufreq_cpu_put(policy); ++ } ++ ++ cpufreq_verify_within_cpu_limits(policy_data); ++ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min); + + return 0; + } + + static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf; ++ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq; + struct amd_cpudata *cpudata = policy->driver_data; + +- if (cpudata->boost_supported && !policy->boost_enabled) +- max_perf = READ_ONCE(cpudata->nominal_perf); +- else +- max_perf = READ_ONCE(cpudata->highest_perf); +- +- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); +- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); ++ max_perf = READ_ONCE(cpudata->highest_perf); ++ max_freq = READ_ONCE(cpudata->max_freq); ++ max_limit_perf = div_u64(policy->max * max_perf, max_freq); ++ min_limit_perf = div_u64(policy->min * max_perf, max_freq); + + lowest_perf = READ_ONCE(cpudata->lowest_perf); + if (min_limit_perf < lowest_perf) +@@ -825,7 +856,7 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) + + transition_delay_ns = cppc_get_transition_latency(cpu); + if (transition_delay_ns == CPUFREQ_ETERNAL) { +- if (cpu_feature_enabled(X86_FEATURE_FAST_CPPC)) ++ if (cpu_feature_enabled(X86_FEATURE_AMD_FAST_CPPC)) + return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY; + else + return AMD_PSTATE_TRANSITION_DELAY; +@@ -864,7 +895,6 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + { + int ret; + u32 min_freq, max_freq; +- u64 numerator; + u32 nominal_perf, nominal_freq; + u32 lowest_nonlinear_perf, lowest_nonlinear_freq; + u32 boost_ratio, lowest_nonlinear_ratio; +@@ -886,10 +916,7 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + + nominal_perf = READ_ONCE(cpudata->nominal_perf); + +- ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); +- if (ret) +- return ret; +- boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf); ++ boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); + max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); +@@ -979,7 +1006,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + policy->fast_switch_possible = true; + + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], +- FREQ_QOS_MIN, policy->cpuinfo.min_freq); ++ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); + if (ret < 0) { + dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); + goto free_cpudata1; +@@ -1023,7 +1050,7 @@ static int amd_pstate_cpu_resume(struct cpufreq_policy *policy) + { + int ret; + +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) + pr_err("failed to enable amd-pstate during resume, return %d\n", ret); + +@@ -1034,7 +1061,7 @@ static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy) + { + int ret; + +- ret = amd_pstate_enable(false); ++ ret = amd_pstate_cppc_enable(false); + if (ret) + pr_err("failed to disable amd-pstate during suspend, return %d\n", ret); + +@@ -1167,25 +1194,41 @@ static ssize_t 
show_energy_performance_preference( + + static void amd_pstate_driver_cleanup(void) + { +- amd_pstate_enable(false); ++ amd_pstate_cppc_enable(false); + cppc_state = AMD_PSTATE_DISABLE; + current_pstate_driver = NULL; + } + ++static int amd_pstate_set_driver(int mode_idx) ++{ ++ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { ++ cppc_state = mode_idx; ++ if (cppc_state == AMD_PSTATE_DISABLE) ++ pr_info("driver is explicitly disabled\n"); ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ ++ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ + static int amd_pstate_register_driver(int mode) + { + int ret; + +- if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- else if (mode == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- else +- return -EINVAL; ++ ret = amd_pstate_set_driver(mode); ++ if (ret) ++ return ret; + + cppc_state = mode; + +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) { + pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", + ret); +@@ -1463,6 +1506,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } + ++ current_pstate_driver->adjust_perf = NULL; ++ + return 0; + + free_cpudata1: +@@ -1485,26 +1530,13 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) + static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u32 max_perf, min_perf, min_limit_perf, max_limit_perf; ++ u32 max_perf, min_perf; + u64 value; + s16 epp; + +- if (cpudata->boost_supported && !policy->boost_enabled) +- max_perf = READ_ONCE(cpudata->nominal_perf); +- else +- max_perf = READ_ONCE(cpudata->highest_perf); ++ max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); +- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); +- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); +- +- if (min_limit_perf < min_perf) +- min_limit_perf = min_perf; +- +- if (max_limit_perf < min_limit_perf) +- max_limit_perf = min_limit_perf; +- +- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); +- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); ++ amd_pstate_update_min_max_limit(policy); + + max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, + cpudata->max_limit_perf); +@@ -1541,12 +1573,6 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + epp = 0; + +- /* Set initial EPP value */ +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- value &= ~GENMASK_ULL(31, 24); +- value |= (u64)epp << 24; +- } +- + WRITE_ONCE(cpudata->cppc_req_cached, value); + return amd_pstate_set_epp(cpudata, epp); + } +@@ -1583,7 +1609,7 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + u64 value, max_perf; + int ret; + +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) + pr_err("failed to enable amd pstate during resume, return %d\n", ret); + +@@ -1594,8 +1620,9 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.max_perf = max_perf; +- 
perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); + cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); ++ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + } + } + +@@ -1635,9 +1662,11 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy) + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.desired_perf = 0; ++ perf_ctrls.min_perf = min_perf; + perf_ctrls.max_perf = min_perf; +- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); + cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); ++ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + } + mutex_unlock(&amd_pstate_limits_lock); + } +@@ -1657,13 +1686,6 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + return 0; + } + +-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) +-{ +- cpufreq_verify_within_cpu_limits(policy); +- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); +- return 0; +-} +- + static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +@@ -1677,7 +1699,7 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + cpudata->suspended = true; + + /* disable CPPC in lowlevel firmware */ +- ret = amd_pstate_enable(false); ++ ret = amd_pstate_cppc_enable(false); + if (ret) + pr_err("failed to suspend, return %d\n", ret); + +@@ -1719,7 +1741,7 @@ static struct cpufreq_driver amd_pstate_driver = { + + static struct cpufreq_driver amd_pstate_epp_driver = { + .flags = CPUFREQ_CONST_LOOPS, +- .verify = amd_pstate_epp_verify_policy, ++ .verify = amd_pstate_verify, + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, +@@ -1733,26 +1755,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { + .attr = amd_pstate_epp_attr, + }; + +-static int __init amd_pstate_set_driver(int mode_idx) +-{ +- if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { +- cppc_state = mode_idx; +- if (cppc_state == AMD_PSTATE_DISABLE) +- pr_info("driver is explicitly disabled\n"); +- +- if (cppc_state == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- +- if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- +- return 0; +- } +- +- return -EINVAL; +-} +- +-/** ++/* + * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F. + * show the debug message that helps to check if the CPU has CPPC support for loading issue. + */ +@@ -1842,10 +1845,10 @@ static int __init amd_pstate_init(void) + if (cppc_state == AMD_PSTATE_UNDEFINED) { + /* Disable on the following configs by default: + * 1. Undefined platforms +- * 2. Server platforms ++ * 2. Server platforms with CPUs older than Family 0x1A. 
+ */ + if (amd_pstate_acpi_pm_profile_undefined() || +- amd_pstate_acpi_pm_profile_server()) { ++ (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + } +@@ -1853,31 +1856,19 @@ static int __init amd_pstate_init(void) + cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE; + } + +- switch (cppc_state) { +- case AMD_PSTATE_DISABLE: ++ if (cppc_state == AMD_PSTATE_DISABLE) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; +- case AMD_PSTATE_PASSIVE: +- case AMD_PSTATE_ACTIVE: +- case AMD_PSTATE_GUIDED: +- ret = amd_pstate_set_driver(cppc_state); +- if (ret) +- return ret; +- break; +- default: +- return -EINVAL; + } + + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +- if (cppc_state != AMD_PSTATE_ACTIVE) +- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + } else { + pr_debug("AMD CPPC shared memory based functionality is supported\n"); +- static_call_update(amd_pstate_enable, cppc_enable); +- static_call_update(amd_pstate_init_perf, cppc_init_perf); +- static_call_update(amd_pstate_update_perf, cppc_update_perf); ++ static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable); ++ static_call_update(amd_pstate_init_perf, shmem_init_perf); ++ static_call_update(amd_pstate_update_perf, shmem_update_perf); + } + + if (amd_pstate_prefcore) { +@@ -1886,17 +1877,10 @@ static int __init amd_pstate_init(void) + return ret; + } + +- /* enable amd pstate feature */ +- ret = amd_pstate_enable(true); +- if (ret) { +- pr_err("failed to enable driver mode(%d)\n", cppc_state); +- return ret; +- } +- +- ret = cpufreq_register_driver(current_pstate_driver); ++ ret = amd_pstate_register_driver(cppc_state); + if (ret) { + pr_err("failed to register with return %d\n", ret); +- goto disable_driver; ++ return ret; + } + + dev_root = bus_get_dev_root(&cpu_subsys); +@@ -1913,8 +1897,7 @@ static int __init amd_pstate_init(void) + + global_attr_free: + cpufreq_unregister_driver(current_pstate_driver); +-disable_driver: +- amd_pstate_enable(false); ++ amd_pstate_cppc_enable(false); + return ret; + } + device_initcall(amd_pstate_init); +diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h +index dd4682857c12..23698d0f4bb4 100644 +--- a/tools/arch/x86/include/asm/cpufeatures.h ++++ b/tools/arch/x86/include/asm/cpufeatures.h +@@ -472,7 +472,7 @@ + #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ + #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ + #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +-#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ ++#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ + + /* + * BUG word(s) +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0004-bbr3.patch b/sys-kernel/gentoo-sources-6.12/0004-bbr3.patch new file mode 100644 index 0000000..5f6c27b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0004-bbr3.patch @@ -0,0 +1,3386 @@ +From d03dc7618d35c0c3e5ab7373cff2032a8c3ecf9f Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:50:32 +0100 +Subject: [PATCH 04/12] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + 
include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 9 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2230 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 16 files changed, 1940 insertions(+), 553 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 6a5e08b937b3..27aab715490e 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -369,7 +369,9 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ ++ unused:2; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c0deaafebfdc..d53f042d936e 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index d1948d357dad..7d99f0bec5f2 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -779,6 +781,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -884,6 +895,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -973,9 +989,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1088,6 +1109,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* 
loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1110,7 +1132,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1130,10 +1156,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1144,7 +1173,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1168,8 +1199,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. 
(optional) +@@ -1235,6 +1269,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1254,6 +1296,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1266,6 +1309,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2417,7 +2475,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index db7254d52d93..38de18d921ea 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -507,12 +507,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dbf896f3146c..4702cd2f1ffc 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 6d2c97f8e9ef..ddc116ef22cb 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 554804774628..2279e6e7bc9c 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,11 +280,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } + ++static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++} ++ + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) + { +@@ -315,7 +319,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, ++ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 4f77bd862e95..fd3a5551eda7 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3384,6 +3384,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4110,6 +4111,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..a180fa648d5e 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. 
++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. 
++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... 
*/ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... */ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. 
*/ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). ++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. 
This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did loss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss.
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. 
If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. 
If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
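/*
 * A standalone sketch (not the kernel code) of the plateau test described
 * above, with assumed fixed-point constants: BBR_SCALE = 8, a 25% growth
 * threshold (5/4 in BBR_UNIT units), and 3 non-app-limited rounds. All names
 * here are invented for the illustration.
 */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_SCALE		8
#define SKETCH_UNIT		(1u << SKETCH_SCALE)
#define SKETCH_FULL_BW_THRESH	(SKETCH_UNIT * 5 / 4)	/* "grew by >= 25%" */
#define SKETCH_FULL_BW_CNT	3			/* rounds w/o growth */

struct sketch_full_bw {
	uint32_t full_bw;	/* baseline bw from the last growth event */
	uint32_t full_bw_cnt;	/* consecutive rounds without ~25% growth */
	bool full_bw_now;
};

static void sketch_check_full_bw(struct sketch_full_bw *s, uint32_t sample_bw,
				 bool round_start, bool app_limited)
{
	uint64_t bw_thresh;

	if (s->full_bw_now || app_limited)
		return;
	bw_thresh = (uint64_t)s->full_bw * SKETCH_FULL_BW_THRESH >> SKETCH_SCALE;
	if (sample_bw >= bw_thresh) {	/* still growing: reset the baseline */
		s->full_bw = sample_bw;
		s->full_bw_cnt = 0;
		return;
	}
	if (!round_start)		/* count at most once per round trip */
		return;
	s->full_bw_now = ++s->full_bw_cnt >= SKETCH_FULL_BW_CNT;
}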
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; 
++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index 0306d257fa64..28f581c0dab7 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 2d844e1f867f..efb92e47a632 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1501,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3826,7 +3842,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3843,6 +3860,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3853,6 +3871,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3961,6 +3984,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4035,7 +4059,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4059,6 +4083,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4078,7 +4103,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5752,13 +5777,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index bb1fe1ba867a..050a80769de6 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -462,6 +462,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 8efc58716ce9..5798ce3db487 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -388,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1603,7 +1606,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1678,6 +1681,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2035,13 +2062,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2767,6 +2793,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2981,6 +3008,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 79064580c8c0..697270ce1ea6 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -690,6 +690,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0006-crypto.patch b/sys-kernel/gentoo-sources-6.12/0006-crypto.patch new file mode 100644 index 0000000..195db65 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0006-crypto.patch @@ -0,0 +1,1606 @@ +From 03450504df5c4fe2d2ba5981aff7a532ab1ebf17 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:51:01 +0100 +Subject: [PATCH 06/12] crypto + +Signed-off-by: Peter Jung +--- + arch/x86/crypto/Kconfig | 4 +- + arch/x86/crypto/aegis128-aesni-asm.S | 533 ++++++++-------------- + arch/x86/crypto/aegis128-aesni-glue.c | 145 +++--- + arch/x86/crypto/crc32c-intel_glue.c | 2 +- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 354 +++++--------- + 5 files changed, 387 insertions(+), 651 deletions(-) + +diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig +index 7b1bebed879d..3d2e38ba5240 100644 +--- a/arch/x86/crypto/Kconfig ++++ b/arch/x86/crypto/Kconfig +@@ -363,7 +363,7 @@ config CRYPTO_CHACHA20_X86_64 + - AVX-512VL (Advanced Vector Extensions-512VL) + + config CRYPTO_AEGIS128_AESNI_SSE2 +- tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)" ++ tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_SIMD +@@ -372,7 +372,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) +- - SSE2 (Streaming SIMD Extensions 2) ++ - SSE4.1 (Streaming SIMD Extensions 4.1) + + config 
CRYPTO_NHPOLY1305_SSE2 + tristate "Hash functions: NHPoly1305 (SSE2)" +diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S +index 2de859173940..7294dc0ee7ba 100644 +--- a/arch/x86/crypto/aegis128-aesni-asm.S ++++ b/arch/x86/crypto/aegis128-aesni-asm.S +@@ -1,14 +1,13 @@ + /* SPDX-License-Identifier: GPL-2.0-only */ + /* +- * AES-NI + SSE2 implementation of AEGIS-128 ++ * AES-NI + SSE4.1 implementation of AEGIS-128 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. ++ * Copyright 2024 Google LLC + */ + + #include +-#include +-#include + + #define STATE0 %xmm0 + #define STATE1 %xmm1 +@@ -20,11 +19,6 @@ + #define T0 %xmm6 + #define T1 %xmm7 + +-#define STATEP %rdi +-#define LEN %esi +-#define SRC %rdx +-#define DST %rcx +- + .section .rodata.cst16.aegis128_const, "aM", @progbits, 32 + .align 16 + .Laegis128_const_0: +@@ -34,11 +28,11 @@ + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +-.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 +-.align 16 +-.Laegis128_counter: +- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f ++.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32 ++.align 32 ++.Lzeropad_mask: ++ .octa 0xffffffffffffffffffffffffffffffff ++ .octa 0 + + .text + +@@ -61,140 +55,102 @@ + .endm + + /* +- * __load_partial: internal ABI +- * input: +- * LEN - bytes +- * SRC - src +- * output: +- * MSG - message block +- * changed: +- * T0 +- * %r8 +- * %r9 ++ * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register ++ * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8. + */ +-SYM_FUNC_START_LOCAL(__load_partial) +- xor %r9d, %r9d +- pxor MSG, MSG +- +- mov LEN, %r8d +- and $0x1, %r8 +- jz .Lld_partial_1 +- +- mov LEN, %r8d +- and $0x1E, %r8 +- add SRC, %r8 +- mov (%r8), %r9b +- +-.Lld_partial_1: +- mov LEN, %r8d +- and $0x2, %r8 +- jz .Lld_partial_2 +- +- mov LEN, %r8d +- and $0x1C, %r8 +- add SRC, %r8 +- shl $0x10, %r9 +- mov (%r8), %r9w +- +-.Lld_partial_2: +- mov LEN, %r8d +- and $0x4, %r8 +- jz .Lld_partial_4 +- +- mov LEN, %r8d +- and $0x18, %r8 +- add SRC, %r8 +- shl $32, %r9 +- mov (%r8), %r8d +- xor %r8, %r9 +- +-.Lld_partial_4: +- movq %r9, MSG +- +- mov LEN, %r8d +- and $0x8, %r8 +- jz .Lld_partial_8 +- +- mov LEN, %r8d +- and $0x10, %r8 +- add SRC, %r8 +- pslldq $8, MSG +- movq (%r8), T0 +- pxor T0, MSG +- +-.Lld_partial_8: +- RET +-SYM_FUNC_END(__load_partial) ++.macro load_partial ++ sub $8, %ecx /* LEN - 8 */ ++ jle .Lle8\@ ++ ++ /* Load 9 <= LEN <= 15 bytes: */ ++ movq (SRC), MSG /* Load first 8 bytes */ ++ mov (SRC, %rcx), %rax /* Load last 8 bytes */ ++ neg %ecx ++ shl $3, %ecx ++ shr %cl, %rax /* Discard overlapping bytes */ ++ pinsrq $1, %rax, MSG ++ jmp .Ldone\@ ++ ++.Lle8\@: ++ add $4, %ecx /* LEN - 4 */ ++ jl .Llt4\@ ++ ++ /* Load 4 <= LEN <= 8 bytes: */ ++ mov (SRC), %eax /* Load first 4 bytes */ ++ mov (SRC, %rcx), %r8d /* Load last 4 bytes */ ++ jmp .Lcombine\@ ++ ++.Llt4\@: ++ /* Load 1 <= LEN <= 3 bytes: */ ++ add $2, %ecx /* LEN - 2 */ ++ movzbl (SRC), %eax /* Load first byte */ ++ jl .Lmovq\@ ++ movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */ ++.Lcombine\@: ++ shl $3, %ecx ++ shl %cl, %r8 ++ or %r8, %rax /* Combine the two parts */ ++.Lmovq\@: ++ movq %rax, MSG ++.Ldone\@: ++.endm + + /* +- * __store_partial: internal ABI +- * input: +- * LEN - bytes +- * DST - dst +- * output: +- * T0 - 
message block +- * changed: +- * %r8 +- * %r9 +- * %r10 ++ * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer ++ * DST. Clobbers %rax, %rcx, and %r8. + */ +-SYM_FUNC_START_LOCAL(__store_partial) +- mov LEN, %r8d +- mov DST, %r9 +- +- movq T0, %r10 +- +- cmp $8, %r8 +- jl .Lst_partial_8 +- +- mov %r10, (%r9) +- psrldq $8, T0 +- movq T0, %r10 +- +- sub $8, %r8 +- add $8, %r9 +- +-.Lst_partial_8: +- cmp $4, %r8 +- jl .Lst_partial_4 +- +- mov %r10d, (%r9) +- shr $32, %r10 +- +- sub $4, %r8 +- add $4, %r9 +- +-.Lst_partial_4: +- cmp $2, %r8 +- jl .Lst_partial_2 +- +- mov %r10w, (%r9) +- shr $0x10, %r10 +- +- sub $2, %r8 +- add $2, %r9 +- +-.Lst_partial_2: +- cmp $1, %r8 +- jl .Lst_partial_1 +- +- mov %r10b, (%r9) +- +-.Lst_partial_1: +- RET +-SYM_FUNC_END(__store_partial) ++.macro store_partial msg ++ sub $8, %ecx /* LEN - 8 */ ++ jl .Llt8\@ ++ ++ /* Store 8 <= LEN <= 15 bytes: */ ++ pextrq $1, \msg, %rax ++ mov %ecx, %r8d ++ shl $3, %ecx ++ ror %cl, %rax ++ mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */ ++ movq \msg, (DST) /* Store first 8 bytes */ ++ jmp .Ldone\@ ++ ++.Llt8\@: ++ add $4, %ecx /* LEN - 4 */ ++ jl .Llt4\@ ++ ++ /* Store 4 <= LEN <= 7 bytes: */ ++ pextrd $1, \msg, %eax ++ mov %ecx, %r8d ++ shl $3, %ecx ++ ror %cl, %eax ++ mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */ ++ movd \msg, (DST) /* Store first 4 bytes */ ++ jmp .Ldone\@ ++ ++.Llt4\@: ++ /* Store 1 <= LEN <= 3 bytes: */ ++ pextrb $0, \msg, 0(DST) ++ cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? */ ++ jl .Ldone\@ ++ pextrb $1, \msg, 1(DST) ++ je .Ldone\@ ++ pextrb $2, \msg, 2(DST) ++.Ldone\@: ++.endm + + /* +- * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); ++ * void aegis128_aesni_init(struct aegis_state *state, ++ * const struct aegis_block *key, ++ * const u8 iv[AEGIS128_NONCE_SIZE]); + */ +-SYM_FUNC_START(crypto_aegis128_aesni_init) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_init) ++ .set STATEP, %rdi ++ .set KEYP, %rsi ++ .set IVP, %rdx + + /* load IV: */ +- movdqu (%rdx), T1 ++ movdqu (IVP), T1 + + /* load key: */ +- movdqa (%rsi), KEY ++ movdqa (KEYP), KEY + pxor KEY, T1 + movdqa T1, STATE0 + movdqa KEY, STATE3 +@@ -224,20 +180,22 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_init) ++SYM_FUNC_END(aegis128_aesni_init) + + /* +- * void crypto_aegis128_aesni_ad(void *state, unsigned int length, +- * const void *data); ++ * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, ++ * unsigned int len); ++ * ++ * len must be a multiple of 16. 
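/*
 * A rough C equivalent (little-endian, not from the patch) of the
 * overlapping-read trick the load_partial macro above uses for the
 * 9 <= LEN <= 15 case: read the first and last 8 bytes, shift the tail to
 * discard the bytes that overlap the head, and combine. Function and
 * variable names here are made up for illustration.
 */
#include <stdint.h>
#include <string.h>

static void sketch_load_partial_9_to_15(const uint8_t *src, unsigned int len,
					 uint64_t out[2])
{
	uint64_t head, tail;

	memcpy(&head, src, 8);			/* bytes 0..7 */
	memcpy(&tail, src + len - 8, 8);	/* bytes len-8..len-1 */
	tail >>= 8 * (16 - len);		/* drop bytes already in head */
	out[0] = head;				/* message block, low half */
	out[1] = tail;				/* high half, zero padded */
}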
+ */ +-SYM_FUNC_START(crypto_aegis128_aesni_ad) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_ad) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set LEN, %edx + +- cmp $0x10, LEN +- jb .Lad_out ++ test LEN, LEN ++ jz .Lad_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -246,89 +204,40 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- and $0xF, %r8 +- jnz .Lad_u_loop +- +-.align 8 +-.Lad_a_loop: +- movdqa 0x00(SRC), MSG +- aegis128_update +- pxor MSG, STATE4 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_1 +- +- movdqa 0x10(SRC), MSG +- aegis128_update +- pxor MSG, STATE3 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_2 +- +- movdqa 0x20(SRC), MSG +- aegis128_update +- pxor MSG, STATE2 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_3 +- +- movdqa 0x30(SRC), MSG +- aegis128_update +- pxor MSG, STATE1 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_4 +- +- movdqa 0x40(SRC), MSG +- aegis128_update +- pxor MSG, STATE0 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_0 +- +- add $0x50, SRC +- jmp .Lad_a_loop +- + .align 8 +-.Lad_u_loop: ++.Lad_loop: + movdqu 0x00(SRC), MSG + aegis128_update + pxor MSG, STATE4 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_1 ++ jz .Lad_out_1 + + movdqu 0x10(SRC), MSG + aegis128_update + pxor MSG, STATE3 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_2 ++ jz .Lad_out_2 + + movdqu 0x20(SRC), MSG + aegis128_update + pxor MSG, STATE2 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_3 ++ jz .Lad_out_3 + + movdqu 0x30(SRC), MSG + aegis128_update + pxor MSG, STATE1 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_4 ++ jz .Lad_out_4 + + movdqu 0x40(SRC), MSG + aegis128_update + pxor MSG, STATE0 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_0 ++ jz .Lad_out_0 + + add $0x50, SRC +- jmp .Lad_u_loop ++ jmp .Lad_loop + + /* store the state: */ + .Lad_out_0: +@@ -337,7 +246,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_1: +@@ -346,7 +254,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_2: +@@ -355,7 +262,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_3: +@@ -364,7 +270,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_4: +@@ -373,41 +278,38 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) +- FRAME_END +- RET +- + .Lad_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_ad) ++SYM_FUNC_END(aegis128_aesni_ad) + +-.macro encrypt_block a s0 s1 s2 s3 s4 i +- movdq\a (\i * 0x10)(SRC), MSG ++.macro encrypt_block s0 s1 s2 s3 s4 i ++ movdqu (\i * 0x10)(SRC), MSG + movdqa MSG, T0 + pxor \s1, T0 + pxor \s4, T0 + movdqa \s2, T1 + pand \s3, T1 + pxor T1, T0 +- movdq\a T0, (\i * 0x10)(DST) ++ movdqu T0, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lenc_out_\i ++ jz .Lenc_out_\i + .endm + + /* +- * void crypto_aegis128_aesni_enc(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_enc(struct aegis_state *state, const u8 
*src, u8 *dst, ++ * unsigned int len); ++ * ++ * len must be nonzero and a multiple of 16. + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) +- FRAME_BEGIN +- +- cmp $0x10, LEN +- jb .Lenc_out ++SYM_FUNC_START(aegis128_aesni_enc) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -416,34 +318,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- or DST, %r8 +- and $0xF, %r8 +- jnz .Lenc_u_loop +- + .align 8 +-.Lenc_a_loop: +- encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 +- encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 +- encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 +- encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 +- encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 ++.Lenc_loop: ++ encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 ++ encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 ++ encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 ++ encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 ++ encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST +- jmp .Lenc_a_loop +- +-.align 8 +-.Lenc_u_loop: +- encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 +- encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 +- encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 +- encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 +- encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 +- +- add $0x50, SRC +- add $0x50, DST +- jmp .Lenc_u_loop ++ jmp .Lenc_loop + + /* store the state: */ + .Lenc_out_0: +@@ -452,7 +337,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_1: +@@ -461,7 +345,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_2: +@@ -470,7 +353,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_3: +@@ -479,7 +361,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_4: +@@ -488,20 +369,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- FRAME_END +- RET +- + .Lenc_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_enc) ++SYM_FUNC_END(aegis128_aesni_enc) + + /* +- * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, ++ * u8 *dst, unsigned int len); + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_enc_tail) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -511,7 +391,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + movdqu 0x40(STATEP), STATE4 + + /* encrypt message: */ +- call __load_partial ++ mov LEN, %r9d ++ load_partial + + movdqa MSG, T0 + pxor STATE1, T0 +@@ -520,7 +401,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + pand 
STATE3, T1 + pxor T1, T0 + +- call __store_partial ++ mov %r9d, LEN ++ store_partial T0 + + aegis128_update + pxor MSG, STATE4 +@@ -531,37 +413,36 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) ++SYM_FUNC_END(aegis128_aesni_enc_tail) + +-.macro decrypt_block a s0 s1 s2 s3 s4 i +- movdq\a (\i * 0x10)(SRC), MSG ++.macro decrypt_block s0 s1 s2 s3 s4 i ++ movdqu (\i * 0x10)(SRC), MSG + pxor \s1, MSG + pxor \s4, MSG + movdqa \s2, T1 + pand \s3, T1 + pxor T1, MSG +- movdq\a MSG, (\i * 0x10)(DST) ++ movdqu MSG, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Ldec_out_\i ++ jz .Ldec_out_\i + .endm + + /* +- * void crypto_aegis128_aesni_dec(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, ++ * unsigned int len); ++ * ++ * len must be nonzero and a multiple of 16. + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) +- FRAME_BEGIN +- +- cmp $0x10, LEN +- jb .Ldec_out ++SYM_FUNC_START(aegis128_aesni_dec) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -570,34 +451,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- or DST, %r8 +- and $0xF, %r8 +- jnz .Ldec_u_loop +- + .align 8 +-.Ldec_a_loop: +- decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 +- decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 +- decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 +- decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 +- decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 ++.Ldec_loop: ++ decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 ++ decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 ++ decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 ++ decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 ++ decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST +- jmp .Ldec_a_loop +- +-.align 8 +-.Ldec_u_loop: +- decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 +- decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 +- decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 +- decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 +- decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 +- +- add $0x50, SRC +- add $0x50, DST +- jmp .Ldec_u_loop ++ jmp .Ldec_loop + + /* store the state: */ + .Ldec_out_0: +@@ -606,7 +470,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_1: +@@ -615,7 +478,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_2: +@@ -624,7 +486,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_3: +@@ -633,7 +494,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_4: +@@ -642,20 +502,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + 
movdqu STATE4, 0x40(STATEP) +- FRAME_END +- RET +- + .Ldec_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_dec) ++SYM_FUNC_END(aegis128_aesni_dec) + + /* +- * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, ++ * u8 *dst, unsigned int len); + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_dec_tail) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -665,7 +524,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + movdqu 0x40(STATEP), STATE4 + + /* decrypt message: */ +- call __load_partial ++ mov LEN, %r9d ++ load_partial + + pxor STATE1, MSG + pxor STATE4, MSG +@@ -673,17 +533,13 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + pand STATE3, T1 + pxor T1, MSG + +- movdqa MSG, T0 +- call __store_partial ++ mov %r9d, LEN ++ store_partial MSG + + /* mask with byte count: */ +- movd LEN, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- movdqa .Laegis128_counter(%rip), T1 +- pcmpgtb T1, T0 ++ lea .Lzeropad_mask+16(%rip), %rax ++ sub %r9, %rax ++ movdqu (%rax), T0 + pand T0, MSG + + aegis128_update +@@ -695,18 +551,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) ++SYM_FUNC_END(aegis128_aesni_dec_tail) + + /* +- * void crypto_aegis128_aesni_final(void *state, void *tag_xor, +- * unsigned int assoclen, +- * unsigned int cryptlen); ++ * void aegis128_aesni_final(struct aegis_state *state, ++ * struct aegis_block *tag_xor, ++ * unsigned int assoclen, unsigned int cryptlen); + */ +-SYM_FUNC_START(crypto_aegis128_aesni_final) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_final) ++ .set STATEP, %rdi ++ .set TAG_XOR, %rsi ++ .set ASSOCLEN, %edx ++ .set CRYPTLEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -716,10 +573,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + movdqu 0x40(STATEP), STATE4 + + /* prepare length block: */ +- movd %edx, MSG +- movd %ecx, T0 +- pslldq $8, T0 +- pxor T0, MSG ++ movd ASSOCLEN, MSG ++ pinsrd $2, CRYPTLEN, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + pxor STATE3, MSG +@@ -734,7 +589,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + aegis128_update; pxor MSG, STATE3 + + /* xor tag: */ +- movdqu (%rsi), MSG ++ movdqu (TAG_XOR), MSG + + pxor STATE0, MSG + pxor STATE1, MSG +@@ -742,8 +597,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + pxor STATE3, MSG + pxor STATE4, MSG + +- movdqu MSG, (%rsi) +- +- FRAME_END ++ movdqu MSG, (TAG_XOR) + RET +-SYM_FUNC_END(crypto_aegis128_aesni_final) ++SYM_FUNC_END(aegis128_aesni_final) +diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c +index 4623189000d8..c19d8e3d96a3 100644 +--- a/arch/x86/crypto/aegis128-aesni-glue.c ++++ b/arch/x86/crypto/aegis128-aesni-glue.c +@@ -1,7 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0-or-later + /* + * The AEGIS-128 Authenticated-Encryption Algorithm +- * Glue for AES-NI + SSE2 implementation ++ * Glue for AES-NI + SSE4.1 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
+@@ -23,27 +23,6 @@ + #define AEGIS128_MIN_AUTH_SIZE 8 + #define AEGIS128_MAX_AUTH_SIZE 16 + +-asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); +- +-asmlinkage void crypto_aegis128_aesni_ad( +- void *state, unsigned int length, const void *data); +- +-asmlinkage void crypto_aegis128_aesni_enc( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_dec( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_enc_tail( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_dec_tail( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_final( +- void *state, void *tag_xor, unsigned int cryptlen, +- unsigned int assoclen); +- + struct aegis_block { + u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); + }; +@@ -56,15 +35,31 @@ struct aegis_ctx { + struct aegis_block key; + }; + +-struct aegis_crypt_ops { +- int (*skcipher_walk_init)(struct skcipher_walk *walk, +- struct aead_request *req, bool atomic); ++asmlinkage void aegis128_aesni_init(struct aegis_state *state, ++ const struct aegis_block *key, ++ const u8 iv[AEGIS128_NONCE_SIZE]); + +- void (*crypt_blocks)(void *state, unsigned int length, const void *src, +- void *dst); +- void (*crypt_tail)(void *state, unsigned int length, const void *src, +- void *dst); +-}; ++asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, ++ u8 *dst, unsigned int len); ++ ++asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, ++ u8 *dst, unsigned int len); ++ ++asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, ++ const u8 *src, u8 *dst, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, ++ const u8 *src, u8 *dst, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_final(struct aegis_state *state, ++ struct aegis_block *tag_xor, ++ unsigned int assoclen, ++ unsigned int cryptlen); + + static void crypto_aegis128_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, +@@ -85,16 +80,15 @@ static void crypto_aegis128_aesni_process_ad( + if (pos > 0) { + unsigned int fill = AEGIS128_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); +- crypto_aegis128_aesni_ad(state, +- AEGIS128_BLOCK_SIZE, +- buf.bytes); ++ aegis128_aesni_ad(state, buf.bytes, ++ AEGIS128_BLOCK_SIZE); + pos = 0; + left -= fill; + src += fill; + } + +- crypto_aegis128_aesni_ad(state, left, src); +- ++ aegis128_aesni_ad(state, src, ++ left & ~(AEGIS128_BLOCK_SIZE - 1)); + src += left & ~(AEGIS128_BLOCK_SIZE - 1); + left &= AEGIS128_BLOCK_SIZE - 1; + } +@@ -110,24 +104,37 @@ static void crypto_aegis128_aesni_process_ad( + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); +- crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); ++ aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); + } + } + +-static void crypto_aegis128_aesni_process_crypt( +- struct aegis_state *state, struct skcipher_walk *walk, +- const struct aegis_crypt_ops *ops) ++static __always_inline void ++crypto_aegis128_aesni_process_crypt(struct aegis_state *state, ++ struct skcipher_walk *walk, bool enc) + { + while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { +- ops->crypt_blocks(state, +- 
round_down(walk->nbytes, AEGIS128_BLOCK_SIZE), +- walk->src.virt.addr, walk->dst.virt.addr); ++ if (enc) ++ aegis128_aesni_enc(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ round_down(walk->nbytes, ++ AEGIS128_BLOCK_SIZE)); ++ else ++ aegis128_aesni_dec(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ round_down(walk->nbytes, ++ AEGIS128_BLOCK_SIZE)); + skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); + } + + if (walk->nbytes) { +- ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, +- walk->dst.virt.addr); ++ if (enc) ++ aegis128_aesni_enc_tail(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ walk->nbytes); ++ else ++ aegis128_aesni_dec_tail(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ walk->nbytes); + skcipher_walk_done(walk, 0); + } + } +@@ -162,42 +169,39 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, + return 0; + } + +-static void crypto_aegis128_aesni_crypt(struct aead_request *req, +- struct aegis_block *tag_xor, +- unsigned int cryptlen, +- const struct aegis_crypt_ops *ops) ++static __always_inline void ++crypto_aegis128_aesni_crypt(struct aead_request *req, ++ struct aegis_block *tag_xor, ++ unsigned int cryptlen, bool enc) + { + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); + struct skcipher_walk walk; + struct aegis_state state; + +- ops->skcipher_walk_init(&walk, req, true); ++ if (enc) ++ skcipher_walk_aead_encrypt(&walk, req, true); ++ else ++ skcipher_walk_aead_decrypt(&walk, req, true); + + kernel_fpu_begin(); + +- crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); ++ aegis128_aesni_init(&state, &ctx->key, req->iv); + crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); +- crypto_aegis128_aesni_process_crypt(&state, &walk, ops); +- crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); ++ crypto_aegis128_aesni_process_crypt(&state, &walk, enc); ++ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); + } + + static int crypto_aegis128_aesni_encrypt(struct aead_request *req) + { +- static const struct aegis_crypt_ops OPS = { +- .skcipher_walk_init = skcipher_walk_aead_encrypt, +- .crypt_blocks = crypto_aegis128_aesni_enc, +- .crypt_tail = crypto_aegis128_aesni_enc_tail, +- }; +- + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + +- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); ++ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); +@@ -208,12 +212,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) + { + static const struct aegis_block zeros = {}; + +- static const struct aegis_crypt_ops OPS = { +- .skcipher_walk_init = skcipher_walk_aead_decrypt, +- .crypt_blocks = crypto_aegis128_aesni_dec, +- .crypt_tail = crypto_aegis128_aesni_dec_tail, +- }; +- + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); +@@ -222,27 +220,16 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + +- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); ++ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); + + return crypto_memneq(tag.bytes, 
zeros.bytes, authsize) ? -EBADMSG : 0; + } + +-static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) +-{ +- return 0; +-} +- +-static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +-{ +-} +- + static struct aead_alg crypto_aegis128_aesni_alg = { + .setkey = crypto_aegis128_aesni_setkey, + .setauthsize = crypto_aegis128_aesni_setauthsize, + .encrypt = crypto_aegis128_aesni_encrypt, + .decrypt = crypto_aegis128_aesni_decrypt, +- .init = crypto_aegis128_aesni_init_tfm, +- .exit = crypto_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, +@@ -267,7 +254,7 @@ static struct simd_aead_alg *simd_alg; + + static int __init crypto_aegis128_aesni_module_init(void) + { +- if (!boot_cpu_has(X86_FEATURE_XMM2) || ++ if (!boot_cpu_has(X86_FEATURE_XMM4_1) || + !boot_cpu_has(X86_FEATURE_AES) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) + return -ENODEV; +@@ -286,6 +273,6 @@ module_exit(crypto_aegis128_aesni_module_exit); + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Ondrej Mosnacek "); +-MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); ++MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation"); + MODULE_ALIAS_CRYPTO("aegis128"); + MODULE_ALIAS_CRYPTO("aegis128-aesni"); +diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c +index feccb5254c7e..52c5d47ef5a1 100644 +--- a/arch/x86/crypto/crc32c-intel_glue.c ++++ b/arch/x86/crypto/crc32c-intel_glue.c +@@ -41,7 +41,7 @@ + */ + #define CRC32C_PCL_BREAKEVEN 512 + +-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, ++asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, + unsigned int crc_init); + #endif /* CONFIG_X86_64 */ + +diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +index bbcff1fb78cb..752812bc4991 100644 +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -7,6 +7,7 @@ + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. ++ * Copyright 2024 Google LLC + * + * Authors: + * Wajdi Feghali +@@ -44,185 +45,129 @@ + */ + + #include +-#include + + ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +-.macro LABEL prefix n +-.L\prefix\n\(): +-.endm +- +-.macro JMPTBL_ENTRY i +-.quad .Lcrc_\i +-.endm +- +-.macro JNC_LESS_THAN j +- jnc .Lless_than_\j +-.endm +- +-# Define threshold where buffers are considered "small" and routed to more +-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +-# SMALL_SIZE can be no larger than 255. +- ++# Define threshold below which buffers are considered "small" and routed to ++# regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 + +-.if (SMALL_SIZE > 255) +-.error "SMALL_ SIZE must be < 256" +-.endif +- +-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); ++# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); + + .text + SYM_FUNC_START(crc_pcl) +-#define bufp rdi +-#define bufp_dw %edi +-#define bufp_w %di +-#define bufp_b %dil +-#define bufptmp %rcx +-#define block_0 %rcx +-#define block_1 %rdx +-#define block_2 %r11 +-#define len %rsi +-#define len_dw %esi +-#define len_w %si +-#define len_b %sil +-#define crc_init_arg %rdx +-#define tmp %rbx +-#define crc_init %r8 +-#define crc_init_dw %r8d +-#define crc1 %r9 +-#define crc2 %r10 +- +- pushq %rbx +- pushq %rdi +- pushq %rsi +- +- ## Move crc_init for Linux to a different +- mov crc_init_arg, crc_init ++#define bufp %rdi ++#define bufp_d %edi ++#define len %esi ++#define crc_init %edx ++#define crc_init_q %rdx ++#define n_misaligned %ecx /* overlaps chunk_bytes! */ ++#define n_misaligned_q %rcx ++#define chunk_bytes %ecx /* overlaps n_misaligned! */ ++#define chunk_bytes_q %rcx ++#define crc1 %r8 ++#define crc2 %r9 ++ ++ cmp $SMALL_SIZE, len ++ jb .Lsmall + + ################################################################ + ## 1) ALIGN: + ################################################################ +- +- mov %bufp, bufptmp # rdi = *buf +- neg %bufp +- and $7, %bufp # calculate the unalignment amount of ++ mov bufp_d, n_misaligned ++ neg n_misaligned ++ and $7, n_misaligned # calculate the misalignment amount of + # the address +- je .Lproc_block # Skip if aligned +- +- ## If len is less than 8 and we're unaligned, we need to jump +- ## to special code to avoid reading beyond the end of the buffer +- cmp $8, len +- jae .Ldo_align +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $32-3+1, len_dw +- jmp .Lless_than_8_post_shl1 ++ je .Laligned # Skip if aligned + ++ # Process 1 <= n_misaligned <= 7 bytes individually in order to align ++ # the remaining data to an 8-byte boundary. 
+ .Ldo_align: +- #### Calculate CRC of unaligned bytes of the buffer (if any) +- movq (bufptmp), tmp # load a quadward from the buffer +- add %bufp, bufptmp # align buffer pointer for quadword +- # processing +- sub %bufp, len # update buffer length ++ movq (bufp), %rax ++ add n_misaligned_q, bufp ++ sub n_misaligned, len + .Lalign_loop: +- crc32b %bl, crc_init_dw # compute crc32 of 1-byte +- shr $8, tmp # get next byte +- dec %bufp ++ crc32b %al, crc_init # compute crc32 of 1-byte ++ shr $8, %rax # get next byte ++ dec n_misaligned + jne .Lalign_loop +- +-.Lproc_block: ++.Laligned: + + ################################################################ +- ## 2) PROCESS BLOCKS: ++ ## 2) PROCESS BLOCK: + ################################################################ + +- ## compute num of bytes to be processed +- movq len, tmp # save num bytes in tmp +- +- cmpq $128*24, len ++ cmp $128*24, len + jae .Lfull_block + +-.Lcontinue_block: +- cmpq $SMALL_SIZE, len +- jb .Lsmall +- +- ## len < 128*24 +- movq $2731, %rax # 2731 = ceil(2^16 / 24) +- mul len_dw +- shrq $16, %rax +- +- ## eax contains floor(bytes / 24) = num 24-byte chunks to do +- +- ## process rax 24-byte chunks (128 >= rax >= 0) +- +- ## compute end address of each block +- ## block 0 (base addr + RAX * 8) +- ## block 1 (base addr + RAX * 16) +- ## block 2 (base addr + RAX * 24) +- lea (bufptmp, %rax, 8), block_0 +- lea (block_0, %rax, 8), block_1 +- lea (block_1, %rax, 8), block_2 ++.Lpartial_block: ++ # Compute floor(len / 24) to get num qwords to process from each lane. ++ imul $2731, len, %eax # 2731 = ceil(2^16 / 24) ++ shr $16, %eax ++ jmp .Lcrc_3lanes + +- xor crc1, crc1 +- xor crc2, crc2 +- +- ## branch into array +- leaq jump_table(%rip), %bufp +- mov (%bufp,%rax,8), %bufp +- JMP_NOSPEC bufp +- +- ################################################################ +- ## 2a) PROCESS FULL BLOCKS: +- ################################################################ + .Lfull_block: +- movl $128,%eax +- lea 128*8*2(block_0), block_1 +- lea 128*8*3(block_0), block_2 +- add $128*8*1, block_0 +- +- xor crc1,crc1 +- xor crc2,crc2 +- +- # Fall through into top of crc array (crc_128) ++ # Processing 128 qwords from each lane. ++ mov $128, %eax + + ################################################################ +- ## 3) CRC Array: ++ ## 3) CRC each of three lanes: + ################################################################ + +- i=128 +-.rept 128-1 +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init +- crc32q -i*8(block_1), crc1 +- crc32q -i*8(block_2), crc2 +- i=(i-1) +-.endr +- +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init +- crc32q -i*8(block_1), crc1 +-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet +- +- mov block_2, block_0 ++.Lcrc_3lanes: ++ xor crc1,crc1 ++ xor crc2,crc2 ++ mov %eax, chunk_bytes ++ shl $3, chunk_bytes # num bytes to process from each lane ++ sub $5, %eax # 4 for 4x_loop, 1 for special last iter ++ jl .Lcrc_3lanes_4x_done ++ ++ # Unroll the loop by a factor of 4 to reduce the overhead of the loop ++ # bookkeeping instructions, which can compete with crc32q for the ALUs. 
++.Lcrc_3lanes_4x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ crc32q 8(bufp), crc_init_q ++ crc32q 8(bufp,chunk_bytes_q), crc1 ++ crc32q 8(bufp,chunk_bytes_q,2), crc2 ++ crc32q 16(bufp), crc_init_q ++ crc32q 16(bufp,chunk_bytes_q), crc1 ++ crc32q 16(bufp,chunk_bytes_q,2), crc2 ++ crc32q 24(bufp), crc_init_q ++ crc32q 24(bufp,chunk_bytes_q), crc1 ++ crc32q 24(bufp,chunk_bytes_q,2), crc2 ++ add $32, bufp ++ sub $4, %eax ++ jge .Lcrc_3lanes_4x_loop ++ ++.Lcrc_3lanes_4x_done: ++ add $4, %eax ++ jz .Lcrc_3lanes_last_qword ++ ++.Lcrc_3lanes_1x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ add $8, bufp ++ dec %eax ++ jnz .Lcrc_3lanes_1x_loop ++ ++.Lcrc_3lanes_last_qword: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet + + ################################################################ + ## 4) Combine three results: + ################################################################ + +- lea (K_table-8)(%rip), %bufp # first entry is for idx 1 +- shlq $3, %rax # rax *= 8 +- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 +- leal (%eax,%eax,2), %eax # rax *= 3 (total *24) +- subq %rax, tmp # tmp -= rax*24 ++ lea (K_table-8)(%rip), %rax # first entry is for idx 1 ++ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 ++ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 ++ sub %eax, len # len -= chunk_bytes * 3 + +- movq crc_init, %xmm1 # CRC for block 1 ++ movq crc_init_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 +@@ -230,103 +175,54 @@ LABEL crc_ %i + + pxor %xmm2,%xmm1 + movq %xmm1, %rax +- xor -i*8(block_2), %rax +- mov crc2, crc_init +- crc32 %rax, crc_init ++ xor (bufp,chunk_bytes_q,2), %rax ++ mov crc2, crc_init_q ++ crc32 %rax, crc_init_q ++ lea 8(bufp,chunk_bytes_q,2), bufp + + ################################################################ +- ## 5) Check for end: ++ ## 5) If more blocks remain, goto (2): + ################################################################ + +-LABEL crc_ 0 +- ENDBR +- mov tmp, len +- cmp $128*24, tmp +- jae .Lfull_block +- cmp $24, tmp +- jae .Lcontinue_block +- +-.Lless_than_24: +- shl $32-4, len_dw # less_than_16 expects length +- # in upper 4 bits of len_dw +- jnc .Lless_than_16 +- crc32q (bufptmp), crc_init +- crc32q 8(bufptmp), crc_init +- jz .Ldo_return +- add $16, bufptmp +- # len is less than 8 if we got here +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $2, len_dw +- jmp .Lless_than_8_post_shl1 ++ cmp $128*24, len ++ jae .Lfull_block ++ cmp $SMALL_SIZE, len ++ jae .Lpartial_block + + ####################################################################### +- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) ++ ## 6) Process any remainder without interleaving: + ####################################################################### + .Lsmall: +- shl $32-8, len_dw # Prepare len_dw for less_than_256 +- j=256 +-.rept 5 # j = {256, 128, 64, 32, 16} +-.altmacro +-LABEL less_than_ %j # less_than_j: Length should be in +- # upper lg(j) bits of len_dw +- j=(j/2) +- shl $1, len_dw # Get next MSB +- JNC_LESS_THAN %j +-.noaltmacro +- i=0 +-.rept (j/8) +- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data +- i=i+8 +-.endr +- jz .Ldo_return # 
Return if remaining length is zero +- add $j, bufptmp # Advance buf +-.endr +- +-.Lless_than_8: # Length should be stored in +- # upper 3 bits of len_dw +- shl $1, len_dw +-.Lless_than_8_post_shl1: +- jnc .Lless_than_4 +- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes +- jz .Ldo_return # return if remaining data is zero +- add $4, bufptmp +-.Lless_than_4: # Length should be stored in +- # upper 2 bits of len_dw +- shl $1, len_dw +- jnc .Lless_than_2 +- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes +- jz .Ldo_return # return if remaining data is zero +- add $2, bufptmp +-.Lless_than_2: # Length should be stored in the MSB +- # of len_dw +- shl $1, len_dw +- jnc .Lless_than_1 +- crc32b (bufptmp), crc_init_dw # CRC of 1 byte +-.Lless_than_1: # Length should be zero +-.Ldo_return: +- movq crc_init, %rax +- popq %rsi +- popq %rdi +- popq %rbx ++ test len, len ++ jz .Ldone ++ mov len, %eax ++ shr $3, %eax ++ jz .Ldo_dword ++.Ldo_qwords: ++ crc32q (bufp), crc_init_q ++ add $8, bufp ++ dec %eax ++ jnz .Ldo_qwords ++.Ldo_dword: ++ test $4, len ++ jz .Ldo_word ++ crc32l (bufp), crc_init ++ add $4, bufp ++.Ldo_word: ++ test $2, len ++ jz .Ldo_byte ++ crc32w (bufp), crc_init ++ add $2, bufp ++.Ldo_byte: ++ test $1, len ++ jz .Ldone ++ crc32b (bufp), crc_init ++.Ldone: ++ mov crc_init, %eax + RET + SYM_FUNC_END(crc_pcl) + + .section .rodata, "a", @progbits +- ################################################################ +- ## jump table Table is 129 entries x 2 bytes each +- ################################################################ +-.align 4 +-jump_table: +- i=0 +-.rept 129 +-.altmacro +-JMPTBL_ENTRY %i +-.noaltmacro +- i=i+1 +-.endr +- +- + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 words (8 bytes) each +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0007-fixes.patch b/sys-kernel/gentoo-sources-6.12/0007-fixes.patch new file mode 100644 index 0000000..ec28dd2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0007-fixes.patch @@ -0,0 +1,955 @@ +From 3e7168943409ace243e6d4b10896d6e71b5e0c4d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:51:12 +0100 +Subject: [PATCH 07/12] fixes + +Signed-off-by: Peter Jung +--- + arch/Kconfig | 4 +- + arch/x86/include/asm/futex.h | 8 ++- + arch/x86/mm/tlb.c | 2 +- + drivers/bluetooth/btmtk.c | 4 +- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 50 +++++++++++++++-- + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++- + drivers/gpu/drm/drm_edid.c | 47 ++++++++++++++-- + drivers/hid/hid-ids.h | 1 + + fs/ntfs3/bitmap.c | 62 ++++++---------------- + fs/ntfs3/file.c | 32 ++++++----- + fs/ntfs3/frecord.c | 1 - + fs/ntfs3/fsntfs.c | 2 +- + fs/ntfs3/record.c | 16 ++++-- + fs/ntfs3/run.c | 6 +-- + kernel/futex/core.c | 22 -------- + kernel/futex/futex.h | 59 +++++++++++++++++++- + kernel/kprobes.c | 23 ++++---- + kernel/workqueue.c | 22 ++++++-- + scripts/package/PKGBUILD | 5 ++ + sound/pci/hda/patch_realtek.c | 2 + + 21 files changed, 256 insertions(+), 119 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 00551f340dbe..833b2344ce79 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1128,7 +1128,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on 
HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -1162,7 +1162,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to +diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h +index 99d345b686fa..6e2458088800 100644 +--- a/arch/x86/include/asm/futex.h ++++ b/arch/x86/include/asm/futex.h +@@ -48,7 +48,9 @@ do { \ + static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) + { +- if (!user_access_begin(uaddr, sizeof(u32))) ++ if (can_do_masked_user_access()) ++ uaddr = masked_user_access_begin(uaddr); ++ else if (!user_access_begin(uaddr, sizeof(u32))) + return -EFAULT; + + switch (op) { +@@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + { + int ret = 0; + +- if (!user_access_begin(uaddr, sizeof(u32))) ++ if (can_do_masked_user_access()) ++ uaddr = masked_user_access_begin(uaddr); ++ else if (!user_access_begin(uaddr, sizeof(u32))) + return -EFAULT; + asm volatile("\n" + "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index b0678d59ebdb..a2becb85bea7 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -569,7 +569,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, + * mm_cpumask. The TLB shootdown code can figure out from + * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. + */ +- if (WARN_ON_ONCE(prev != &init_mm && ++ if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + +diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c +index 85e99641eaae..c1b6bcc6f7dd 100644 +--- a/drivers/bluetooth/btmtk.c ++++ b/drivers/bluetooth/btmtk.c +@@ -1329,7 +1329,6 @@ int btmtk_usb_setup(struct hci_dev *hdev) + fwname = FIRMWARE_MT7668; + break; + case 0x7922: +- case 0x7961: + case 0x7925: + /* Reset the device to ensure it's in the initial state before + * downloading the firmware to ensure. 
+@@ -1337,7 +1336,8 @@ int btmtk_usb_setup(struct hci_dev *hdev) + + if (!test_bit(BTMTK_FIRMWARE_LOADED, &btmtk_data->flags)) + btmtk_usb_subsys_reset(hdev, dev_id); +- ++ fallthrough; ++ case 0x7961: + btmtk_fw_get_filename(fw_bin_name, sizeof(fw_bin_name), dev_id, + fw_version, fw_flavor); + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index 7617963901fa..03933b2c5ebc 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -855,6 +855,7 @@ struct amdgpu_device { + bool need_swiotlb; + bool accel_working; + struct notifier_block acpi_nb; ++ struct notifier_block pm_nb; + struct amdgpu_i2c_chan *i2c_bus[AMDGPU_MAX_I2C_BUS]; + struct debugfs_blob_wrapper debugfs_vbios_blob; + struct debugfs_blob_wrapper debugfs_discovery_blob; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 51904906545e..d5d3391cc788 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -145,6 +145,8 @@ const char *amdgpu_asic_name[] = { + }; + + static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); ++static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, ++ void *data); + + /** + * DOC: pcie_replay_count +@@ -4507,6 +4509,11 @@ int amdgpu_device_init(struct amdgpu_device *adev, + + amdgpu_device_check_iommu_direct_map(adev); + ++ adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; ++ r = register_pm_notifier(&adev->pm_nb); ++ if (r) ++ goto failed; ++ + return 0; + + release_ras_con: +@@ -4571,6 +4578,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) + drain_workqueue(adev->mman.bdev.wq); + adev->shutdown = true; + ++ unregister_pm_notifier(&adev->pm_nb); ++ + /* make sure IB test finished before entering exclusive mode + * to avoid preemption on IB test + */ +@@ -4688,8 +4697,8 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) + { + int ret; + +- /* No need to evict vram on APUs for suspend to ram or s2idle */ +- if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) ++ /* No need to evict vram on APUs unless going to S4 */ ++ if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) + return 0; + + ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); +@@ -4701,6 +4710,41 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) + /* + * Suspend & resume. + */ ++/** ++ * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events ++ * @nb: notifier block ++ * @mode: suspend mode ++ * @data: data ++ * ++ * This function is called when the system is about to suspend or hibernate. ++ * It is used to evict resources from the device before the system goes to ++ * sleep while there is still access to swap. ++ */ ++static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, ++ void *data) ++{ ++ struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); ++ int r; ++ ++ switch (mode) { ++ case PM_HIBERNATION_PREPARE: ++ adev->in_s4 = true; ++ fallthrough; ++ case PM_SUSPEND_PREPARE: ++ r = amdgpu_device_evict_resources(adev); ++ /* ++ * This is considered non-fatal at this time because ++ * amdgpu_device_prepare() will also fatally evict resources. 
++ * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 ++ */ ++ if (r) ++ drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); ++ break; ++ } ++ ++ return NOTIFY_DONE; ++} ++ + /** + * amdgpu_device_prepare - prepare for device suspend + * +@@ -4740,7 +4784,7 @@ int amdgpu_device_prepare(struct drm_device *dev) + return 0; + + unprepare: +- adev->in_s0ix = adev->in_s3 = false; ++ adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; + + return r; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +index 852e6f315576..94a9a9266f8e 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -2639,7 +2639,6 @@ static int amdgpu_pmops_freeze(struct device *dev) + struct amdgpu_device *adev = drm_to_adev(drm_dev); + int r; + +- adev->in_s4 = true; + r = amdgpu_device_suspend(drm_dev, true); + adev->in_s4 = false; + if (r) +@@ -3078,6 +3077,11 @@ static int __init amdgpu_init(void) + /* Ignore KFD init failures. Normal when CONFIG_HSA_AMD is not set. */ + amdgpu_amdkfd_init(); + ++ if (amdgpu_pp_feature_mask & PP_OVERDRIVE_MASK) { ++ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); ++ pr_crit("Overdrive is enabled, please disable it before reporting any bugs.\n"); ++ } ++ + /* let modprobe override vga console setting */ + return pci_register_driver(&amdgpu_kms_pci_driver); + +diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c +index 855beafb76ff..ad78059ee954 100644 +--- a/drivers/gpu/drm/drm_edid.c ++++ b/drivers/gpu/drm/drm_edid.c +@@ -94,6 +94,8 @@ static int oui(u8 first, u8 second, u8 third) + #define EDID_QUIRK_NON_DESKTOP (1 << 12) + /* Cap the DSC target bitrate to 15bpp */ + #define EDID_QUIRK_CAP_DSC_15BPP (1 << 13) ++/* Fix up a particular 5120x1440@240Hz timing */ ++#define EDID_QUIRK_FIXUP_5120_1440_240 (1 << 14) + + #define MICROSOFT_IEEE_OUI 0xca125c + +@@ -182,6 +184,12 @@ static const struct edid_quirk { + EDID_QUIRK('S', 'A', 'M', 596, EDID_QUIRK_PREFER_LARGE_60), + EDID_QUIRK('S', 'A', 'M', 638, EDID_QUIRK_PREFER_LARGE_60), + ++ /* Samsung C49G95T */ ++ EDID_QUIRK('S', 'A', 'M', 0x7053, EDID_QUIRK_FIXUP_5120_1440_240), ++ ++ /* Samsung S49AG95 */ ++ EDID_QUIRK('S', 'A', 'M', 0x71ac, EDID_QUIRK_FIXUP_5120_1440_240), ++ + /* Sony PVM-2541A does up to 12 bpc, but only reports max 8 bpc */ + EDID_QUIRK('S', 'N', 'Y', 0x2541, EDID_QUIRK_FORCE_12BPC), + +@@ -6753,7 +6761,37 @@ static void update_display_info(struct drm_connector *connector, + drm_edid_to_eld(connector, drm_edid); + } + +-static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *dev, ++static void drm_mode_displayid_detailed_edid_quirks(struct drm_connector *connector, ++ struct drm_display_mode *mode) ++{ ++ unsigned int hsync_width; ++ unsigned int vsync_width; ++ ++ if (connector->display_info.quirks & EDID_QUIRK_FIXUP_5120_1440_240) { ++ if (mode->hdisplay == 5120 && mode->vdisplay == 1440 && ++ mode->clock == 1939490) { ++ hsync_width = mode->hsync_end - mode->hsync_start; ++ vsync_width = mode->vsync_end - mode->vsync_start; ++ ++ mode->clock = 2018490; ++ mode->hdisplay = 5120; ++ mode->hsync_start = 5120 + 8; ++ mode->hsync_end = 5120 + 8 + hsync_width; ++ mode->htotal = 5200; ++ ++ mode->vdisplay = 1440; ++ mode->vsync_start = 1440 + 165; ++ mode->vsync_end = 1440 + 165 + vsync_width; ++ mode->vtotal = 1619; ++ ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] Samsung 240Hz mode quirk applied\n", ++ 
connector->base.id, connector->name); ++ } ++ } ++} ++ ++static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_connector *connector, + struct displayid_detailed_timings_1 *timings, + bool type_7) + { +@@ -6772,7 +6810,7 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d + bool hsync_positive = (timings->hsync[1] >> 7) & 0x1; + bool vsync_positive = (timings->vsync[1] >> 7) & 0x1; + +- mode = drm_mode_create(dev); ++ mode = drm_mode_create(connector->dev); + if (!mode) + return NULL; + +@@ -6795,6 +6833,9 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d + + if (timings->flags & 0x80) + mode->type |= DRM_MODE_TYPE_PREFERRED; ++ ++ drm_mode_displayid_detailed_edid_quirks(connector, mode); ++ + drm_mode_set_name(mode); + + return mode; +@@ -6817,7 +6858,7 @@ static int add_displayid_detailed_1_modes(struct drm_connector *connector, + for (i = 0; i < num_timings; i++) { + struct displayid_detailed_timings_1 *timings = &det->timings[i]; + +- newmode = drm_mode_displayid_detailed(connector->dev, timings, type_7); ++ newmode = drm_mode_displayid_detailed(connector, timings, type_7); + if (!newmode) + continue; + +diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h +index 0f23be98c56e..1b92729bd378 100644 +--- a/drivers/hid/hid-ids.h ++++ b/drivers/hid/hid-ids.h +@@ -210,6 +210,7 @@ + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD2 0x19b6 + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD3 0x1a30 + #define USB_DEVICE_ID_ASUSTEK_ROG_Z13_LIGHTBAR 0x18c6 ++#define USB_DEVICE_ID_ASUSTEK_ROG_RAIKIRI_PAD 0x1abb + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY 0x1abe + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X 0x1b4c + #define USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD 0x196b +diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c +index cf4fe21a5039..04107b950717 100644 +--- a/fs/ntfs3/bitmap.c ++++ b/fs/ntfs3/bitmap.c +@@ -710,20 +710,17 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) + { + int err = 0; + struct super_block *sb = wnd->sb; +- size_t bits0 = bits; + u32 wbits = 8 * sb->s_blocksize; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbit = bit & (wbits - 1); + struct buffer_head *bh; ++ u32 op; + +- while (iw < wnd->nwnd && bits) { +- u32 tail, op; +- ++ for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { + if (iw + 1 == wnd->nwnd) + wbits = wnd->bits_last; + +- tail = wbits - wbit; +- op = min_t(u32, tail, bits); ++ op = min_t(u32, wbits - wbit, bits); + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { +@@ -736,20 +733,15 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) + ntfs_bitmap_clear_le(bh->b_data, wbit, op); + + wnd->free_bits[iw] += op; ++ wnd->total_zeroes += op; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + put_bh(bh); + +- wnd->total_zeroes += op; +- bits -= op; +- wbit = 0; +- iw += 1; ++ wnd_add_free_ext(wnd, bit, op, false); + } +- +- wnd_add_free_ext(wnd, bit, bits0, false); +- + return err; + } + +@@ -760,20 +752,17 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + { + int err = 0; + struct super_block *sb = wnd->sb; +- size_t bits0 = bits; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); + struct buffer_head *bh; ++ u32 op; + +- while (iw < wnd->nwnd && bits) { +- u32 tail, op; +- ++ for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = 
wnd->bits_last; + +- tail = wbits - wbit; +- op = min_t(u32, tail, bits); ++ op = min_t(u32, wbits - wbit, bits); + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { +@@ -785,21 +774,16 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + + ntfs_bitmap_set_le(bh->b_data, wbit, op); + wnd->free_bits[iw] -= op; ++ wnd->total_zeroes -= op; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + put_bh(bh); + +- wnd->total_zeroes -= op; +- bits -= op; +- wbit = 0; +- iw += 1; ++ if (!RB_EMPTY_ROOT(&wnd->start_tree)) ++ wnd_remove_free_ext(wnd, bit, op); + } +- +- if (!RB_EMPTY_ROOT(&wnd->start_tree)) +- wnd_remove_free_ext(wnd, bit, bits0); +- + return err; + } + +@@ -852,15 +836,13 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); ++ u32 op; + +- while (iw < wnd->nwnd && bits) { +- u32 tail, op; +- ++ for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + +- tail = wbits - wbit; +- op = min_t(u32, tail, bits); ++ op = min_t(u32, wbits - wbit, bits); + + if (wbits != wnd->free_bits[iw]) { + bool ret; +@@ -875,10 +857,6 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) + if (!ret) + return false; + } +- +- bits -= op; +- wbit = 0; +- iw += 1; + } + + return true; +@@ -928,6 +906,7 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); ++ u32 op; + size_t end; + struct rb_node *n; + struct e_node *e; +@@ -945,14 +924,11 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + return false; + + use_wnd: +- while (iw < wnd->nwnd && bits) { +- u32 tail, op; +- ++ for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + +- tail = wbits - wbit; +- op = min_t(u32, tail, bits); ++ op = min_t(u32, wbits - wbit, bits); + + if (wnd->free_bits[iw]) { + bool ret; +@@ -966,10 +942,6 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + if (!ret) + goto out; + } +- +- bits -= op; +- wbit = 0; +- iw += 1; + } + ret = true; + +diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c +index f704ceef9539..3f96a11804c9 100644 +--- a/fs/ntfs3/file.c ++++ b/fs/ntfs3/file.c +@@ -182,13 +182,15 @@ static int ntfs_extend_initialized_size(struct file *file, + loff_t pos = valid; + int err; + ++ if (valid >= new_valid) ++ return 0; ++ + if (is_resident(ni)) { + ni->i_valid = new_valid; + return 0; + } + + WARN_ON(is_compressed(ni)); +- WARN_ON(valid >= new_valid); + + for (;;) { + u32 zerofrom, len; +@@ -987,6 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + u64 frame_vbo; + pgoff_t index; + bool frame_uptodate; ++ struct folio *folio; + + if (frame_size < PAGE_SIZE) { + /* +@@ -1041,8 +1044,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + if (err) { + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; +- unlock_page(page); +- put_page(page); ++ folio = page_folio(page); ++ folio_unlock(folio); ++ folio_put(folio); + } + goto out; + } +@@ -1052,9 +1056,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + off = offset_in_page(valid); + for (; ip < pages_per_frame; ip++, off = 0) { + page = pages[ip]; ++ folio = 
page_folio(page); + zero_user_segment(page, off, PAGE_SIZE); + flush_dcache_page(page); +- SetPageUptodate(page); ++ folio_mark_uptodate(folio); + } + + ni_lock(ni); +@@ -1063,9 +1068,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; +- SetPageUptodate(page); +- unlock_page(page); +- put_page(page); ++ folio = page_folio(page); ++ folio_mark_uptodate(folio); ++ folio_unlock(folio); ++ folio_put(folio); + } + + if (err) +@@ -1107,8 +1113,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + for (ip = 0; ip < pages_per_frame; + ip++) { + page = pages[ip]; +- unlock_page(page); +- put_page(page); ++ folio = page_folio(page); ++ folio_unlock(folio); ++ folio_put(folio); + } + goto out; + } +@@ -1149,9 +1156,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; + ClearPageDirty(page); +- SetPageUptodate(page); +- unlock_page(page); +- put_page(page); ++ folio = page_folio(page); ++ folio_mark_uptodate(folio); ++ folio_unlock(folio); ++ folio_put(folio); + } + + if (err) +diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c +index c33e818b3164..8b39d0ce5f28 100644 +--- a/fs/ntfs3/frecord.c ++++ b/fs/ntfs3/frecord.c +@@ -1958,7 +1958,6 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, + if (end > alloc_size) + end = alloc_size; + +- + while (vbo < end) { + if (idx == -1) { + ok = run_lookup_entry(&run, vcn, &lcn, &clen, &idx); +diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c +index 0fa636038b4e..03471bc9371c 100644 +--- a/fs/ntfs3/fsntfs.c ++++ b/fs/ntfs3/fsntfs.c +@@ -2699,4 +2699,4 @@ int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len) + out: + __putname(uni); + return err; +-} +\ No newline at end of file ++} +diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c +index f810f0419d25..61d53d39f3b9 100644 +--- a/fs/ntfs3/record.c ++++ b/fs/ntfs3/record.c +@@ -212,7 +212,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) + return NULL; + + if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || +- !IS_ALIGNED(off, 4)) { ++ !IS_ALIGNED(off, 8)) { + return NULL; + } + +@@ -236,8 +236,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) + off += asize; + } + +- /* Can we use the first field (attr->type). */ +- /* NOTE: this code also checks attr->size availability. */ ++ /* ++ * Can we use the first fields: ++ * attr->type, ++ * attr->size ++ */ + if (off + 8 > used) { + static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); + return NULL; +@@ -259,10 +262,17 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) + + asize = le32_to_cpu(attr->size); + ++ if (!IS_ALIGNED(asize, 8)) ++ return NULL; ++ + /* Check overflow and boundary. */ + if (off + asize < off || off + asize > used) + return NULL; + ++ /* Can we use the field attr->non_res. */ ++ if (off + 9 > used) ++ return NULL; ++ + /* Check size of attribute. */ + if (!attr->non_res) { + /* Check resident fields. */ +diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c +index 48566dff0dc9..6e86d66197ef 100644 +--- a/fs/ntfs3/run.c ++++ b/fs/ntfs3/run.c +@@ -1112,9 +1112,9 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, + err = wnd_set_used_safe(wnd, lcn, len, &done); + if (zone) { + /* Restore zone. Lock mft run. */ +- struct rw_semaphore *lock; +- lock = is_mounted(sbi) ? 
&sbi->mft.ni->file.run_lock : +- NULL; ++ struct rw_semaphore *lock = ++ is_mounted(sbi) ? &sbi->mft.ni->file.run_lock : ++ NULL; + if (lock) + down_read(lock); + ntfs_refresh_zone(sbi); +diff --git a/kernel/futex/core.c b/kernel/futex/core.c +index 136768ae2637..9107704a6574 100644 +--- a/kernel/futex/core.c ++++ b/kernel/futex/core.c +@@ -451,28 +451,6 @@ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key * + return NULL; + } + +-int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) +-{ +- int ret; +- +- pagefault_disable(); +- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); +- pagefault_enable(); +- +- return ret; +-} +- +-int futex_get_value_locked(u32 *dest, u32 __user *from) +-{ +- int ret; +- +- pagefault_disable(); +- ret = __get_user(*dest, from); +- pagefault_enable(); +- +- return ret ? -EFAULT : 0; +-} +- + /** + * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status +diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h +index 8b195d06f4e8..618ce1fe870e 100644 +--- a/kernel/futex/futex.h ++++ b/kernel/futex/futex.h +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_PREEMPT_RT + #include +@@ -225,10 +226,64 @@ extern bool __futex_wake_mark(struct futex_q *q); + extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); + + extern int fault_in_user_writeable(u32 __user *uaddr); +-extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval); +-extern int futex_get_value_locked(u32 *dest, u32 __user *from); + extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); + ++static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); ++ pagefault_enable(); ++ ++ return ret; ++} ++ ++/* ++ * This does a plain atomic user space read, and the user pointer has ++ * already been verified earlier by get_futex_key() to be both aligned ++ * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). ++ * ++ * We still want to avoid any speculation, and while __get_user() is ++ * the traditional model for this, it's actually slower than doing ++ * this manually these days. ++ * ++ * We could just have a per-architecture special function for it, ++ * the same way we do futex_atomic_cmpxchg_inatomic(), but rather ++ * than force everybody to do that, write it out long-hand using ++ * the low-level user-access infrastructure. ++ * ++ * This looks a bit overkill, but generally just results in a couple ++ * of instructions. 
++ */ ++static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) ++{ ++ u32 val; ++ ++ if (can_do_masked_user_access()) ++ from = masked_user_access_begin(from); ++ else if (!user_read_access_begin(from, sizeof(*from))) ++ return -EFAULT; ++ unsafe_get_user(val, from, Efault); ++ user_access_end(); ++ *dest = val; ++ return 0; ++Efault: ++ user_access_end(); ++ return -EFAULT; ++} ++ ++static inline int futex_get_value_locked(u32 *dest, u32 __user *from) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = futex_read_inatomic(dest, from); ++ pagefault_enable(); ++ ++ return ret; ++} ++ + extern void __futex_unqueue(struct futex_q *q); + extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); + extern int futex_unqueue(struct futex_q *q); +diff --git a/kernel/kprobes.c b/kernel/kprobes.c +index da59c68df841..55d0835ea0cf 100644 +--- a/kernel/kprobes.c ++++ b/kernel/kprobes.c +@@ -1570,16 +1570,25 @@ static int check_kprobe_address_safe(struct kprobe *p, + if (ret) + return ret; + jump_label_lock(); +- preempt_disable(); + + /* Ensure the address is in a text area, and find a module if exists. */ + *probed_mod = NULL; + if (!core_kernel_text((unsigned long) p->addr)) { ++ guard(preempt)(); + *probed_mod = __module_text_address((unsigned long) p->addr); + if (!(*probed_mod)) { + ret = -EINVAL; + goto out; + } ++ ++ /* ++ * We must hold a refcount of the probed module while updating ++ * its code to prohibit unexpected unloading. ++ */ ++ if (unlikely(!try_module_get(*probed_mod))) { ++ ret = -ENOENT; ++ goto out; ++ } + } + /* Ensure it is not in reserved area. */ + if (in_gate_area_no_mm((unsigned long) p->addr) || +@@ -1588,21 +1597,13 @@ static int check_kprobe_address_safe(struct kprobe *p, + static_call_text_reserved(p->addr, p->addr) || + find_bug((unsigned long)p->addr) || + is_cfi_preamble_symbol((unsigned long)p->addr)) { ++ module_put(*probed_mod); + ret = -EINVAL; + goto out; + } + + /* Get module refcount and reject __init functions for loaded modules. */ + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { +- /* +- * We must hold a refcount of the probed module while updating +- * its code to prohibit unexpected unloading. +- */ +- if (unlikely(!try_module_get(*probed_mod))) { +- ret = -ENOENT; +- goto out; +- } +- + /* + * If the module freed '.init.text', we couldn't insert + * kprobes in there. +@@ -1610,13 +1611,11 @@ static int check_kprobe_address_safe(struct kprobe *p, + if (within_module_init((unsigned long)p->addr, *probed_mod) && + !module_is_coming(*probed_mod)) { + module_put(*probed_mod); +- *probed_mod = NULL; + ret = -ENOENT; + } + } + + out: +- preempt_enable(); + jump_label_unlock(); + + return ret; +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index 9949ffad8df0..8b07576814a5 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -3833,16 +3833,28 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, + { + bool wait = false; + struct pool_workqueue *pwq; ++ struct worker_pool *current_pool = NULL; + + if (flush_color >= 0) { + WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); + atomic_set(&wq->nr_pwqs_to_flush, 1); + } + ++ /* ++ * For unbound workqueue, pwqs will map to only a few pools. ++ * Most of the time, pwqs within the same pool will be linked ++ * sequentially to wq->pwqs by cpu index. So in the majority ++ * of pwq iters, the pool is the same, only doing lock/unlock ++ * if the pool has changed. This can largely reduce expensive ++ * lock operations. 
++ */ + for_each_pwq(pwq, wq) { +- struct worker_pool *pool = pwq->pool; +- +- raw_spin_lock_irq(&pool->lock); ++ if (current_pool != pwq->pool) { ++ if (likely(current_pool)) ++ raw_spin_unlock_irq(¤t_pool->lock); ++ current_pool = pwq->pool; ++ raw_spin_lock_irq(¤t_pool->lock); ++ } + + if (flush_color >= 0) { + WARN_ON_ONCE(pwq->flush_color != -1); +@@ -3859,9 +3871,11 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, + pwq->work_color = work_color; + } + +- raw_spin_unlock_irq(&pool->lock); + } + ++ if (current_pool) ++ raw_spin_unlock_irq(¤t_pool->lock); ++ + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) + complete(&wq->first_flusher->done); + +diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD +index f83493838cf9..4010899652b8 100644 +--- a/scripts/package/PKGBUILD ++++ b/scripts/package/PKGBUILD +@@ -91,6 +91,11 @@ _package-headers() { + "${srctree}/scripts/package/install-extmod-build" "${builddir}" + fi + ++ # required when DEBUG_INFO_BTF_MODULES is enabled ++ if [ -f tools/bpf/resolve_btfids/resolve_btfids ]; then ++ install -Dt "$builddir/tools/bpf/resolve_btfids" tools/bpf/resolve_btfids/resolve_btfids ++ fi ++ + echo "Installing System.map and config..." + mkdir -p "${builddir}" + cp System.map "${builddir}/System.map" +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 192fc75b51e6..d88fc0ca893d 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -10604,6 +10604,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1043, 0x1e1f, "ASUS Vivobook 15 X1504VAP", ALC2XX_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), + SND_PCI_QUIRK(0x1043, 0x1e5e, "ASUS ROG Strix G513", ALC294_FIXUP_ASUS_G513_PINS), ++ SND_PCI_QUIRK(0x1043, 0x1e63, "ASUS H7606W", ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC), ++ SND_PCI_QUIRK(0x1043, 0x1e83, "ASUS GA605W", ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x1eb3, "ASUS Ally RCLA72", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x1043, 0x1ed3, "ASUS HN7306W", ALC287_FIXUP_CS35L41_I2C_2), +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.10.3/0009-ntsync.patch b/sys-kernel/gentoo-sources-6.12/0008-ntsync.patch similarity index 84% rename from sys-kernel/gentoo-sources-6.10.3/0009-ntsync.patch rename to sys-kernel/gentoo-sources-6.12/0008-ntsync.patch index 4a16758..1efe29d 100644 --- a/sys-kernel/gentoo-sources-6.10.3/0009-ntsync.patch +++ b/sys-kernel/gentoo-sources-6.12/0008-ntsync.patch @@ -1,22 +1,22 @@ -From 36ef0070410e229e52c9de58d6021df36a4b1707 Mon Sep 17 00:00:00 2001 +From 46225020f04e55a29ae30473a9a8cf0d15f0979e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 3 Aug 2024 09:34:15 +0200 -Subject: [PATCH 09/12] ntsync +Date: Thu, 19 Dec 2024 18:51:57 +0100 +Subject: [PATCH 08/12] ntsync Signed-off-by: Peter Jung --- Documentation/userspace-api/index.rst | 1 + - Documentation/userspace-api/ntsync.rst | 398 +++++ + Documentation/userspace-api/ntsync.rst | 385 +++++ MAINTAINERS | 9 + drivers/misc/Kconfig | 1 - - drivers/misc/ntsync.c | 989 +++++++++++- - include/uapi/linux/ntsync.h | 39 + + drivers/misc/ntsync.c | 992 +++++++++++- + include/uapi/linux/ntsync.h | 42 +- tools/testing/selftests/Makefile | 1 + .../selftests/drivers/ntsync/.gitignore | 1 + .../testing/selftests/drivers/ntsync/Makefile | 7 + tools/testing/selftests/drivers/ntsync/config | 1 + - 
.../testing/selftests/drivers/ntsync/ntsync.c | 1407 +++++++++++++++++ - 11 files changed, 2850 insertions(+), 4 deletions(-) + .../testing/selftests/drivers/ntsync/ntsync.c | 1343 +++++++++++++++++ + 11 files changed, 2767 insertions(+), 16 deletions(-) create mode 100644 Documentation/userspace-api/ntsync.rst create mode 100644 tools/testing/selftests/drivers/ntsync/.gitignore create mode 100644 tools/testing/selftests/drivers/ntsync/Makefile @@ -24,10 +24,10 @@ Signed-off-by: Peter Jung create mode 100644 tools/testing/selftests/drivers/ntsync/ntsync.c diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst -index 8a251d71fa6e..02bea81fb4bf 100644 +index 274cc7546efc..9c1b15cd89ab 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst -@@ -64,6 +64,7 @@ Everything else +@@ -63,6 +63,7 @@ Everything else vduse futex2 perf_ring_buffer @@ -37,10 +37,10 @@ index 8a251d71fa6e..02bea81fb4bf 100644 diff --git a/Documentation/userspace-api/ntsync.rst b/Documentation/userspace-api/ntsync.rst new file mode 100644 -index 000000000000..767844637a7d +index 000000000000..25e7c4aef968 --- /dev/null +++ b/Documentation/userspace-api/ntsync.rst -@@ -0,0 +1,398 @@ +@@ -0,0 +1,385 @@ +=================================== +NT synchronization primitive driver +=================================== @@ -116,19 +116,16 @@ index 000000000000..767844637a7d +structures used in ioctl calls:: + + struct ntsync_sem_args { -+ __u32 sem; + __u32 count; + __u32 max; + }; + + struct ntsync_mutex_args { -+ __u32 mutex; + __u32 owner; + __u32 count; + }; + + struct ntsync_event_args { -+ __u32 event; + __u32 signaled; + __u32 manual; + }; @@ -145,7 +142,7 @@ index 000000000000..767844637a7d + }; + +Depending on the ioctl, members of the structure may be used as input, -+output, or not at all. All ioctls return 0 on success. ++output, or not at all. + +The ioctls on the device file are as follows: + @@ -156,14 +153,13 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``sem`` -+ - On output, contains a file descriptor to the created semaphore. + * - ``count`` + - Initial count of the semaphore. + * - ``max`` + - Maximum count of the semaphore. + + Fails with ``EINVAL`` if ``count`` is greater than ``max``. ++ On success, returns a file descriptor the created semaphore. + +.. c:macro:: NTSYNC_IOC_CREATE_MUTEX + @@ -172,8 +168,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``mutex`` -+ - On output, contains a file descriptor to the created mutex. + * - ``count`` + - Initial recursion count of the mutex. + * - ``owner`` @@ -181,6 +175,7 @@ index 000000000000..767844637a7d + + If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is + zero and ``count`` is nonzero, the function fails with ``EINVAL``. ++ On success, returns a file descriptor the created mutex. + +.. c:macro:: NTSYNC_IOC_CREATE_EVENT + @@ -189,8 +184,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``event`` -+ - On output, contains a file descriptor to the created event. + * - ``signaled`` + - If nonzero, the event is initially signaled, otherwise + nonsignaled. @@ -198,6 +191,8 @@ index 000000000000..767844637a7d + - If nonzero, the event is a manual-reset event, otherwise + auto-reset. + ++ On success, returns a file descriptor the created event. ++ +The ioctls on the individual objects are as follows: + +.. c:macro:: NTSYNC_IOC_SEM_POST @@ -220,8 +215,6 @@ index 000000000000..767844637a7d + + .. 
list-table:: + -+ * - ``mutex`` -+ - Ignored. + * - ``owner`` + - Specifies the owner trying to release this mutex. + * - ``count`` @@ -270,8 +263,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``sem`` -+ - Ignored. + * - ``count`` + - On output, contains the current count of the semaphore. + * - ``max`` @@ -284,8 +275,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``mutex`` -+ - Ignored. + * - ``owner`` + - On output, contains the current owner of the mutex, or zero + if the mutex is not currently owned. @@ -303,8 +292,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``event`` -+ - Ignored. + * - ``signaled`` + - On output, contains the current state of the event. + * - ``manual`` @@ -440,10 +427,10 @@ index 000000000000..767844637a7d + ``objs`` and in ``alert``. If this is attempted, the function fails + with ``EINVAL``. diff --git a/MAINTAINERS b/MAINTAINERS -index b27470be2e6a..4112729fc23a 100644 +index a2d251917629..a30770b6f75a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -15983,6 +15983,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git +@@ -16501,6 +16501,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git F: Documentation/filesystems/ntfs3.rst F: fs/ntfs3/ @@ -460,10 +447,10 @@ index b27470be2e6a..4112729fc23a 100644 M: Finn Thain L: linux-m68k@lists.linux-m68k.org diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig -index faf983680040..2907b5c23368 100644 +index 3fe7e2a9bd29..6c8b999a5e08 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig -@@ -507,7 +507,6 @@ config OPEN_DICE +@@ -517,7 +517,6 @@ config OPEN_DICE config NTSYNC tristate "NT synchronization primitive emulation" @@ -472,7 +459,7 @@ index faf983680040..2907b5c23368 100644 This module provides kernel support for emulation of Windows NT synchronization primitives. It is not a hardware driver. diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c -index 3c2f743c58b0..87a24798a5c7 100644 +index 4954553b7baa..457ff28b789f 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -6,11 +6,17 @@ @@ -516,7 +503,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 enum ntsync_type type; -@@ -46,13 +57,335 @@ struct ntsync_obj { +@@ -46,22 +57,344 @@ struct ntsync_obj { __u32 count; __u32 max; } sem; @@ -852,7 +839,9 @@ index 3c2f743c58b0..87a24798a5c7 100644 /* * Actually change the semaphore state, returning -EOVERFLOW if it is made * invalid. 
-@@ -61,7 +394,7 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) + */ +-static int post_sem_state(struct ntsync_obj *sem, __u32 count) ++static int release_sem_state(struct ntsync_obj *sem, __u32 count) { __u32 sum; @@ -861,9 +850,12 @@ index 3c2f743c58b0..87a24798a5c7 100644 if (check_add_overflow(sem->u.sem.count, count, &sum) || sum > sem->u.sem.max) -@@ -73,9 +406,11 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) +@@ -71,11 +404,13 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) + return 0; + } - static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) +-static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) ++static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) { + struct ntsync_device *dev = sem->dev; __u32 __user *user_args = argp; @@ -881,7 +873,8 @@ index 3c2f743c58b0..87a24798a5c7 100644 + all = ntsync_lock_obj(dev, sem); prev_count = sem->u.sem.count; - ret = post_sem_state(sem, args); +- ret = post_sem_state(sem, args); ++ ret = release_sem_state(sem, args); + if (!ret) { + if (all) + try_wake_all_obj(dev, sem); @@ -893,7 +886,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 if (!ret && put_user(prev_count, user_args)) ret = -EFAULT; -@@ -97,6 +437,226 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) +@@ -97,6 +437,220 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) return ret; } @@ -1053,8 +1046,6 @@ index 3c2f743c58b0..87a24798a5c7 100644 + if (sem->type != NTSYNC_TYPE_SEM) + return -EINVAL; + -+ args.sem = 0; -+ + all = ntsync_lock_obj(dev, sem); + + args.count = sem->u.sem.count; @@ -1078,8 +1069,6 @@ index 3c2f743c58b0..87a24798a5c7 100644 + if (mutex->type != NTSYNC_TYPE_MUTEX) + return -EINVAL; + -+ args.mutex = 0; -+ + all = ntsync_lock_obj(dev, mutex); + + args.count = mutex->u.mutex.count; @@ -1103,8 +1092,6 @@ index 3c2f743c58b0..87a24798a5c7 100644 + if (event->type != NTSYNC_TYPE_EVENT) + return -EINVAL; + -+ args.event = 0; -+ + all = ntsync_lock_obj(dev, event); + + args.manual = event->u.event.manual; @@ -1120,10 +1107,14 @@ index 3c2f743c58b0..87a24798a5c7 100644 static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; -@@ -116,6 +676,22 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, +@@ -114,8 +668,24 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, + void __user *argp = (void __user *)parm; + switch (cmd) { - case NTSYNC_IOC_SEM_POST: - return ntsync_sem_post(obj, argp); +- case NTSYNC_IOC_SEM_POST: +- return ntsync_sem_post(obj, argp); ++ case NTSYNC_IOC_SEM_RELEASE: ++ return ntsync_sem_release(obj, argp); + case NTSYNC_IOC_SEM_READ: + return ntsync_sem_read(obj, argp); + case NTSYNC_IOC_MUTEX_UNLOCK: @@ -1143,7 +1134,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 default: return -ENOIOCTLCMD; } -@@ -141,6 +717,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, +@@ -140,6 +710,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, obj->dev = dev; get_file(dev->file); spin_lock_init(&obj->lock); @@ -1153,13 +1144,28 @@ index 3c2f743c58b0..87a24798a5c7 100644 return obj; } -@@ -191,6 +770,400 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) - return put_user(fd, &user_args->sem); - } +@@ -165,7 +738,6 @@ static int ntsync_obj_get_fd(struct ntsync_obj *obj) + static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) + { +- 
struct ntsync_sem_args __user *user_args = argp; + struct ntsync_sem_args args; + struct ntsync_obj *sem; + int fd; +@@ -182,12 +754,398 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) + sem->u.sem.count = args.count; + sem->u.sem.max = args.max; + fd = ntsync_obj_get_fd(sem); +- if (fd < 0) { ++ if (fd < 0) + kfree(sem); +- return fd; ++ ++ return fd; ++} ++ +static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) +{ -+ struct ntsync_mutex_args __user *user_args = argp; + struct ntsync_mutex_args args; + struct ntsync_obj *mutex; + int fd; @@ -1176,17 +1182,14 @@ index 3c2f743c58b0..87a24798a5c7 100644 + mutex->u.mutex.count = args.count; + mutex->u.mutex.owner = args.owner; + fd = ntsync_obj_get_fd(mutex); -+ if (fd < 0) { ++ if (fd < 0) + kfree(mutex); -+ return fd; -+ } + -+ return put_user(fd, &user_args->mutex); ++ return fd; +} + +static int ntsync_create_event(struct ntsync_device *dev, void __user *argp) +{ -+ struct ntsync_event_args __user *user_args = argp; + struct ntsync_event_args args; + struct ntsync_obj *event; + int fd; @@ -1200,12 +1203,10 @@ index 3c2f743c58b0..87a24798a5c7 100644 + event->u.event.manual = args.manual; + event->u.event.signaled = args.signaled; + fd = ntsync_obj_get_fd(event); -+ if (fd < 0) { ++ if (fd < 0) + kfree(event); -+ return fd; -+ } + -+ return put_user(fd, &user_args->event); ++ return fd; +} + +static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) @@ -1219,8 +1220,9 @@ index 3c2f743c58b0..87a24798a5c7 100644 + if (file->f_op != &ntsync_obj_fops) { + fput(file); + return NULL; -+ } -+ + } + +- return put_user(fd, &user_args->sem); + obj = file->private_data; + if (obj->dev != dev) { + fput(file); @@ -1549,12 +1551,10 @@ index 3c2f743c58b0..87a24798a5c7 100644 + + kfree(q); + return ret; -+} -+ + } + static int ntsync_char_open(struct inode *inode, struct file *file) - { - struct ntsync_device *dev; -@@ -199,6 +1172,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) +@@ -198,6 +1156,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) if (!dev) return -ENOMEM; @@ -1563,7 +1563,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 file->private_data = dev; dev->file = file; return nonseekable_open(inode, file); -@@ -220,8 +1195,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, +@@ -219,8 +1179,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, void __user *argp = (void __user *)parm; switch (cmd) { @@ -1581,21 +1581,25 @@ index 3c2f743c58b0..87a24798a5c7 100644 return -ENOIOCTLCMD; } diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h -index dcfa38fdc93c..4a8095a3fc34 100644 +index dcfa38fdc93c..6d06793512b1 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h -@@ -16,8 +16,47 @@ struct ntsync_sem_args { +@@ -11,13 +11,49 @@ + #include + + struct ntsync_sem_args { +- __u32 sem; + __u32 count; __u32 max; }; +-#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) +struct ntsync_mutex_args { -+ __u32 mutex; + __u32 owner; + __u32 count; +}; + +struct ntsync_event_args { -+ __u32 event; + __u32 manual; + __u32 signaled; +}; @@ -1615,13 +1619,14 @@ index dcfa38fdc93c..4a8095a3fc34 100644 + +#define NTSYNC_MAX_WAIT_COUNT 64 + - #define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) ++#define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) +#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) +#define 
NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) -+#define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args) -+#define NTSYNC_IOC_CREATE_EVENT _IOWR('N', 0x87, struct ntsync_event_args) ++#define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) ++#define NTSYNC_IOC_CREATE_EVENT _IOW ('N', 0x87, struct ntsync_event_args) - #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) +-#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) ++#define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) +#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) +#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) +#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) @@ -1633,11 +1638,11 @@ index dcfa38fdc93c..4a8095a3fc34 100644 #endif diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile -index 9039f3709aff..d5aeaa8fe3ca 100644 +index 363d031a16f7..ff18c0361e38 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile -@@ -16,6 +16,7 @@ TARGETS += damon - TARGETS += devices +@@ -18,6 +18,7 @@ TARGETS += devices/error_logs + TARGETS += devices/probe TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf +TARGETS += drivers/ntsync @@ -1673,10 +1678,10 @@ index 000000000000..60539c826d06 +CONFIG_WINESYNC=y diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c new file mode 100644 -index 000000000000..5fa2c9a0768c +index 000000000000..3aad311574c4 --- /dev/null +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c -@@ -0,0 +1,1407 @@ +@@ -0,0 +1,1343 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Various unit tests for the "ntsync" synchronization primitive driver. @@ -1714,9 +1719,9 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ((max), __max); \ + }) + -+static int post_sem(int sem, __u32 *count) ++static int release_sem(int sem, __u32 *count) +{ -+ return ioctl(sem, NTSYNC_IOC_SEM_POST, count); ++ return ioctl(sem, NTSYNC_IOC_SEM_RELEASE, count); +} + +static int read_mutex_state(int mutex, __u32 *count, __u32 *owner) @@ -1831,28 +1836,24 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 3; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(-1, ret); ++ sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_EQ(-1, sem); + EXPECT_EQ(EINVAL, errno); + + sem_args.count = 2; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ sem = sem_args.sem; ++ sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, sem); + check_sem_state(sem, 2, 2); + + count = 0; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_sem_state(sem, 2, 2); + + count = 1; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 2, 2); @@ -1872,13 +1873,13 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(ETIMEDOUT, errno); + + count = 3; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 0, 2); + + count = 2; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(sem, 2, 2); @@ -1889,13 +1890,13 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(0, ret); + 
+ count = 1; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(sem, 1, 2); + + count = ~0u; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 1, 2); @@ -1919,23 +1920,20 @@ index 000000000000..5fa2c9a0768c + + mutex_args.owner = 123; + mutex_args.count = 0; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(-1, ret); ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_EQ(-1, mutex); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 0; + mutex_args.count = 2; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(-1, ret); ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_EQ(-1, mutex); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 123; + mutex_args.count = 2; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ mutex = mutex_args.mutex; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); + check_mutex_state(mutex, 2, 123); + + ret = unlock_mutex(mutex, 0, &count); @@ -2036,11 +2034,8 @@ index 000000000000..5fa2c9a0768c + + mutex_args.owner = 0; + mutex_args.count = 0; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ mutex = mutex_args.mutex; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); + check_mutex_state(mutex, 0, 0); + + ret = wait_any(fd, 1, &mutex, 123, &index); @@ -2052,11 +2047,8 @@ index 000000000000..5fa2c9a0768c + + mutex_args.owner = 123; + mutex_args.count = ~0u; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ mutex = mutex_args.mutex; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); + check_mutex_state(mutex, ~0u, 123); + + ret = wait_any(fd, 1, &mutex, 123, &index); @@ -2079,11 +2071,8 @@ index 000000000000..5fa2c9a0768c + + event_args.manual = 1; + event_args.signaled = 0; -+ event_args.event = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, event_args.event); -+ event = event_args.event; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + check_event_state(event, 0, 1); + + signaled = 0xdeadbeef; @@ -2147,11 +2136,8 @@ index 000000000000..5fa2c9a0768c + + event_args.manual = 0; + event_args.signaled = 1; -+ event_args.event = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, event_args.event); -+ event = event_args.event; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + + check_event_state(event, 1, 0); + @@ -2210,62 +2196,55 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 2; + sem_args.max = 3; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + mutex_args.owner = 0; + mutex_args.count = 0; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ 
EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ -+ objs[0] = sem_args.sem; -+ objs[1] = mutex_args.mutex; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 0, 0); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 0, 0); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 1, 123); + + count = 1; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 1, 123); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 2, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + owner = 123; -+ ret = ioctl(mutex_args.mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(0, ret); + + ret = wait_any(fd, 2, objs, 456, &index); @@ -2277,24 +2256,27 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + ++ close(objs[1]); ++ + /* test waiting on the same object twice */ ++ + count = 2; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + -+ objs[0] = objs[1] = sem_args.sem; ++ objs[1] = objs[0]; + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 1, 3); ++ check_sem_state(objs[0], 1, 3); + + ret = wait_any(fd, 0, NULL, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + -+ for (i = 0; i < NTSYNC_MAX_WAIT_COUNT + 1; ++i) -+ objs[i] = sem_args.sem; ++ for (i = 1; i < NTSYNC_MAX_WAIT_COUNT + 1; ++i) ++ objs[i] = objs[0]; + + ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT, objs, 123, &index); + EXPECT_EQ(0, ret); @@ -2308,8 +2290,7 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + -+ close(sem_args.sem); -+ close(mutex_args.mutex); ++ close(objs[0]); + + close(fd); +} @@ -2327,88 +2308,81 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 2; + sem_args.max = 3; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + mutex_args.owner = 0; + mutex_args.count = 0; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ -+ event_args.manual = true; -+ event_args.signaled = 
true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); -+ -+ objs[0] = sem_args.sem; -+ objs[1] = mutex_args.mutex; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); + + ret = wait_all(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 2, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 2, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); + + count = 3; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 2, 3); -+ check_mutex_state(mutex_args.mutex, 3, 123); ++ check_sem_state(objs[0], 2, 3); ++ check_mutex_state(objs[1], 3, 123); + + owner = 123; -+ ret = ioctl(mutex_args.mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(0, ret); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ close(objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = true; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); + -+ objs[0] = sem_args.sem; -+ objs[1] = event_args.event; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_event_state(event_args.event, 1, 1); ++ check_sem_state(objs[0], 0, 3); ++ check_event_state(objs[1], 1, 1); ++ ++ close(objs[1]); + + /* test waiting on the same object twice */ -+ objs[0] = objs[1] = sem_args.sem; ++ objs[1] = objs[0]; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + -+ close(sem_args.sem); -+ close(mutex_args.mutex); -+ close(event_args.event); ++ close(objs[0]); + + close(fd); +} @@ -2469,20 +2443,13 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 0; + sem_args.max = 3; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + mutex_args.owner = 123; + mutex_args.count = 1; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ -+ objs[0] = sem_args.sem; -+ objs[1] = mutex_args.mutex; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, 
&mutex_args); ++ EXPECT_LE(0, objs[1]); + + /* test waking the semaphore */ + @@ -2501,10 +2468,10 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(ETIMEDOUT, ret); + + count = 1; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); -+ check_sem_state(sem_args.sem, 0, 3); ++ check_sem_state(objs[0], 0, 3); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); @@ -2514,7 +2481,7 @@ index 000000000000..5fa2c9a0768c + /* test waking the mutex */ + + /* first grab it again for owner 123 */ -+ ret = wait_any(fd, 1, &mutex_args.mutex, 123, &index); ++ ret = wait_any(fd, 1, &objs[1], 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + @@ -2526,31 +2493,32 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = unlock_mutex(mutex_args.mutex, 123, &count); ++ ret = unlock_mutex(objs[1], 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + -+ ret = unlock_mutex(mutex_args.mutex, 123, &count); ++ ret = unlock_mutex(objs[1], 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, mutex_args.count); -+ check_mutex_state(mutex_args.mutex, 1, 456); ++ check_mutex_state(objs[1], 1, 456); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + ++ close(objs[1]); ++ + /* test waking events */ + + event_args.manual = false; + event_args.signaled = false; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); + -+ objs[1] = event_args.event; + wait_args.timeout = get_abs_timeout(1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); @@ -2558,10 +2526,10 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); -+ check_event_state(event_args.event, 0, 0); ++ check_event_state(objs[1], 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); @@ -2575,24 +2543,23 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); -+ check_event_state(event_args.event, 0, 0); ++ check_event_state(objs[1], 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + -+ close(event_args.event); ++ close(objs[1]); + + event_args.manual = true; + event_args.signaled = false; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); + -+ objs[1] = event_args.event; + wait_args.timeout = get_abs_timeout(1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); @@ -2600,17 +2567,17 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + 
EXPECT_EQ(0, signaled); -+ check_event_state(event_args.event, 1, 1); ++ check_event_state(objs[1], 1, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + @@ -2621,31 +2588,28 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); -+ check_event_state(event_args.event, 0, 1); ++ check_event_state(objs[1], 0, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + -+ close(event_args.event); -+ + /* delete an object while it's being waited on */ + + wait_args.timeout = get_abs_timeout(200); + wait_args.owner = 123; -+ objs[1] = mutex_args.mutex; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ close(sem_args.sem); -+ close(mutex_args.mutex); ++ close(objs[0]); ++ close(objs[1]); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); @@ -2672,32 +2636,23 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 0; + sem_args.max = 3; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + mutex_args.owner = 123; + mutex_args.count = 1; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); + + manual_event_args.manual = true; + manual_event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &manual_event_args); -+ EXPECT_EQ(0, ret); ++ objs[2] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &manual_event_args); ++ EXPECT_LE(0, objs[2]); + + auto_event_args.manual = false; + auto_event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &auto_event_args); -+ EXPECT_EQ(0, ret); -+ -+ objs[0] = sem_args.sem; -+ objs[1] = mutex_args.mutex; -+ objs[2] = manual_event_args.event; -+ objs[3] = auto_event_args.event; ++ objs[3] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &auto_event_args); ++ EXPECT_EQ(0, objs[3]); + + wait_args.timeout = get_abs_timeout(1000); + wait_args.objs = (uintptr_t)objs; @@ -2713,54 +2668,54 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(ETIMEDOUT, ret); + + count = 1; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + -+ check_sem_state(sem_args.sem, 1, 3); ++ check_sem_state(objs[0], 1, 3); + -+ ret = wait_any(fd, 1, &sem_args.sem, 123, &index); ++ ret = wait_any(fd, 1, &objs[0], 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = unlock_mutex(mutex_args.mutex, 123, &count); ++ ret = unlock_mutex(objs[1], 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + -+ check_mutex_state(mutex_args.mutex, 0, 
0); ++ check_mutex_state(objs[1], 0, 0); + -+ ret = ioctl(manual_event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(objs[2], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + + count = 2; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); -+ check_sem_state(sem_args.sem, 2, 3); ++ check_sem_state(objs[0], 2, 3); + -+ ret = ioctl(auto_event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(objs[3], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + -+ ret = ioctl(manual_event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(objs[2], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + -+ ret = ioctl(auto_event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(objs[3], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 1, 456); -+ check_event_state(manual_event_args.event, 1, 1); -+ check_event_state(auto_event_args.event, 0, 0); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 456); ++ check_event_state(objs[2], 1, 1); ++ check_event_state(objs[3], 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); @@ -2776,10 +2731,10 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ close(sem_args.sem); -+ close(mutex_args.mutex); -+ close(manual_event_args.event); -+ close(auto_event_args.event); ++ close(objs[0]); ++ close(objs[1]); ++ close(objs[2]); ++ close(objs[3]); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); @@ -2796,7 +2751,7 @@ index 000000000000..5fa2c9a0768c + struct ntsync_sem_args sem_args = {0}; + __u32 index, count, signaled; + struct wait_args thread_args; -+ int objs[2], fd, ret; ++ int objs[2], event, fd, ret; + pthread_t thread; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); @@ -2804,50 +2759,44 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 0; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ objs[0] = sem_args.sem; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + sem_args.count = 1; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ objs[1] = sem_args.sem; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[1]); + + event_args.manual = true; + event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + -+ ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 0, NULL, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + -+ ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 0, NULL, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(event, 
NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + /* test wakeup via alert */ + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + + wait_args.timeout = get_abs_timeout(1000); @@ -2855,7 +2804,7 @@ index 000000000000..5fa2c9a0768c + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; -+ wait_args.alert = event_args.event; ++ wait_args.alert = event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = NTSYNC_IOC_WAIT_ANY; @@ -2865,7 +2814,7 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); @@ -2873,32 +2822,32 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + -+ close(event_args.event); ++ close(event); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + + count = 1; -+ ret = post_sem(objs[0], &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + -+ close(event_args.event); ++ close(event); + + close(objs[0]); + close(objs[1]); @@ -2913,7 +2862,7 @@ index 000000000000..5fa2c9a0768c + struct ntsync_sem_args sem_args = {0}; + struct wait_args thread_args; + __u32 index, count, signaled; -+ int objs[2], fd, ret; ++ int objs[2], event, fd, ret; + pthread_t thread; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); @@ -2921,36 +2870,30 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 2; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ objs[0] = sem_args.sem; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + sem_args.count = 1; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ objs[1] = sem_args.sem; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[1]); + + event_args.manual = true; + event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ 
EXPECT_LE(0, event); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + /* test wakeup via alert */ + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + + wait_args.timeout = get_abs_timeout(1000); @@ -2958,7 +2901,7 @@ index 000000000000..5fa2c9a0768c + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; -+ wait_args.alert = event_args.event; ++ wait_args.alert = event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = NTSYNC_IOC_WAIT_ALL; @@ -2968,7 +2911,7 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); @@ -2976,32 +2919,32 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + -+ close(event_args.event); ++ close(event); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + + count = 2; -+ ret = post_sem(objs[1], &count); ++ ret = release_sem(objs[1], &count); + EXPECT_EQ(0, ret); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + -+ close(event_args.event); ++ close(event); + + close(objs[0]); + close(objs[1]); @@ -3055,15 +2998,13 @@ index 000000000000..5fa2c9a0768c + + mutex_args.owner = 0; + mutex_args.count = 0; -+ ret = ioctl(stress_device, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ stress_mutex = mutex_args.mutex; ++ stress_mutex = ioctl(stress_device, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, stress_mutex); + + event_args.manual = 1; + event_args.signaled = 0; -+ ret = ioctl(stress_device, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); -+ stress_start_event = event_args.event; ++ stress_start_event = ioctl(stress_device, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, stress_start_event); + + for (i = 0; i < STRESS_THREADS; ++i) + pthread_create(&threads[i], NULL, stress_thread, NULL); @@ -3085,5 +3026,5 @@ index 000000000000..5fa2c9a0768c + +TEST_HARNESS_MAIN -- -2.46.0.rc1 +2.47.1 diff --git a/sys-kernel/gentoo-sources-6.12/0009-perf-per-core.patch b/sys-kernel/gentoo-sources-6.12/0009-perf-per-core.patch new file mode 100644 index 0000000..451afad --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0009-perf-per-core.patch @@ -0,0 +1,997 @@ +From 7bb6922864744721217f728cc01dcb53dcdcc2da Mon Sep 17 00:00:00 2001 
+From: Peter Jung +Date: Thu, 19 Dec 2024 18:52:26 +0100 +Subject: [PATCH 09/12] perf-per-core + +Signed-off-by: Peter Jung +--- + Documentation/arch/x86/topology.rst | 4 + + arch/x86/events/rapl.c | 507 ++++++++++++++------------ + arch/x86/include/asm/processor.h | 1 + + arch/x86/include/asm/topology.h | 1 + + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/topology_common.c | 1 + + include/linux/cpuhotplug.h | 1 - + 7 files changed, 288 insertions(+), 228 deletions(-) + +diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst +index 7352ab89a55a..c12837e61bda 100644 +--- a/Documentation/arch/x86/topology.rst ++++ b/Documentation/arch/x86/topology.rst +@@ -135,6 +135,10 @@ Thread-related topology information in the kernel: + The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo + "core_id." + ++ - topology_logical_core_id(); ++ ++ The logical core ID to which a thread belongs. ++ + + + System topology examples +diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c +index a481a939862e..d3bb3865c1b1 100644 +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -39,6 +39,10 @@ + * event: rapl_energy_psys + * perf code: 0x5 + * ++ * core counter: consumption of a single physical core ++ * event: rapl_energy_core (power_core PMU) ++ * perf code: 0x1 ++ * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * +@@ -70,18 +74,22 @@ MODULE_LICENSE("GPL"); + /* + * RAPL energy status counters + */ +-enum perf_rapl_events { ++enum perf_rapl_pkg_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + +- PERF_RAPL_MAX, +- NR_RAPL_DOMAINS = PERF_RAPL_MAX, ++ PERF_RAPL_PKG_EVENTS_MAX, ++ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, + }; + +-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { ++#define PERF_RAPL_CORE 0 /* single core */ ++#define PERF_RAPL_CORE_EVENTS_MAX 1 ++#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX ++ ++static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", +@@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "psys", + }; + ++static const char *const rapl_core_domain_name __initconst = "core"; ++ + /* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved +@@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ + * considered as either pkg-scope or die-scope, and we are considering + * them as die-scope. 
+ */ +-#define rapl_pmu_is_pkg_scope() \ ++#define rapl_pkg_pmu_is_pkg_scope() \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + +@@ -129,7 +139,8 @@ struct rapl_pmu { + struct rapl_pmus { + struct pmu pmu; + unsigned int nr_rapl_pmu; +- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); ++ unsigned int cntr_mask; ++ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + + enum rapl_unit_quirk { +@@ -139,45 +150,43 @@ enum rapl_unit_quirk { + }; + + struct rapl_model { +- struct perf_msr *rapl_msrs; +- unsigned long events; ++ struct perf_msr *rapl_pkg_msrs; ++ struct perf_msr *rapl_core_msrs; ++ unsigned long pkg_events; ++ unsigned long core_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ +-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +-static struct rapl_pmus *rapl_pmus; +-static cpumask_t rapl_cpu_mask; +-static unsigned int rapl_cntr_mask; ++static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static int rapl_core_hw_unit __read_mostly; ++static struct rapl_pmus *rapl_pmus_pkg; ++static struct rapl_pmus *rapl_pmus_core; + static u64 rapl_timer_ms; +-static struct perf_msr *rapl_msrs; ++static struct rapl_model *rapl_model; + + /* +- * Helper functions to get the correct topology macros according to the ++ * Helper function to get the correct topology id according to the + * RAPL PMU scope. + */ +-static inline unsigned int get_rapl_pmu_idx(int cpu) +-{ +- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : +- topology_logical_die_id(cpu); +-} +- +-static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu) ++static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) + { +- return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) : +- topology_die_cpumask(cpu); +-} +- +-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +-{ +- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- + /* +- * The unsigned check also catches the '-1' return value for non +- * existent mappings in the topology map. ++ * Returns unsigned int, which converts the '-1' return value ++ * (for non-existent mappings in topology map) to UINT_MAX, so ++ * the error check in the caller is simplified. + */ +- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; ++ switch (scope) { ++ case PERF_PMU_SCOPE_PKG: ++ return topology_logical_package_id(cpu); ++ case PERF_PMU_SCOPE_DIE: ++ return topology_logical_die_id(cpu); ++ case PERF_PMU_SCOPE_CORE: ++ return topology_logical_core_id(cpu); ++ default: ++ return -EINVAL; ++ } + } + + static inline u64 rapl_read_counter(struct perf_event *event) +@@ -187,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event) + return raw; + } + +-static inline u64 rapl_scale(u64 v, int cfg) ++static inline u64 rapl_scale(u64 v, struct perf_event *event) + { +- if (cfg > NR_RAPL_DOMAINS) { +- pr_warn("Invalid domain %d, failed to scale data\n", cfg); +- return v; +- } ++ int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; ++ ++ if (event->pmu->scope == PERF_PMU_SCOPE_CORE) ++ hw_unit = rapl_core_hw_unit; ++ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). 
+ * Watts = Joules/Time delta + */ +- return v << (32 - rapl_hw_unit[cfg - 1]); ++ return v << (32 - hw_unit); + } + + static u64 rapl_event_update(struct perf_event *event) +@@ -226,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event) + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + +- sdelta = rapl_scale(delta, event->hw.config); ++ sdelta = rapl_scale(delta, event); + + local64_add(sdelta, &event->count); + +@@ -241,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) + + static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) + { +- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); ++ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct perf_event *event; + unsigned long flags; + +- if (!pmu->n_active) ++ if (!rapl_pmu->n_active) + return HRTIMER_NORESTART; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + +- list_for_each_entry(event, &pmu->active_list, active_entry) ++ list_for_each_entry(event, &rapl_pmu->active_list, active_entry) + rapl_event_update(event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + +- hrtimer_forward_now(hrtimer, pmu->timer_interval); ++ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); + + return HRTIMER_RESTART; + } + +-static void rapl_hrtimer_init(struct rapl_pmu *pmu) ++static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) + { +- struct hrtimer *hr = &pmu->hrtimer; ++ struct hrtimer *hr = &rapl_pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; + } + +-static void __rapl_pmu_event_start(struct rapl_pmu *pmu, ++static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, + struct perf_event *event) + { + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +@@ -276,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + + event->hw.state = 0; + +- list_add_tail(&event->active_entry, &pmu->active_list); ++ list_add_tail(&event->active_entry, &rapl_pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +- pmu->n_active++; +- if (pmu->n_active == 1) +- rapl_start_hrtimer(pmu); ++ rapl_pmu->n_active++; ++ if (rapl_pmu->n_active == 1) ++ rapl_start_hrtimer(rapl_pmu); + } + + static void rapl_pmu_event_start(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); +- __rapl_pmu_event_start(pmu, event); +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); ++ __rapl_pmu_event_start(rapl_pmu, event); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static void rapl_pmu_event_stop(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { +- WARN_ON_ONCE(pmu->n_active <= 0); +- pmu->n_active--; +- if (pmu->n_active == 0) +- hrtimer_cancel(&pmu->hrtimer); ++ WARN_ON_ONCE(rapl_pmu->n_active <= 0); ++ rapl_pmu->n_active--; ++ if (rapl_pmu->n_active == 0) ++ 
hrtimer_cancel(&rapl_pmu->hrtimer); + + list_del(&event->active_entry); + +@@ -326,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) + hwc->state |= PERF_HES_UPTODATE; + } + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static int rapl_pmu_event_add(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) +- __rapl_pmu_event_start(pmu, event); ++ __rapl_pmu_event_start(rapl_pmu, event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + + return 0; + } +@@ -355,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) + static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; +- int bit, ret = 0; +- struct rapl_pmu *pmu; ++ int bit, rapl_pmus_scope, ret = 0; ++ struct rapl_pmu *rapl_pmu; ++ unsigned int rapl_pmu_idx; ++ struct rapl_pmus *rapl_pmus; + +- /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus->pmu.type) +- return -ENOENT; ++ /* unsupported modes and filters */ ++ if (event->attr.sample_period) /* no sampling */ ++ return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) +@@ -369,29 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event) + if (event->cpu < 0) + return -EINVAL; + +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; +- +- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) ++ rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); ++ if (!rapl_pmus) ++ return -EINVAL; ++ rapl_pmus_scope = rapl_pmus->pmu.scope; ++ ++ if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { ++ /* only look at RAPL package events */ ++ if (event->attr.type != rapl_pmus_pkg->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; ++ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { ++ /* only look at RAPL core events */ ++ if (event->attr.type != rapl_pmus_core->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; ++ } else + return -EINVAL; +- +- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); +- bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_cntr_mask & (1 << bit))) ++ if (!(rapl_pmus->cntr_mask & (1 << bit))) + return -EINVAL; + +- /* unsupported modes and filters */ +- if (event->attr.sample_period) /* no sampling */ ++ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + return -EINVAL; +- + /* must be done before validate_group */ +- pmu = cpu_to_rapl_pmu(event->cpu); +- if (!pmu) ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ if (!rapl_pmu) + return -EINVAL; +- event->cpu = pmu->cpu; +- event->pmu_private = pmu; +- event->hw.event_base = rapl_msrs[bit].msr; ++ ++ 
event->pmu_private = rapl_pmu; + event->hw.config = cfg; + event->hw.idx = bit; + +@@ -403,34 +435,19 @@ static void rapl_pmu_event_read(struct perf_event *event) + rapl_event_update(event); + } + +-static ssize_t rapl_get_attr_cpumask(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); +-} +- +-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); +- +-static struct attribute *rapl_pmu_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL, +-}; +- +-static struct attribute_group rapl_pmu_attr_group = { +- .attrs = rapl_pmu_attrs, +-}; +- + RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); + RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); ++RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01"); + + RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); ++RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules"); + + /* + * we compute in 0.23 nJ increments regardless of MSR +@@ -440,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 + RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); ++RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10"); + + /* + * There are no default events, but we need to create +@@ -467,7 +485,12 @@ static struct attribute_group rapl_pmu_format_group = { + }; + + static const struct attribute_group *rapl_attr_groups[] = { +- &rapl_pmu_attr_group, ++ &rapl_pmu_format_group, ++ &rapl_pmu_events_group, ++ NULL, ++}; ++ ++static const struct attribute_group *rapl_core_attr_groups[] = { + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +@@ -533,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = { + .attrs = rapl_events_psys, + }; + ++static struct attribute *rapl_events_core[] = { ++ EVENT_PTR(rapl_core), ++ EVENT_PTR(rapl_core_unit), ++ EVENT_PTR(rapl_core_scale), ++ NULL, ++}; ++ ++static struct attribute_group rapl_events_core_group = { ++ .name = "events", ++ .attrs = rapl_events_core, ++}; ++ + static bool test_msr(int idx, void *data) + { + return test_bit(idx, (unsigned long *) data); +@@ -558,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { + }; + + /* +- * Force to PERF_RAPL_MAX size due to: +- * - perf_msr_probe(PERF_RAPL_MAX) ++ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: ++ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) + * - want to use same event codes across both architectures + */ +-static struct perf_msr amd_rapl_msrs[] = { ++static struct perf_msr amd_rapl_pkg_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, +@@ -570,77 +605,25 @@ static struct perf_msr 
amd_rapl_msrs[] = { + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_cpu_offline(unsigned int cpu) +-{ +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); +- int target; +- +- /* Check if exiting cpu is used for collecting rapl events */ +- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) +- return 0; +- +- pmu->cpu = -1; +- /* Find a new cpu to collect rapl events */ +- target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu); +- +- /* Migrate rapl events to the new target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &rapl_cpu_mask); +- pmu->cpu = target; +- perf_pmu_migrate_context(pmu->pmu, cpu, target); +- } +- return 0; +-} +- +-static int rapl_cpu_online(unsigned int cpu) +-{ +- s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- if (rapl_pmu_idx < 0) { +- pr_err("topology_logical_(package/die)_id() returned a negative value"); +- return -EINVAL; +- } +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); +- int target; +- +- if (!pmu) { +- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +- if (!pmu) +- return -ENOMEM; +- +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); +- +- rapl_pmus->pmus[rapl_pmu_idx] = pmu; +- } +- +- /* +- * Check if there is an online cpu in the package which collects rapl +- * events already. +- */ +- target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu)); +- if (target < nr_cpu_ids) +- return 0; +- +- cpumask_set_cpu(cpu, &rapl_cpu_mask); +- pmu->cpu = cpu; +- return 0; +-} ++static struct perf_msr amd_rapl_core_msrs[] = { ++ [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group, ++ test_msr, false, RAPL_MSR_MASK }, ++}; + +-static int rapl_check_hw_unit(struct rapl_model *rm) ++static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ +- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) ++ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; +- for (i = 0; i < NR_RAPL_DOMAINS; i++) +- rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) ++ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ ++ rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + +- switch (rm->unit_quirk) { ++ switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See +@@ -648,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + case RAPL_UNIT_QUIRK_INTEL_HSW: +- rapl_hw_unit[PERF_RAPL_RAM] = 16; ++ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; + break; + /* SPR uses a fixed energy unit for Psys domain. 
*/ + case RAPL_UNIT_QUIRK_INTEL_SPR: +- rapl_hw_unit[PERF_RAPL_PSYS] = 0; ++ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; + break; + default: + break; + } + +- + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter +@@ -667,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; +- if (rapl_hw_unit[0] < 32) { ++ if (rapl_pkg_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); +- rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); ++ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); + } + return 0; + } +@@ -677,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + static void __init rapl_advertise(void) + { + int i; ++ int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); ++ ++ if (rapl_pmus_core) ++ num_counters += hweight32(rapl_pmus_core->cntr_mask); + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_cntr_mask), rapl_timer_ms); ++ num_counters, rapl_timer_ms); + +- for (i = 0; i < NR_RAPL_DOMAINS; i++) { +- if (rapl_cntr_mask & (1 << i)) { ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { ++ if (rapl_pmus_pkg->cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", +- rapl_domain_names[i], rapl_hw_unit[i]); ++ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } + } ++ ++ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE))) ++ pr_info("hw unit of domain %s 2^-%d Joules\n", ++ rapl_core_domain_name, rapl_core_hw_unit); + } + +-static void cleanup_rapl_pmus(void) ++static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) + { + int i; + + for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) +- kfree(rapl_pmus->pmus[i]); ++ kfree(rapl_pmus->rapl_pmu[i]); + kfree(rapl_pmus); + } + +@@ -707,17 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = { + NULL, + }; + +-static int __init init_rapl_pmus(void) ++static const struct attribute_group *rapl_core_attr_update[] = { ++ &rapl_events_core_group, ++ NULL, ++}; ++ ++static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) ++{ ++ struct rapl_pmu *rapl_pmu; ++ int idx; ++ ++ for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { ++ rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL); ++ if (!rapl_pmu) ++ goto free; ++ ++ raw_spin_lock_init(&rapl_pmu->lock); ++ INIT_LIST_HEAD(&rapl_pmu->active_list); ++ rapl_pmu->pmu = &rapl_pmus->pmu; ++ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(rapl_pmu); ++ ++ rapl_pmus->rapl_pmu[idx] = rapl_pmu; ++ } ++ ++ return 0; ++free: ++ for (; idx > 0; idx--) ++ kfree(rapl_pmus->rapl_pmu[idx - 1]); ++ return -ENOMEM; ++} ++ ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, ++ const struct attribute_group **rapl_attr_groups, ++ const struct attribute_group **rapl_attr_update) + { + int nr_rapl_pmu = topology_max_packages(); ++ struct rapl_pmus *rapl_pmus; + +- if (!rapl_pmu_is_pkg_scope()) +- nr_rapl_pmu *= topology_max_dies_per_package(); ++ /* ++ * rapl_pmu_scope must be either PKG, DIE or CORE ++ */ ++ if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) ++ nr_rapl_pmu *= topology_max_dies_per_package(); ++ else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE) ++ nr_rapl_pmu *= topology_num_cores_per_package(); ++ else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG) ++ return -EINVAL; + +- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); ++ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, 
nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + ++ *rapl_pmus_ptr = rapl_pmus; ++ + rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; +@@ -728,77 +761,81 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.start = rapl_pmu_event_start; + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; ++ rapl_pmus->pmu.scope = rapl_pmu_scope; + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; +- return 0; ++ ++ return init_rapl_pmu(rapl_pmus); + } + + static struct rapl_model model_snb = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_snbep = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsw = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsx = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_knl = { +- .events = BIT(PERF_RAPL_PKG) | ++ .pkg_events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_skl = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_spr = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_spr_msrs, ++ .rapl_pkg_msrs = intel_rapl_spr_msrs, + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .pkg_events = BIT(PERF_RAPL_PKG), ++ .core_events = BIT(PERF_RAPL_CORE), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, +- .rapl_msrs = amd_rapl_msrs, ++ .rapl_pkg_msrs = amd_rapl_pkg_msrs, ++ .rapl_core_msrs = amd_rapl_core_msrs, + }; + + static const struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -854,57 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- struct rapl_model *rm; ++ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; + int ret; + ++ if (rapl_pkg_pmu_is_pkg_scope()) ++ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; ++ + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; + +- rm = 
(struct rapl_model *) id->driver_data; ++ rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rm->rapl_msrs; +- +- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rm->events); +- +- ret = rapl_check_hw_unit(rm); ++ ret = rapl_check_hw_unit(); + if (ret) + return ret; + +- ret = init_rapl_pmus(); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, ++ rapl_attr_update); + if (ret) + return ret; + +- /* +- * Install callbacks. Core will call them for each online cpu. +- */ +- ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, +- "perf/x86/rapl:online", +- rapl_cpu_online, rapl_cpu_offline); ++ rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, ++ PERF_RAPL_PKG_EVENTS_MAX, false, ++ (void *) &rapl_model->pkg_events); ++ ++ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out; + +- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); +- if (ret) +- goto out1; ++ if (rapl_model->core_events) { ++ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, ++ rapl_core_attr_groups, ++ rapl_core_attr_update); ++ if (ret) { ++ pr_warn("power-core PMU initialization failed (%d)\n", ret); ++ goto core_init_failed; ++ } ++ ++ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, ++ PERF_RAPL_CORE_EVENTS_MAX, false, ++ (void *) &rapl_model->core_events); ++ ++ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1); ++ if (ret) { ++ pr_warn("power-core PMU registration failed (%d)\n", ret); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ } + ++core_init_failed: + rapl_advertise(); + return 0; + +-out1: +- cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + return ret; + } + module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { +- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); +- perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(); ++ if (rapl_pmus_core) { ++ perf_pmu_unregister(&rapl_pmus_core->pmu); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ perf_pmu_unregister(&rapl_pmus_pkg->pmu); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + } + module_exit(intel_rapl_exit); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 20e6009381ed..c0cd10182e90 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -98,6 +98,7 @@ struct cpuinfo_topology { + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; ++ u32 logical_core_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index fd41103ad342..3973cb9bb2e6 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); + #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) + #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) + #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) ++#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) + #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) + #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) + #define topology_ppin(cpu) (cpu_data(cpu).ppin) +diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c +index 
10719aba6276..cacfd3f6abef 100644 +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); ++ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); +diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c +index 8277c64f88db..b5a5e1411469 100644 +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); ++ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); + } + + /* Package relative core ID */ +diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h +index 2361ed4d2b15..37a9afffb59e 100644 +--- a/include/linux/cpuhotplug.h ++++ b/include/linux/cpuhotplug.h +@@ -208,7 +208,6 @@ enum cpuhp_state { + CPUHP_AP_PERF_X86_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, +- CPUHP_AP_PERF_X86_RAPL_ONLINE, + CPUHP_AP_PERF_S390_CF_ONLINE, + CPUHP_AP_PERF_S390_SF_ONLINE, + CPUHP_AP_PERF_ARM_CCI_ONLINE, +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0010-pksm.patch b/sys-kernel/gentoo-sources-6.12/0010-pksm.patch new file mode 100644 index 0000000..051549c --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0010-pksm.patch @@ -0,0 +1,433 @@ +From 607312cadf367e1baf1362f85e0568ebae5b6d59 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:52:41 +0100 +Subject: [PATCH 10/12] pksm + +Signed-off-by: Peter Jung +--- + arch/alpha/kernel/syscalls/syscall.tbl | 3 + + arch/arm/tools/syscall.tbl | 3 + + arch/m68k/kernel/syscalls/syscall.tbl | 3 + + arch/microblaze/kernel/syscalls/syscall.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + + arch/parisc/kernel/syscalls/syscall.tbl | 3 + + arch/powerpc/kernel/syscalls/syscall.tbl | 3 + + arch/s390/kernel/syscalls/syscall.tbl | 3 + + arch/sh/kernel/syscalls/syscall.tbl | 3 + + arch/sparc/kernel/syscalls/syscall.tbl | 3 + + arch/x86/entry/syscalls/syscall_32.tbl | 3 + + arch/x86/entry/syscalls/syscall_64.tbl | 3 + + arch/xtensa/kernel/syscalls/syscall.tbl | 3 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 9 +- + kernel/sys.c | 138 ++++++++++++++++++ + kernel/sys_ni.c | 3 + + scripts/syscall.tbl | 3 + + .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + + .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + + 22 files changed, 206 insertions(+), 1 deletion(-) + +diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl +index 74720667fe09..e6a11f3c0a2e 100644 +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -502,3 +502,6 @@ + 570 common lsm_set_self_attr sys_lsm_set_self_attr + 571 common lsm_list_modules sys_lsm_list_modules + 572 common mseal sys_mseal ++573 common process_ksm_enable 
sys_process_ksm_enable ++574 common process_ksm_disable sys_process_ksm_disable ++575 common process_ksm_status sys_process_ksm_status +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index 23c98203c40f..10a3099decbe 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -477,3 +477,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl +index 22a3cbd4c602..12d2c7594bf0 100644 +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -462,3 +462,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl +index 2b81a6bd78b2..e2a93c856eed 100644 +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -468,3 +468,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl +index 953f5b7dc723..b921fbf56fa6 100644 +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -401,3 +401,6 @@ + 460 n32 lsm_set_self_attr sys_lsm_set_self_attr + 461 n32 lsm_list_modules sys_lsm_list_modules + 462 n32 mseal sys_mseal ++463 n32 process_ksm_enable sys_process_ksm_enable ++464 n32 process_ksm_disable sys_process_ksm_disable ++465 n32 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl +index 1464c6be6eb3..8d7f9ddd66f4 100644 +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -377,3 +377,6 @@ + 460 n64 lsm_set_self_attr sys_lsm_set_self_attr + 461 n64 lsm_list_modules sys_lsm_list_modules + 462 n64 mseal sys_mseal ++463 n64 process_ksm_enable sys_process_ksm_enable ++464 n64 process_ksm_disable sys_process_ksm_disable ++465 n64 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl +index 2439a2491cff..9d6142739954 100644 +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -450,3 +450,6 @@ + 460 o32 lsm_set_self_attr sys_lsm_set_self_attr + 461 o32 lsm_list_modules sys_lsm_list_modules + 462 o32 mseal sys_mseal ++463 o32 process_ksm_enable sys_process_ksm_enable ++464 o32 process_ksm_disable sys_process_ksm_disable ++465 o32 process_ksm_status sys_process_ksm_status +diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl +index 66dc406b12e4..9d46476fd908 100644 +--- 
a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -461,3 +461,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl +index ebae8415dfbb..16f71bc2f6f0 100644 +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl +index 01071182763e..7394bad8178e 100644 +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl +index c55fd7696d40..b9fc31221b87 100644 +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -466,3 +466,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl +index cfdfb3707c16..0d79fd772854 100644 +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -508,3 +508,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 534c74b14fab..c546a30575f1 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -468,3 +468,6 @@ + 460 i386 lsm_set_self_attr sys_lsm_set_self_attr + 461 i386 lsm_list_modules sys_lsm_list_modules + 462 i386 mseal sys_mseal ++463 i386 process_ksm_enable sys_process_ksm_enable ++464 i386 process_ksm_disable sys_process_ksm_disable ++465 i386 process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index 7093ee21c0d1..0fcd10ba8dfe 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -386,6 +386,9 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr 
+ 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl +index 67083fc1b2f5..c1aecee4ad9b 100644 +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -433,3 +433,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 5758104921e6..cc9c4fac2412 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 5bf6148cac2b..613e559ad6e0 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) + #define __NR_mseal 462 + __SYSCALL(__NR_mseal, sys_mseal) + ++#define __NR_process_ksm_enable 463 ++__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) ++#define __NR_process_ksm_disable 464 ++__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) ++#define __NR_process_ksm_status 465 ++__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) ++ + #undef __NR_syscalls +-#define __NR_syscalls 463 ++#define __NR_syscalls 466 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/sys.c b/kernel/sys.c +index 4da31f28fda8..fcd3aeaddd05 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2791,6 +2791,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + return error; + } + ++#ifdef CONFIG_KSM ++enum pkc_action { ++ PKSM_ENABLE = 0, ++ PKSM_DISABLE, ++ PKSM_STATUS, ++}; ++ ++static long do_process_ksm_control(int pidfd, enum pkc_action action) ++{ ++ long ret; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ ++ task = pidfd_get_task(pidfd, &f_flags); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. 
*/ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ switch (action) { ++ case PKSM_ENABLE: ++ ret = ksm_enable_merge_any(mm); ++ break; ++ case PKSM_DISABLE: ++ ret = ksm_disable_merge_any(mm); ++ break; ++ case PKSM_STATUS: ++ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++out: ++ return ret; ++} ++#endif /* CONFIG_KSM */ ++ ++SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_ENABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_DISABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_STATUS); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t process_ksm_enable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_enable); ++} ++static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); ++ ++static ssize_t process_ksm_disable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_disable); ++} ++static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); ++ ++static ssize_t process_ksm_status_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_status); ++} ++static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); ++ ++static struct attribute *process_ksm_sysfs_attrs[] = { ++ &process_ksm_enable_attr.attr, ++ &process_ksm_disable_attr.attr, ++ &process_ksm_status_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group process_ksm_sysfs_attr_group = { ++ .attrs = process_ksm_sysfs_attrs, ++ .name = "process_ksm", ++}; ++ ++static int __init process_ksm_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); ++} ++subsys_initcall(process_ksm_sysfs_init); ++#endif /* CONFIG_KSM */ ++ + SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) + { +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index c00a86931f8c..d82213d68522 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(process_ksm_enable); ++COND_SYSCALL(process_ksm_disable); ++COND_SYSCALL(process_ksm_status); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); + COND_SYSCALL(get_mempolicy); +diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl +index 845e24eb372e..227d9cc12365 100644 +--- a/scripts/syscall.tbl ++++ b/scripts/syscall.tbl +@@ -403,3 +403,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules 
sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +index ebae8415dfbb..16f71bc2f6f0 100644 +--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +index 01071182763e..7394bad8178e 100644 +--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0012-zstd.patch b/sys-kernel/gentoo-sources-6.12/0012-zstd.patch new file mode 100644 index 0000000..7518743 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0012-zstd.patch @@ -0,0 +1,18652 @@ +From a7a211a9bf51bfd07e645e8362b40f249d00d13b Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:53:34 +0100 +Subject: [PATCH 12/12] zstd + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 850 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 127 +- + lib/zstd/common/compiler.h | 134 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 34 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 84 +- + lib/zstd/common/fse.h | 94 +- + lib/zstd/common/fse_decompress.c | 130 +- + lib/zstd/common/huf.h | 237 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 28 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 109 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 441 ++-- + lib/zstd/compress/zstd_compress.c | 2111 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 359 ++- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 376 ++- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 169 +- + lib/zstd/compress/zstd_double_fast.c | 143 +- + lib/zstd/compress/zstd_double_fast.h | 17 
+- + lib/zstd/compress/zstd_fast.c | 596 +++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 732 +++--- + lib/zstd/compress/zstd_lazy.h | 138 +- + lib/zstd/compress/zstd_ldm.c | 21 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 497 ++-- + lib/zstd/compress/zstd_opt.h | 41 +- + lib/zstd/decompress/huf_decompress.c | 887 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 358 ++- + lib/zstd/decompress/zstd_decompress_block.c | 708 +++--- + lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 9 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 6577 insertions(+), 3531 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index b2c7cf310c8f..ac59ae9a18d7 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..6d5cf55f0bf3 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..6320fedcf8a4 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 6 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). 
++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -183,7 +228,7 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres + /*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +@@ -196,9 +241,9 @@ ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer * + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. + */ +@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +265,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + * using ZSTD_CCtx_set*() functions. 
+ * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +369,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximatively targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,7 +461,6 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer +@@ -412,6 +469,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -421,7 +481,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +490,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +557,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". 
+- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +609,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. ++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +770,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. 
++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be reused multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. 
++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. 
+ * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. 
+ */
+@@ -1071,24 +1180,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
+ #endif
+
+-/* Deprecation warnings :
+- * Should these warnings be a problem, it is generally possible to disable them,
+- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
+- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
+- */
+-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */
+-#else
+-# if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__)
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message)))
+-# elif (__GNUC__ >= 3)
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated))
+-# else
+-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API
+-# endif
+-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
+-
+ /* **************************************************************************************
+ * experimental API (static linking only)
+ ****************************************************************************************
+@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
+ #define ZSTD_STRATEGY_MIN ZSTD_fast
+ #define ZSTD_STRATEGY_MAX ZSTD_btultra2
++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */
+
+
+ #define ZSTD_OVERLAPLOG_MIN 0
+@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+
+ /* Advanced parameter bounds */
+-#define ZSTD_TARGETCBLOCKSIZE_MIN 64
++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */
+ #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
+ #define ZSTD_SRCSIZEHINT_MIN 0
+ #define ZSTD_SRCSIZEHINT_MAX INT_MAX
+@@ -1303,7 +1395,7 @@ typedef enum {
+ } ZSTD_paramSwitch_e;
+
+ /* *************************************
+-* Frame size functions
++* Frame header and size functions
+ ***************************************/
+
+ /*! ZSTD_findDecompressedSize() :
+@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
+ * or an error code (if srcSize is too small) */
+ ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
++typedef struct {
++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
++ unsigned blockSizeMax;
++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
++ unsigned headerSize;
++ unsigned dictID;
++ unsigned checksumFlag;
++ unsigned _reserved1;
++ unsigned _reserved2;
++} ZSTD_frameHeader;
++
++/*! ZSTD_getFrameHeader() :
++ * decode Frame Header, or requires larger `srcSize`.
++ * @return : 0, `zfhPtr` is correctly filled,
++ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
++ * or an error code, which can be tested using ZSTD_isError() */
++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */
++/*! ZSTD_getFrameHeader_advanced() :
++ * same as ZSTD_getFrameHeader(),
++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
++
++/*! ZSTD_decompressionMargin() :
++ * Zstd supports in-place decompression, where the input and output buffers overlap.
++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large,
++ * and the input buffer must be at the end of the output buffer.
++ *
++ * _______________________ Output Buffer ________________________
++ * | |
++ * | ____ Input Buffer ____|
++ * | | |
++ * v v v
++ * |---------------------------------------|-----------|----------|
++ * ^ ^ ^
++ * |___________________ Output_Size ___________________|_ Margin _|
++ *
++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN().
++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or
++ * ZSTD_decompressDCtx().
++ * NOTE: This function supports multi-frame input.
++ *
++ * @param src The compressed frame(s)
++ * @param srcSize The size of the compressed frame(s)
++ * @returns The decompression margin or an error that can be checked with ZSTD_isError().
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize);
++
++/*! ZSTD_DECOMPRESSION_MARGIN() :
++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from
++ * the compressed frame, compute it from the original size and the blockSizeLog.
++ * See ZSTD_decompressionMargin() for details.
++ *
++ * WARNING: This macro does not support multi-frame input, the input must be a single
++ * zstd frame. If you need that support use the function, or implement it yourself.
++ *
++ * @param originalSize The original uncompressed size of the data.
++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX).
++ * Unless you explicitly set the windowLog smaller than
++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX.
++ */
++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \
++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \
++ 4 /* checksum */ + \
++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \
++ (blockSize) /* One block of margin */ \
++ ))
++
+ typedef enum {
+ ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
+ ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */
+ } ZSTD_sequenceFormat_e;
+
++/*! ZSTD_sequenceBound() :
++ * `srcSize` : size of the input buffer
++ * @return : upper-bound for the number of sequences that can be generated
++ * from a buffer of srcSize bytes
++ *
++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence).
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
++
+ /*! ZSTD_generateSequences() :
+- * Generate sequences using ZSTD_compress2, given a source buffer.
++ * WARNING: This function is meant for debugging and informational purposes ONLY!
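As a rough illustration of the in-place layout documented for ZSTD_decompressionMargin() above, a sketch only, assuming the userspace <zstd.h> static-linking API; error handling is deliberately minimal and (size_t)-1 is just an illustrative failure marker:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>
    #include <string.h>

    static size_t decompressInPlace(const void* compressed, size_t compressedSize)
    {
        size_t const margin = ZSTD_decompressionMargin(compressed, compressedSize);
        unsigned long long const contentSize = ZSTD_getFrameContentSize(compressed, compressedSize);
        size_t bufferSize, ret;
        char* buffer;

        if (ZSTD_isError(margin)) return margin;
        if (contentSize == ZSTD_CONTENTSIZE_UNKNOWN || contentSize == ZSTD_CONTENTSIZE_ERROR)
            return (size_t)-1;                              /* cannot size the buffer */

        bufferSize = (size_t)contentSize + margin;          /* Output_Size + Margin */
        buffer = malloc(bufferSize);
        if (buffer == NULL) return (size_t)-1;

        /* the compressed input must sit at the very end of the output buffer */
        memcpy(buffer + bufferSize - compressedSize, compressed, compressedSize);
        ret = ZSTD_decompress(buffer, bufferSize,
                              buffer + bufferSize - compressedSize, compressedSize);
        /* on success, ret bytes of regenerated data start at buffer[0] */
        free(buffer);
        return ret;
    }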
++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsSize The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). + */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. 
+ */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. ++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. 
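A small sketch of the static-allocation pattern these estimates are meant for, assuming the userspace <zstd.h> static-linking API; the malloc-backed workspace and the level are arbitrary choices for illustration:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>

    /* Carve a one-shot compression context out of a caller-owned workspace.
     * The workspace must outlive the returned ZSTD_CCtx and is freed by the caller. */
    static ZSTD_CCtx* makeStaticCCtx(int maxCompressionLevel, void** workspaceOut)
    {
        size_t const workspaceSize = ZSTD_estimateCCtxSize(maxCompressionLevel);
        void* const workspace = malloc(workspaceSize);
        if (workspace == NULL) return NULL;
        *workspaceOut = workspace;
        /* returns NULL if the workspace is too small or badly aligned */
        return ZSTD_initStaticCCtx(workspace, workspaceSize);
    }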
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). 
++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. 
+ * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. 
++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. 
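These experimental bounds are applied through the regular parameter setters; a hedged sketch, assuming the userspace <zstd.h> static-linking API (values are arbitrary, and real code should check each return with ZSTD_isError()):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    static void tuneContexts(ZSTD_CCtx* cctx, ZSTD_DCtx* dctx)
    {
        /* cap the block size on both sides (1KB .. ZSTD_BLOCKSIZE_MAX) */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, 16 * 1024);
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, 16 * 1024);
        /* prefetch cold CDict tables when they are used in place */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_prefetchCDictTables, ZSTD_ps_enable);
        /* opt out of the Huffman assembly path at runtime */
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_disableHuffmanAssembly, 1);
    }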
++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! 
+@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. 
This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. 
++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. 
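To make the registration flow of the block-level sequence producer API above concrete, a minimal sketch assuming the userspace <zstd.h> static-linking API; the producer below deliberately declines every block and leans on the fallback path:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Trivial producer: decline every block so zstd's internal parser handles it. */
    static size_t declineEveryBlock(void* state,
                                    ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                    const void* src, size_t srcSize,
                                    const void* dict, size_t dictSize,
                                    int compressionLevel, size_t windowSize)
    {
        (void)state; (void)outSeqs; (void)outSeqsCapacity; (void)src; (void)srcSize;
        (void)dict; (void)dictSize; (void)compressionLevel; (void)windowSize;
        return ZSTD_SEQUENCE_PRODUCER_ERROR;
    }

    static void setupProducer(ZSTD_CCtx* cctx)
    {
        ZSTD_registerSequenceProducer(cctx, NULL /* producer state */, declineEveryBlock);
        /* without fallback, a declined block would fail the whole compression */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
    }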
+ * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. ++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). 
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. 
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..464c410b2768 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..16c3d08e8d1a +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */
++
++/* This file provides custom allocation primitives
++ */
++
++#define ZSTD_DEPS_NEED_MALLOC
++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
++
++#include "compiler.h" /* MEM_STATIC */
++#define ZSTD_STATIC_LINKING_ONLY
++#include <linux/zstd.h> /* ZSTD_customMem */
++
++#ifndef ZSTD_ALLOCATIONS_H
++#define ZSTD_ALLOCATIONS_H
++
++/* custom memory allocation functions */
++
++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc)
++ return customMem.customAlloc(customMem.opaque, size);
++ return ZSTD_malloc(size);
++}
++
++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc) {
++ /* calloc implemented as malloc+memset;
++ * not as efficient as calloc, but next best guess for custom malloc */
++ void* const ptr = customMem.customAlloc(customMem.opaque, size);
++ ZSTD_memset(ptr, 0, size);
++ return ptr;
++ }
++ return ZSTD_calloc(1, size);
++}
++
++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
++{
++ if (ptr!=NULL) {
++ if (customMem.customFree)
++ customMem.customFree(customMem.opaque, ptr);
++ else
++ ZSTD_free(ptr);
++ }
++}
++
++#endif /* ZSTD_ALLOCATIONS_H */
+diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
+new file mode 100644
+index 000000000000..aa3487ec4b6a
+--- /dev/null
++++ b/lib/zstd/common/bits.h
+@@ -0,0 +1,149 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
++/*
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
++ * All rights reserved.
++ *
++ * This source code is licensed under both the BSD-style license (found in the
++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
++ * in the COPYING file in the root directory of this source tree).
++ * You may select, at your option, one of the above-listed licenses.
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..6a13f1f0f1e8 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + /*-******************************************** + * bitStream decoding API (read backward) + **********************************************/ ++typedef size_t BitContainerType; + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); + MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. 
+ * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. +@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. 
+ * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); +@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . +- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. 
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+ {
+- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */
++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */
++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) {
++ static const BitContainerType zeroFilled = 0;
++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */
++ /* overflow detected, erroneous scenario or end of stream: no update */
+ return BIT_DStream_overflow;
++ }
++
++ assert(bitD->ptr >= bitD->start);
+
+ if (bitD->ptr >= bitD->limitPtr) {
+- return BIT_reloadDStreamFast(bitD);
++ return BIT_reloadDStream_internal(bitD);
+ }
+ if (bitD->ptr == bitD->start) {
++ /* reached end of bitStream => no update */
+ if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+ return BIT_DStream_completed;
+ }
+- /* start < ptr < limitPtr */
++ /* start < ptr < limitPtr => cautious update */
+ { U32 nbBytes = bitD->bitsConsumed >> 3;
+ BIT_DStream_status result = BIT_DStream_unfinished;
+ if (bitD->ptr - nbBytes < bitD->start) {
+diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h
+index c42d39faf9bd..508ee25537bb 100644
+--- a/lib/zstd/common/compiler.h
++++ b/lib/zstd/common/compiler.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -11,6 +12,8 @@
+ #ifndef ZSTD_COMPILER_H
+ #define ZSTD_COMPILER_H
+
++#include <linux/types.h>
++
+ #include "portability_macros.h"
+
+ /*-*******************************************************
+@@ -41,12 +44,15 @@
+ */
+ #define WIN_CDECL
+
++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
++#define UNUSED_ATTR __attribute__((unused))
++
+ /*
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to eliminate the constant
+ * branches.
+ */
+-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR
+ /*
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compilers
+@@ -61,11 +67,21 @@
+ #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+ # define HINT_INLINE static INLINE_KEYWORD
+ #else
+-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
++# define HINT_INLINE FORCE_INLINE_TEMPLATE
+ #endif
+
+-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
+-#define UNUSED_ATTR __attribute__((unused))
++/* "soft" inline :
++ * The compiler is free to select if it's a good idea to inline or not.
++ * The main objective is to silence compiler warnings
++ * when a defined function in included but not used.
++ *
++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit.
++ * Updating the prefix is probably preferable, but requires a fairly large codemod,
++ * since this name is used everywhere.
++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,9 +143,9 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ +@@ -179,6 +196,85 @@ + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without trigging ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. 
++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..8eb6aa9a3b20 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..226ba3c57ec3 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable is only declared, + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) 
{ \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) do { } while (0) /* disabled */ + #endif + + +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..a4062d30d170 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..0410ca415b54 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +90,12 @@ void _force_has_format_string(const char *format, ...) { + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +106,50 @@ void _force_has_format_string(const char *format, ...) { + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) \ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); ++#define FORWARD_IF_ERROR(err, ...) 
\ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + + #endif /* ERROR_H_MODULE */ +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..2185a578617d 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. 
+-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -286,6 +227,7 @@ If there is an error, the function will return an error code, which can be teste + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY + +@@ -317,16 +259,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. 
++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..3a17e84f27bf 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..57462466e188 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). 
+- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ +- ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + +-/* *** Advanced function *** */ + +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif +- +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index c22a2e69bf46..d9bd752fe17b 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..f08638cced6c 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. 
It MUST not contain any C code. + * +@@ -45,6 +46,8 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif +@@ -65,7 +68,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS, other platforms may +@@ -90,4 +93,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..44b95b25344a 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 2c34e8a33a1c..f931f7d0e294 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. ++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..11da1233e890 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,7 +29,6 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 +@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 + typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 +@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -289,11 +285,11 @@ typedef enum { + typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + +@@ -301,8 +297,8 @@ typedef struct { + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + + /* ZSTD_invalidateRepCodes() : +@@ -420,13 +357,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..44a3c10becf2 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream 
flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. 
+- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). 
++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) 
return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart 
= (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new 
huffman table */ +@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index 16bb995bc6c4..885167f7e47b 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +28,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +57,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. 
++ * @param params Validated zstd parameters. + */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame 
header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return 
(size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; + + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); 
++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/
++ assert(cctx->cdict == dl->cdict);
+ return 0;
+ }
+ assert(dl->dictSize > 0);
+@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+ }
+
+ size_t ZSTD_CCtx_loadDictionary_advanced(
+- ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
++ ZSTD_CCtx* cctx,
++ const void* dict, size_t dictSize,
++ ZSTD_dictLoadMethod_e dictLoadMethod,
++ ZSTD_dictContentType_e dictContentType)
+ {
+- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't load a dictionary when ctx is not in init stage.");
+ DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+- ZSTD_clearAllDicts(cctx); /* in case one already exists */
+- if (dict == NULL || dictSize == 0) /* no dictionary mode */
++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
++ "Can't load a dictionary when cctx is not in init stage.");
++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */
++ if (dict == NULL || dictSize == 0) /* no dictionary */
+ return 0;
+ if (dictLoadMethod == ZSTD_dlm_byRef) {
+ cctx->localDict.dict = dict;
+ } else {
++ /* copy dictionary content inside CCtx to own its lifetime */
+ void* dictBuffer;
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+- "no malloc for static CCtx");
++ "static CCtx can't allocate for an internal copy of dictionary");
+ dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
+- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
++ "allocation failed for dictionary content");
+ ZSTD_memcpy(dictBuffer, dict, dictSize);
+- cctx->localDict.dictBuffer = dictBuffer;
+- cctx->localDict.dict = dictBuffer;
++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */
++ cctx->localDict.dict = dictBuffer; /* read-only reference */
+ }
+ cctx->localDict.dictSize = dictSize;
+ cctx->localDict.dictContentType = dictContentType;
+@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't reset parameters only when not in init stage.");
++ "Reset parameters is only possible during init stage.");
+ ZSTD_clearAllDicts(cctx);
+ return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+ }
+@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+ static ZSTD_compressionParameters
+ ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+ {
+-# define CLAMP_TYPE(cParam, val, type) { \
+- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
+- if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
+- else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+- }
++# define CLAMP_TYPE(cParam, val, type) \
++ do { \
++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
++ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
++ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
++ } while (0)
+ # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+ CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+ CLAMP(ZSTD_c_chainLog, cParams.chainLog);
+@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters
+ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+ unsigned long long srcSize,
+ size_t dictSize,
+- ZSTD_cParamMode_e mode)
++ ZSTD_cParamMode_e mode,
++ ZSTD_paramSwitch_e useRowMatchFinder)
+ {
+ const U64 minSrcSize = 513; /* (1<<9) + 1 */
+ const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
+
assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. 
So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((1<strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace +@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 
3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize
+ : 0;
+@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+
+ return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
+- ZSTD_CONTENTSIZE_UNKNOWN);
++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+ }
+ }
+
+@@ -1637,6 +1879,19 @@ typedef enum {
+ ZSTD_resetTarget_CCtx
+ } ZSTD_resetTarget_e;
+
++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */
++static U64 ZSTD_bitmix(U64 val, U64 len) {
++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
++ val *= 0x9FB21C651E98DF25ULL;
++ val ^= (val >> 35) + len ;
++ val *= 0x9FB21C651E98DF25ULL;
++ return val ^ (val >> 28);
++}
++
++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) {
++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
++}
+
+ static size_t
+ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+
+ ms->hashLog3 = hashLog3;
++ ms->lazySkipping = 0;
+
+ ZSTD_invalidateMatchState(ms);
+
+@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ ZSTD_cwksp_clean_tables(ws);
+ }
+
+- /* opt parser space */
+- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+- DEBUGLOG(4, "reserving optimal parser space");
+- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+- ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
+- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+- }
+-
+ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+- { /* Row match finder needs an additional table of hashes ("tags") */
+- size_t const tagTableSize = hSize*sizeof(U16);
+- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ /* Row match finder needs an additional table of hashes ("tags") */
++ size_t const tagTableSize = hSize;
++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use
++ * 0 when we reset a Cdict */
++ if(forWho == ZSTD_resetTarget_CCtx) {
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
++ ZSTD_advanceHashSalt(ms);
++ } else {
++ /* When we are not salting we want to always memset the memory */
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
++ ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ ms->hashSalt = 0;
+ }
+ { /* Switch to 32-entry rows if searchLog is 5 (or more) */
+ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+ }
+
++ /* opt parser space */
++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
++ DEBUGLOG(4, "reserving optimal parser space");
++ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws,
(MaxLL+1) * sizeof(unsigned)); ++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ } ++ + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); +- int resizeWorkspace; ++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + +@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); +- resizeWorkspace = workspaceTooSmall || workspaceWasteful; ++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + +@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm 
== ZSTD_ps_enable) { ++ /* TODO: avoid memset? */ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (ZSTD_hasExtSeqProd(params)) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->extSeqBufCapacity = maxNbExternalSeq; ++ zc->extSeqBuf = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? 
*/ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. ++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ 
params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u<nbSeq; u++) { + U32 const llv = sequences[u].litLength; ++ U32 const ofCode = ZSTD_highbit32(sequences[u].offBase); + U32 const mlv = sequences[u].mlBase; + llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); +- ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase); ++ ofCodeTable[u] = (BYTE)ofCode; + mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); ++ assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2647,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2658,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +-
const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t*
seqStorePtr, + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. + */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. 
++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, +@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- 
ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); +@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } +@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; ++ const seqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = seqStore->sequences - inSeqs; ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; + +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. ++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. 
*/ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + +@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); 
++ } ++ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? 
don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, 
sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const 
ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move 
longLengthPos into the correct position if necessary */ +@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). ++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3481,45 +4027,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. 
+ * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). 
++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. 
+@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap 
ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. 
*/ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); + } + ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } ++ } ++ ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. 
*/ ++#endif + break; + + default: +@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + /* We only set the loaded table as valid if it contains all non-zero + * weights. Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + +@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! 
ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, 
cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4813,7 +5452,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + if (!cdict) + return NULL; + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4908,6 +5547,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4987,12 +5627,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. 
+ */ +@@ -5002,7 +5647,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5199,30 +5844,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5231,8 +5887,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5247,7 +5905,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5264,8 +5922,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5276,6 +5933,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5283,9 +5954,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5293,9 +5963,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5308,19 +5978,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5390,8 +6057,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5410,22 +6079,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the 
local dict if present. */ +@@ -5439,9 +6108,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5453,6 +6122,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5479,6 +6151,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5493,8 +6167,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5512,13 +6205,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5541,6 +6241,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5551,64 +6252,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5617,25 +6315,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5644,26 +6372,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5675,6 +6392,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5682,7 +6402,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5690,7 +6410,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5704,7 +6424,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. 
So, we have to split the sequence */ +@@ -5744,21 +6463,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5781,7 +6502,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5795,6 +6516,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. 
++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. + * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5807,9 +6579,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5829,22 +6598,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. 
Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5853,6 +6629,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5861,11 +6638,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
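
/* A minimal sketch of the 3-byte block header that the raw/RLE/compressed paths above
 * emit via MEM_writeLE24(): bit 0 is the last-block flag, bits 1-2 are the block type
 * (0 = raw, 1 = RLE, 2 = compressed), and bits 3..23 hold the block size. Standalone
 * illustration; the helper name and the sample size are not from the patch. */
#include <stdint.h>
#include <stdio.h>

static void write_block_header(uint8_t out[3], int lastBlock, uint32_t blockType, uint32_t blockSize)
{
    uint32_t const header = (uint32_t)lastBlock + (blockType << 1) + (blockSize << 3);
    out[0] = (uint8_t)header;            /* little-endian, as MEM_writeLE24() does */
    out[1] = (uint8_t)(header >> 8);
    out[2] = (uint8_t)(header >> 16);
}

int main(void)
{
    uint8_t hdr[3];
    write_block_header(hdr, 1, 2 /* compressed */, 1000);
    printf("%02X %02X %02X\n", hdr[0], hdr[1], hdr[2]);  /* 45 1F 00 : last block, compressed, 1000 bytes */
    return 0;
}
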
+ * This is only an issue for zstd <= v1.4.3 +@@ -5876,12 +6653,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5893,11 +6670,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5908,12 +6684,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5923,7 +6702,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5951,26 +6730,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6092,7 +6879,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6127,3 +6914,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..53cb582a8d2b 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. 
++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. + This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,26 +145,33 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -212,8 +222,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +240,18 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { +@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. */ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +490,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. 
+ * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. 
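
/* A minimal sketch of the offBase "sum type" handled by the REPCODE_TO_OFFBASE()/
 * OFFSET_TO_OFFBASE()/OFFBASE_IS_REPCODE() macros above, assuming ZSTD_REP_NUM == 3:
 * values 1..3 name repeat offsets 1..3, anything larger is a real match distance
 * shifted up by 3. Helper names here are illustrative and drop the asserts. */
#include <stdio.h>

#define REP_NUM 3
static unsigned offset_to_offbase(unsigned o)  { return o + REP_NUM; }            /* requires o > 0 */
static unsigned repcode_to_offbase(unsigned r) { return r; }                       /* requires 1 <= r <= 3 */
static int      offbase_is_repcode(unsigned b) { return 1 <= b && b <= REP_NUM; }

int main(void)
{
    printf("%u\n", offset_to_offbase(1024));   /* 1027 : real distance 1024 */
    printf("%u\n", repcode_to_offbase(2));     /* 2    : "repeat offset #2" */
    printf("%d\n", offbase_is_repcode(1027));  /* 0    : too large to be a repcode */
    return 0;
}
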
+ */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; +@@ -673,11 +723,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t 
ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. 
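
/* A minimal sketch of the salting scheme used by the ZSTD_hash*PtrS() helpers above:
 * the multiplicative hash is XORed with a salt before the final shift, so a fresh salt
 * re-randomizes bucket placement and lets the row-based tag table be reused without
 * clearing it. The prime constant is copied from the code above; the sample input,
 * table log, and salt value are illustrative. */
#include <stdint.h>
#include <stdio.h>

static uint32_t hash4_salted(uint32_t u, uint32_t hBits, uint32_t salt)
{
    const uint32_t prime4bytes = 2654435761U;
    return ((u * prime4bytes) ^ salt) >> (32 - hBits);
}

int main(void)
{
    uint32_t const v = 0x12345678u;
    printf("%u\n", hash4_salted(v, 17, 0));            /* unsalted bucket */
    printf("%u\n", hash4_salted(v, 17, 0x9E3779B9u));  /* same input, different salt => different bucket */
    return 0;
}
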
+ */ +@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize, + int forceNonContiguous) + { +@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. 
*/ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. ++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
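
/* A minimal sketch of the external sequence producer hook that extSeqProdFunc /
 * ZSTD_hasExtSeqProd() above refer to, assuming upstream zstd's v1.5.x experimental
 * callback shape and names (ZSTD_sequenceProducer_F, ZSTD_SEQUENCE_PRODUCER_ERROR,
 * ZSTD_c_enableSeqProducerFallback); this interface is not part of the kernel's
 * linux/zstd.h wrapper, and the producer below is a deliberately useless stub. */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* A producer that always declines, so compression falls back to the internal
 * match finders when the fallback parameter is enabled. */
static size_t declineProducer(void* state,
                              ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                              const void* src, size_t srcSize,
                              const void* dict, size_t dictSize,
                              int compressionLevel, size_t windowSize)
{
    (void)state; (void)outSeqs; (void)outSeqsCapacity; (void)src; (void)srcSize;
    (void)dict; (void)dictSize; (void)compressionLevel; (void)windowSize;
    return ZSTD_SEQUENCE_PRODUCER_ERROR;
}

int main(void)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    /* mirrors the ZSTD_registerSequenceProducer() definition earlier in this patch */
    ZSTD_registerSequenceProducer(cctx, NULL, declineProducer);
    ZSTD_freeCCtx(cctx);
    return 0;
}
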
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..3e9ea46a670a 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
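
/* A minimal sketch of the two gates visible above that decide whether Huffman output is
 * kept for a literals section: the input must reach ZSTD_minLiteralsToCompress() at all,
 * and the compressed size must beat srcSize minus ZSTD_minGain(). The formulas mirror
 * the code above (no reusable table, strategies below btultra, so minlog = 6); the
 * strategy number and sizes are illustrative. */
#include <stddef.h>
#include <stdio.h>

static size_t min_literals_to_compress(int strategy)   /* 1..9, no valid repeat stats */
{
    int const shift = (9 - strategy) < 3 ? (9 - strategy) : 3;
    return (size_t)8 << shift;   /* 64 bytes for fast levels, down to 8 for btultra2 */
}

static size_t min_gain(size_t srcSize)                  /* strategies below btultra */
{
    return (srcSize >> 6) + 2;
}

int main(void)
{
    size_t const srcSize = 4096;
    printf("%zu\n", min_literals_to_compress(1));        /* 64 */
    printf("%zu\n", srcSize - min_gain(srcSize));        /* 4030: Huffman output must be smaller than this */
    return 0;
}
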
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..5c028c78d889 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..7fe6f4ff5cf2 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..41f6521b27cd 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? 
HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) 
return 0;
+ op += cLitSize;
+@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+ sequences, nbSeq,
+ llCode, mlCode, ofCode,
+ cctxParams,
+- op, oend-op,
++ op, (size_t)(oend-op),
+ bmi2, writeSeqEntropy, seqEntropyWritten);
+ FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
+ if (cSeqSize == 0) return 0;
+ op += cSeqSize;
+ }
+ /* Write block header */
+- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize;
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+ MEM_writeLE24(ostart, cBlockHeader24);
+ }
+- return op-ostart;
++ return (size_t)(op-ostart);
+ }
+
+ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
+@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
+ return cSeqSizeEstimate + sequencesSectionHeaderSize;
+ }
+
+-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
++typedef struct {
++ size_t estLitSize;
++ size_t estBlockSize;
++} EstimatedBlockSize;
++static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+ const BYTE* ofCodeTable,
+ const BYTE* llCodeTable,
+ const BYTE* mlCodeTable,
+@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+ const ZSTD_entropyCTables_t* entropy,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ void* workspace, size_t wkspSize,
+- int writeLitEntropy, int writeSeqEntropy) {
+- size_t cSizeEstimate = 0;
+- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
+- &entropy->huf, &entropyMetadata->hufMetadata,
+- workspace, wkspSize, writeLitEntropy);
+- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
++ int writeLitEntropy, int writeSeqEntropy)
++{
++ EstimatedBlockSize ebs;
++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize,
++ &entropy->huf, &entropyMetadata->hufMetadata,
++ workspace, wkspSize, writeLitEntropy);
++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+ workspace, wkspSize, writeSeqEntropy);
+- return cSizeEstimate + ZSTD_blockHeaderSize;
++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize;
++ return ebs;
+ }
+
+ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
+@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe
+ return 0;
+ }
+
++static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount)
++{
++ size_t n, total = 0;
++ assert(sp != NULL);
++ for (n=0; n<seqCount; n++) {
++ total += ZSTD_getSequenceLength(seqStore, sp+n).litLength;
++ }
++ DEBUGLOG(6, "countLiterals for %zu sequences from %p => %zu bytes", seqCount, (const void*)sp, total);
++ return total;
++}
++
++#define BYTESCALE 256
++
++static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs,
++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost,
++ int firstSubBlock)
++{
++ size_t n, budget = 0, inSize=0;
++ /* entropy headers */
++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */
++ assert(firstSubBlock==0 || firstSubBlock==1);
++ budget += headerSize;
++
++ /* first sequence => at least one sequence*/
++ budget += sp[0].litLength * avgLitCost + avgSeqCost;
++ if (budget > targetBudget) return 1;
++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH);
++
++ /* loop over sequences */
++ for (n=1; n<nbSeqs; n++) {
++ size_t currentCost = sp[n].litLength * avgLitCost + avgSeqCost;
++ budget += currentCost;
++ inSize += sp[n].litLength + (sp[n].mlBase+MINMATCH);
++ /* stop when sub-block budget is reached */
++ if ( (budget > targetBudget)
++ /* though continue to expand until the sub-block is deemed compressible */
++ && (budget < inSize * BYTESCALE) )
++ break;
++ }
++
++ return n;
++}
++
+ /* ZSTD_compressSubBlock_multi() :
+ * Breaks super-block into multiple sub-blocks and compresses them.
+- * Entropy will be written to the first block.
+- * The following blocks will use repeat mode to compress.
+- * All sub-blocks are compressed blocks (no raw or rle blocks).
+- * @return : compressed size of the super block (which is multiple ZSTD blocks)
+- * Or 0 if it failed to compress. */
++ * Entropy will be written into the first block.
++ * The following blocks use repeat_mode to compress.
++ * Sub-blocks are all compressed, except the last one when beneficial.
++ * @return : compressed size of the super block (which features multiple ZSTD blocks)
++ * or 0 if it failed to compress. */
+ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ const ZSTD_compressedBlockState_t* prevCBlock,
+ ZSTD_compressedBlockState_t* nextCBlock,
+@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ {
+ const seqDef* const sstart = seqStorePtr->sequencesStart;
+ const seqDef* const send = seqStorePtr->sequences;
+- const seqDef* sp = sstart;
++ const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */
++ size_t const nbSeqs = (size_t)(send - sstart);
+ const BYTE* const lstart = seqStorePtr->litStart;
+ const BYTE* const lend = seqStorePtr->lit;
+ const BYTE* lp = lstart;
++ size_t const nbLiterals = (size_t)(lend - lstart);
+ BYTE const* ip = (BYTE const*)src;
+ BYTE const* const iend = ip + srcSize;
+ BYTE* const ostart = (BYTE*)dst;
+@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ const BYTE* llCodePtr = seqStorePtr->llCode;
+ const BYTE* mlCodePtr = seqStorePtr->mlCode;
+ const BYTE* ofCodePtr = seqStorePtr->ofCode;
+- size_t targetCBlockSize = cctxParams->targetCBlockSize;
+- size_t litSize, seqCount;
+- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */
++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize);
++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed);
+ int writeSeqEntropy = 1;
+- int lastSequence = 0;
+-
+- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
+- (unsigned)(lend-lp), (unsigned)(send-sstart));
+-
+- litSize = 0;
+- seqCount = 0;
+- do {
+- size_t cBlockSizeEstimate = 0;
+- if (sstart == send) {
+- lastSequence = 1;
+- } else {
+- const seqDef* const sequence = sp + seqCount;
+- lastSequence = sequence == send - 1;
+- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
+- seqCount++;
+- }
+- if (lastSequence) {
+- assert(lp <= lend);
+- assert(litSize <= (size_t)(lend - lp));
+- litSize = (size_t)(lend - lp);
++
++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)",
++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart));
++
++ /* let's start by a general estimation for the full block */
++ if (nbSeqs > 0) {
++ EstimatedBlockSize const ebs =
++ ZSTD_estimateSubBlockSize(lp, nbLiterals,
++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs,
++ &nextCBlock->entropy, entropyMetadata,
++ workspace, wkspSize,
++ writeLitEntropy, writeSeqEntropy);
++ /* quick estimation */
++ size_t const avgLitCost = nbLiterals ?
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; ++ const seqDef* seq; + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..86bc3c2c23c7 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. ++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. 
*/ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + } + ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. 
*/ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ +@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + + /* + * Aligned on 64 bytes. These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). 
+ * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. +- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..5ff54f17d92f 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. ++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + 
/* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +276,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +302,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -254,6 +313,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +462,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +490,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +519,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +532,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..b7ddc714f13e 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -15,8 +16,12 @@ + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + + #endif /* ZSTD_DOUBLE_FAST_H */ +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..b7a63ba4ce56 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + * + * This is also the work we do at the beginning to enter the loop initially. 
+ */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls, U32 const hasStep) +@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +236,14 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +257,12 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +292,21 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +337,24 @@ ZSTD_compressBlock_fast_noDict_generic( + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. 
++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +365,7 @@ ZSTD_compressBlock_fast_noDict_generic( + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +385,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +399,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) +@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
*/ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); 
/* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { +@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd 
= 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? 
dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. */ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..e64d9e1b2d39 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..3e88d8a1a136 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( 
(4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 
ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. +- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. 
++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. 
++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = 
vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every contex reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by intorducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. 
+- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. + */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, +@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> 
kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = 
STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1745,8 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,166 +1760,181 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, 
srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, 
mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2120,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2045,8 +2136,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict( + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..22c9201f4e63 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
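The zstd_lazy.c hunks above only engage the new lazy-skipping mode once a single skip after a failed search exceeds kLazySkippingStep bytes. Below is a minimal standalone sketch of that trigger, assuming kSearchStrength = 8 and kLazySkippingStep = 8; both constants are inferred from the "2KB without finding any matches" comment in the patch, not quoted from it.

    #include <stddef.h>

    #define kSearchStrength   8   /* assumed: log2 scale of the skip growth */
    #define kLazySkippingStep 8   /* assumed: skip size that flips lazySkipping on */

    /* Sketch of the parser's behaviour after a failed search: the skip grows
     * with the distance since the last stored sequence, and once one skip is
     * larger than kLazySkippingStep (roughly 2KB without a match with the
     * values above) the matcher stops inserting every position into its tables. */
    static size_t advance_after_miss(size_t bytesSinceAnchor, int *lazySkipping)
    {
        size_t const step = (bytesSinceAnchor >> kSearchStrength) + 1;
        *lazySkipping = step > kLazySkippingStep;
        return step;   /* the caller advances: ip += step; */
    }

With these values a skip only exceeds 8 bytes once more than 2048 bytes have passed since the anchor, which matches the trade-off described in the patch comment: normal data keeps full table updates, incompressible runs are skimmed.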
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,98 +23,175 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + + void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define 
ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* 
ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif ++ + + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..07f3bc6437ce 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
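The zstd_lazy.h hunks above pair each ZSTD_EXCLUDE_*_BLOCK_COMPRESSOR option with ZSTD_COMPRESSBLOCK_* macros that expand either to the real function or to NULL, so a strategy table can always be populated even when a compressor is compiled out. The following is a small compile-clean sketch of that pattern; the names, signature and table are illustrative and are not the patch's actual selector code.

    #include <stddef.h>

    typedef size_t (*blockCompressor_f)(const void *src, size_t srcSize);

    /* stand-in for one strategy; the real functions also take ms/seqStore/rep */
    static size_t greedy_stub(const void *src, size_t srcSize)
    { (void)src; return srcSize; }

    #ifndef EXCLUDE_GREEDY_STUB          /* mirrors ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR */
    #  define BLOCK_GREEDY greedy_stub
    #else
    #  define BLOCK_GREEDY NULL          /* compiled out: the table slot stays NULL */
    #endif

    static const blockCompressor_f strategyTable[1] = { BLOCK_GREEDY };
    /* compression parameters are adjusted so an excluded strategy is never
     * selected, which is why a NULL slot is acceptable at lookup time */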
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { +@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..c540731abde7 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
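ZSTD_ldm_blockCompress above now stores its sequence offsets through OFFSET_TO_OFFBASE, the same "offBase" sumtype the lazy and optimal parsers switch to in this patch: values 1..ZSTD_REP_NUM name repcodes, anything above encodes a real offset shifted by ZSTD_REP_NUM. Here is a self-contained sketch of that convention; the macro bodies are reconstructed from the comments in the patch ("sumtype, using the numeric representation of ZSTD_storeSeq()") rather than copied from zstd_compress_internal.h, so treat the exact arithmetic as an assumption.

    #include <assert.h>

    #define REP_NUM 3                      /* stands in for ZSTD_REP_NUM */

    /* One numeric space carries both cases:
     *   1..REP_NUM           -> repcode 1..3
     *   REP_NUM+1 and above  -> real offset, stored as offset + REP_NUM */
    #define OFFSET_TO_OFFBASE(o)   ((o) + REP_NUM)
    #define REPCODE_TO_OFFBASE(r)  (r)
    #define OFFBASE_IS_OFFSET(ob)  ((ob) > REP_NUM)
    #define OFFBASE_TO_OFFSET(ob)  ((ob) - REP_NUM)

    int main(void)
    {
        unsigned const ob = OFFSET_TO_OFFBASE(42u);
        assert(OFFBASE_IS_OFFSET(ob) && OFFBASE_TO_OFFSET(ob) == 42u);
        assert(!OFFBASE_IS_OFFSET(REPCODE_TO_OFFBASE(1u)));  /* repcodes stay in 1..3 */
        return 0;
    }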
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..a87b66ac8d24 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,11 +13,14 @@ + #include "hist.h" + #include "zstd_opt.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +30,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s<lastEltIndex+1; s++) { +- table[s] = 1 + (table[s] >> shift); +- sum += table[s]; ++ unsigned const base = base1 ?
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics 
*/ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, +@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal( + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, +@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* 
nbMatches, + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltID) + + #endif + +-FORCE_INLINE_TEMPLATE size_t ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t + ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip)); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. 
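A compact illustration of the stretch/sequence distinction described above (simplified records with invented field layout; zstd's ZSTD_optimal_t also tracks price, offset and repcode history): the same segmentation of a block can be written either as stretches (match first) for the forward pass or as sequences (literals first) for the seqStore.

    #include <stdio.h>

    typedef struct { unsigned mlen, litlen; } stretch_t;   /* a match, then litlen literals */
    typedef struct { unsigned litlen, mlen; } sequence_t;  /* litlen literals, then a match */

    int main(void)
    {
        /* cutting the 12-symbol run  M M M L L M M M M L L L  both ways */
        stretch_t  st[2] = { {3, 2}, {4, 3} };          /* forward-pass view   */
        sequence_t sq[2] = { {0, 3}, {2, 4} };          /* emission-order view */
        unsigned const trailingLits = st[1].litlen;     /* last literals have no following match */
        printf("stretches : (m%u,l%u)(m%u,l%u)\n",
               st[0].mlen, st[0].litlen, st[1].mlen, st[1].litlen);
        printf("sequences : (l%u,m%u)(l%u,m%u) + %u trailing literals\n",
               sq[0].litlen, sq[0].mlen, sq[1].litlen, sq[1].mlen, trailingLits);
        return 0;
    }
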
*/ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { ++ ZSTD_optimal_t const prevMatch = opt[cur]; + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", ++ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
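The literal-extension step earlier in this hunk prices "one more literal" incrementally, via LIT_PRICE() plus LL_INCPRICE(litlen). A toy check of why that works, with a made-up length-cost curve standing in for ZSTD_litLengthPrice(): summing the per-step deltas reproduces the cost of pricing the final run length once.

    #include <assert.h>
    #include <stdio.h>

    /* invented literal-length cost curve, standing in for ZSTD_litLengthPrice() */
    static int ll_price(unsigned l)    { return 10 + 3 * (int)l; }
    static int ll_incprice(unsigned l) { return ll_price(l) - ll_price(l - 1); }

    int main(void)
    {
        int const litCost = 7;          /* pretend each literal byte costs 7 (LIT_PRICE stand-in) */
        int running = ll_price(0);      /* price of an empty literal run */
        unsigned l;
        for (l = 1; l <= 5; l++)
            running += litCost + ll_incprice(l);    /* same shape as the patch's update */
        assert(running == 5 * litCost + ll_price(5));
        printf("incremental total %d == direct total %d\n", running, 5 * litCost + ll_price(5));
        return 0;
    }
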
+ */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + +@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", ++ inr-istart, cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
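The backward reconstruction sketched in the comment above can be pictured with a much-simplified table walk (field names invented; the real code stores stretches and fixes up litlen while copying into the tail of @opt): each entry records how its position was reached, so stepping back by mlen + litlen enumerates the chosen sequences from last to first.

    #include <stdio.h>

    typedef struct { unsigned mlen, litlen; } node_t;  /* how position pos was reached */

    int main(void)
    {
        node_t opt[16] = { {0, 0} };
        opt[5]  = (node_t){5, 0};   /* match of 5 starting at 0     */
        opt[9]  = (node_t){3, 1};   /* 1 literal, then a match of 3 */
        opt[14] = (node_t){4, 1};   /* 1 literal, then a match of 4 */

        unsigned pos = 14;          /* walk back from the selected endpoint */
        while (pos > 0) {
            node_t const n = opt[pos];
            printf("sequence: %u literal(s) + match of %u (ends at %u)\n",
                   n.litlen, n.mlen, pos);
            pos -= n.mlen + n.litlen;
        }
        return 0;
    }
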
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,11 +1422,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,21 +1437,27 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return 
ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt( + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ++ seqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. 
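In miniature, the 2-passes idea reads as: a throwaway first pass only accumulates symbol statistics, its other outputs are discarded, and the real pass then prices symbols from those seeded statistics instead of the predefined defaults. The sketch below uses invented names and only models the statistics-gathering half.

    #include <stdio.h>
    #include <string.h>

    /* invented stand-in for the entropy statistics seeded by the first pass */
    typedef struct { unsigned litFreq[256]; unsigned total; } stats_t;

    static void first_pass_collect(stats_t* s, const unsigned char* src, size_t n)
    {
        size_t i;
        for (i = 0; i < n; i++) s->litFreq[src[i]]++;
        s->total += (unsigned)n;
    }

    int main(void)
    {
        const unsigned char block[] = "abracadabra abracadabra";
        stats_t stats;
        memset(&stats, 0, sizeof(stats));
        first_pass_collect(&stats, block, sizeof(block) - 1);   /* pass 1: statistics only */
        /* a real second pass would now restart compression, reusing `stats` for pricing */
        printf("seeded stats: %u symbols, 'a' seen %u times\n", stats.total, stats.litFreq['a']);
        return 0;
    }
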
+ * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState( + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDict( + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..ac1b743d27cd 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,30 +15,40 @@ + + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ + void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( ++size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDict( + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++ ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + + #endif /* ZSTD_OPT_H */ +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..ac8b87f48f84 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. ++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE 
const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. 
*/ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. +- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). +- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. 
*/ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
+ */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void 
HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. 
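The safe-iteration bound computed above follows directly from the constants in the comments: 5 symbols per stream per iteration, at most 11 bits per symbol, hence at most 55 bits (< 7 bytes) of input consumed per stream per iteration. A worked example of that arithmetic (the remaining byte counts are hypothetical):

    #include <stdio.h>

    int main(void)
    {
        unsigned const symbolsPerIter  = 5;
        unsigned const maxBitsPerSym   = 11;
        unsigned const maxBytesPerIter = (symbolsPerIter * maxBitsPerSym + 7) / 8;   /* == 7 */

        size_t const outputLeft = 123;   /* hypothetical oend - op[3]    */
        size_t const inputLeft  = 60;    /* hypothetical ip[0] - ilowest */

        size_t const oiters = outputLeft / symbolsPerIter;           /* 24 */
        size_t const iiters = inputLeft / maxBytesPerIter;           /*  8 */
        size_t const iters  = oiters < iiters ? oiters : iiters;     /*  8 */
        printf("bounds re-checked after %zu iterations (olimit = op[3] + %zu)\n",
               iters, iters * symbolsPerIter);
        return 0;
    }
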
++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS ++_out: ++ ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. + */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. 
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, 
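The dispatch pattern used by HUF_decompress4X1_usingDTable_internal() above, reduced to its shape (flag names and decoder stubs below are invented): select a fallback and a loop according to the flags, let the fast path decline by returning 0, and only then run the fallback.

    #include <stddef.h>
    #include <stdio.h>

    enum { FLAG_BMI2 = 1, FLAG_DISABLE_FAST = 2 };   /* invented flag bits */

    static size_t decode_fallback(const char* src, size_t n) { (void)src; return n; }
    static size_t decode_fast(const char* src, size_t n)     { (void)src; return n < 16 ? 0 : n; }

    static size_t decompress(const char* src, size_t n, int flags)
    {
        if (!(flags & FLAG_DISABLE_FAST)) {
            size_t const ret = decode_fast(src, n);   /* fast path declines with 0 */
            if (ret != 0) return ret;
        }
        return decode_fallback(src, n);
    }

    int main(void)
    {
        char buf[32] = {0};
        printf("large input: %zu bytes, tiny input: %zu bytes\n",
               decompress(buf, sizeof buf, 0), decompress(buf, 4, 0));
        return 0;
    }
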
++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* 
DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..c9cbc45f6ed9 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* 
note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -540,61 +570,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (U64_MAX - totalDstSize < fcs) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ 
totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : +@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ +@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return 
dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. 
+- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1871,7 @@ size_t 
ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. ++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize 
= ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. 
*/ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..9fe9a12c8a2c 100644 +--- 
a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. 
We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. ++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i<n; i++) { + tableDecode[position].baseValue = s; + position = (position + step) & tableMask; +- while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ + { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t
const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +-
const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. ++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + 
+ static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequences_t)( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); ++ const ZSTD_longOffset_e isLongOffset); + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- 
const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1982,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. 
++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); +- +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); ++ ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..becffbd89364 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..0f02526be774 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; ++ int isFrameDecompression; + #if DYNAMIC_BMI2 != 0 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index bd8784449b31..ceaf352d03e2 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index 469fc3059be0..0ae819f0c927 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -113,7 +113,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/9999-workqueue_increase_maximum_concurrency_limit.patch b/sys-kernel/gentoo-sources-6.12/9999-workqueue_increase_maximum_concurrency_limit.patch new file mode 100644 index 0000000..0930bbf --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/9999-workqueue_increase_maximum_concurrency_limit.patch @@ -0,0 +1,11 @@ +--- a/include/linux/workqueue.h 2024-11-18 19:21:27.602930590 +0100 ++++ b/include/linux/workqueue.h 2024-11-19 00:04:41.586700929 +0100 +@@ -412,7 +412,7 @@ + }; + + enum wq_consts { +- WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ ++ WQ_MAX_ACTIVE = 2048, /* I like 2048, better ideas? 
*/ + WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, + WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, + diff --git a/sys-kernel/gentoo-sources-6.13/0001-amd-pstate.patch b/sys-kernel/gentoo-sources-6.13/0001-amd-pstate.patch new file mode 100644 index 0000000..d5e5f37 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0001-amd-pstate.patch @@ -0,0 +1,885 @@ +From 46a700551a5ff45cbc27671d7ebd176826adb1c6 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:27:14 +0100 +Subject: [PATCH 01/12] amd-pstate + +Signed-off-by: Peter Jung +--- + drivers/cpufreq/amd-pstate-trace.h | 52 +++- + drivers/cpufreq/amd-pstate-ut.c | 12 +- + drivers/cpufreq/amd-pstate.c | 397 +++++++++++++++-------------- + drivers/cpufreq/amd-pstate.h | 3 - + 4 files changed, 259 insertions(+), 205 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h +index 35f38ae67fb1..8d692415d905 100644 +--- a/drivers/cpufreq/amd-pstate-trace.h ++++ b/drivers/cpufreq/amd-pstate-trace.h +@@ -32,7 +32,6 @@ TRACE_EVENT(amd_pstate_perf, + u64 aperf, + u64 tsc, + unsigned int cpu_id, +- bool changed, + bool fast_switch + ), + +@@ -44,7 +43,6 @@ TRACE_EVENT(amd_pstate_perf, + aperf, + tsc, + cpu_id, +- changed, + fast_switch + ), + +@@ -57,7 +55,6 @@ TRACE_EVENT(amd_pstate_perf, + __field(unsigned long long, aperf) + __field(unsigned long long, tsc) + __field(unsigned int, cpu_id) +- __field(bool, changed) + __field(bool, fast_switch) + ), + +@@ -70,11 +67,10 @@ TRACE_EVENT(amd_pstate_perf, + __entry->aperf = aperf; + __entry->tsc = tsc; + __entry->cpu_id = cpu_id; +- __entry->changed = changed; + __entry->fast_switch = fast_switch; + ), + +- TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u changed=%s fast_switch=%s", ++ TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u fast_switch=%s", + (unsigned long)__entry->min_perf, + (unsigned long)__entry->target_perf, + (unsigned long)__entry->capacity, +@@ -83,11 +79,55 @@ TRACE_EVENT(amd_pstate_perf, + (unsigned long long)__entry->aperf, + (unsigned long long)__entry->tsc, + (unsigned int)__entry->cpu_id, +- (__entry->changed) ? "true" : "false", + (__entry->fast_switch) ? 
"true" : "false" + ) + ); + ++TRACE_EVENT(amd_pstate_epp_perf, ++ ++ TP_PROTO(unsigned int cpu_id, ++ unsigned int highest_perf, ++ unsigned int epp, ++ unsigned int min_perf, ++ unsigned int max_perf, ++ bool boost ++ ), ++ ++ TP_ARGS(cpu_id, ++ highest_perf, ++ epp, ++ min_perf, ++ max_perf, ++ boost), ++ ++ TP_STRUCT__entry( ++ __field(unsigned int, cpu_id) ++ __field(unsigned int, highest_perf) ++ __field(unsigned int, epp) ++ __field(unsigned int, min_perf) ++ __field(unsigned int, max_perf) ++ __field(bool, boost) ++ ), ++ ++ TP_fast_assign( ++ __entry->cpu_id = cpu_id; ++ __entry->highest_perf = highest_perf; ++ __entry->epp = epp; ++ __entry->min_perf = min_perf; ++ __entry->max_perf = max_perf; ++ __entry->boost = boost; ++ ), ++ ++ TP_printk("cpu%u: [%u<->%u]/%u, epp=%u, boost=%u", ++ (unsigned int)__entry->cpu_id, ++ (unsigned int)__entry->min_perf, ++ (unsigned int)__entry->max_perf, ++ (unsigned int)__entry->highest_perf, ++ (unsigned int)__entry->epp, ++ (bool)__entry->boost ++ ) ++); ++ + #endif /* _AMD_PSTATE_TRACE_H */ + + /* This part must be outside protection */ +diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c +index a261d7300951..3a0a380c3590 100644 +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -207,7 +207,6 @@ static void amd_pstate_ut_check_freq(u32 index) + int cpu = 0; + struct cpufreq_policy *policy = NULL; + struct amd_cpudata *cpudata = NULL; +- u32 nominal_freq_khz; + + for_each_possible_cpu(cpu) { + policy = cpufreq_cpu_get(cpu); +@@ -215,14 +214,13 @@ static void amd_pstate_ut_check_freq(u32 index) + break; + cpudata = policy->driver_data; + +- nominal_freq_khz = cpudata->nominal_freq*1000; +- if (!((cpudata->max_freq >= nominal_freq_khz) && +- (nominal_freq_khz > cpudata->lowest_nonlinear_freq) && ++ if (!((cpudata->max_freq >= cpudata->nominal_freq) && ++ (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) && + (cpudata->lowest_nonlinear_freq > cpudata->min_freq) && + (cpudata->min_freq > 0))) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", +- __func__, cpu, cpudata->max_freq, nominal_freq_khz, ++ __func__, cpu, cpudata->max_freq, cpudata->nominal_freq, + cpudata->lowest_nonlinear_freq, cpudata->min_freq); + goto skip_test; + } +@@ -236,13 +234,13 @@ static void amd_pstate_ut_check_freq(u32 index) + + if (cpudata->boost_supported) { + if ((policy->max == cpudata->max_freq) || +- (policy->max == nominal_freq_khz)) ++ (policy->max == cpudata->nominal_freq)) + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; + else { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", + __func__, cpu, policy->max, cpudata->max_freq, +- nominal_freq_khz); ++ cpudata->nominal_freq); + goto skip_test; + } + } else { +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index f71057c2cf90..6a1e02389831 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -22,6 +22,7 @@ + + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + ++#include + #include + #include + #include +@@ -88,6 +89,11 @@ static bool cppc_enabled; + static bool amd_pstate_prefcore = true; + static struct quirk_entry *quirks; + ++#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0) ++#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8) ++#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16) 
++#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24) ++ + /* + * AMD Energy Preference Performance (EPP) + * The EPP is used in the CCLK DPM controller to drive +@@ -180,120 +186,145 @@ static inline int get_mode_idx_from_str(const char *str, size_t size) + static DEFINE_MUTEX(amd_pstate_limits_lock); + static DEFINE_MUTEX(amd_pstate_driver_lock); + +-static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) ++static s16 msr_get_epp(struct amd_cpudata *cpudata) + { +- u64 epp; ++ u64 value; + int ret; + +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- if (!cppc_req_cached) { +- epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, +- &cppc_req_cached); +- if (epp) +- return epp; +- } +- epp = (cppc_req_cached >> 24) & 0xFF; +- } else { +- ret = cppc_get_epp_perf(cpudata->cpu, &epp); +- if (ret < 0) { +- pr_debug("Could not retrieve energy perf value (%d)\n", ret); +- return -EIO; +- } ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); ++ if (ret < 0) { ++ pr_debug("Could not retrieve energy perf value (%d)\n", ret); ++ return ret; + } + +- return (s16)(epp & 0xff); ++ return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, value); + } + +-static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) ++DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp); ++ ++static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata) + { +- s16 epp; +- int index = -EINVAL; ++ return static_call(amd_pstate_get_epp)(cpudata); ++} + +- epp = amd_pstate_get_epp(cpudata, 0); +- if (epp < 0) +- return epp; ++static s16 shmem_get_epp(struct amd_cpudata *cpudata) ++{ ++ u64 epp; ++ int ret; + +- switch (epp) { +- case AMD_CPPC_EPP_PERFORMANCE: +- index = EPP_INDEX_PERFORMANCE; +- break; +- case AMD_CPPC_EPP_BALANCE_PERFORMANCE: +- index = EPP_INDEX_BALANCE_PERFORMANCE; +- break; +- case AMD_CPPC_EPP_BALANCE_POWERSAVE: +- index = EPP_INDEX_BALANCE_POWERSAVE; +- break; +- case AMD_CPPC_EPP_POWERSAVE: +- index = EPP_INDEX_POWERSAVE; +- break; +- default: +- break; ++ ret = cppc_get_epp_perf(cpudata->cpu, &epp); ++ if (ret < 0) { ++ pr_debug("Could not retrieve energy perf value (%d)\n", ret); ++ return ret; + } + +- return index; ++ return (s16)(epp & 0xff); + } + +-static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, +- u32 des_perf, u32 max_perf, bool fast_switch) ++static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++ u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) + { +- if (fast_switch) +- wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached)); +- else +- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, +- READ_ONCE(cpudata->cppc_req_cached)); ++ u64 value, prev; ++ ++ value = prev = READ_ONCE(cpudata->cppc_req_cached); ++ ++ value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | ++ AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK); ++ value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); ++ value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf); ++ value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); ++ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); ++ ++ if (value == prev) ++ return 0; ++ ++ if (fast_switch) { ++ wrmsrl(MSR_AMD_CPPC_REQ, value); ++ return 0; ++ } else { ++ int ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ ++ if (ret) ++ return ret; ++ } ++ ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ WRITE_ONCE(cpudata->epp_cached, epp); ++ ++ return 0; + } + + DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); + +-static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, 
++static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, +- u32 max_perf, bool fast_switch) ++ u32 max_perf, u32 epp, ++ bool fast_switch) + { +- static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, +- max_perf, fast_switch); ++ return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, ++ max_perf, epp, fast_switch); + } + +-static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) ++static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) + { ++ u64 value, prev; + int ret; +- struct cppc_perf_ctrls perf_ctrls; + +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- u64 value = READ_ONCE(cpudata->cppc_req_cached); +- +- value &= ~GENMASK_ULL(31, 24); +- value |= (u64)epp << 24; +- WRITE_ONCE(cpudata->cppc_req_cached, value); ++ value = prev = READ_ONCE(cpudata->cppc_req_cached); ++ value &= ~AMD_CPPC_EPP_PERF_MASK; ++ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + +- ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); +- if (!ret) +- cpudata->epp_cached = epp; +- } else { +- amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, +- cpudata->max_limit_perf, false); ++ if (value == prev) ++ return 0; + +- perf_ctrls.energy_perf = epp; +- ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); +- if (ret) { +- pr_debug("failed to set energy perf value (%d)\n", ret); +- return ret; +- } +- cpudata->epp_cached = epp; ++ ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ if (ret) { ++ pr_err("failed to set energy perf value (%d)\n", ret); ++ return ret; + } + ++ /* update both so that msr_update_perf() can effectively check */ ++ WRITE_ONCE(cpudata->epp_cached, epp); ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ + return ret; + } + +-static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, +- int pref_index) ++DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp); ++ ++static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) ++{ ++ return static_call(amd_pstate_set_epp)(cpudata, epp); ++} ++ ++static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) + { +- int epp = -EINVAL; + int ret; ++ struct cppc_perf_ctrls perf_ctrls; ++ ++ if (epp == cpudata->epp_cached) ++ return 0; ++ ++ perf_ctrls.energy_perf = epp; ++ ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); ++ if (ret) { ++ pr_debug("failed to set energy perf value (%d)\n", ret); ++ return ret; ++ } ++ WRITE_ONCE(cpudata->epp_cached, epp); ++ ++ return ret; ++} ++ ++static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, ++ int pref_index) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ int epp; + + if (!pref_index) + epp = cpudata->epp_default; +- +- if (epp == -EINVAL) ++ else + epp = epp_values[pref_index]; + + if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { +@@ -301,9 +332,15 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, + return -EBUSY; + } + +- ret = amd_pstate_set_epp(cpudata, epp); ++ if (trace_amd_pstate_epp_perf_enabled()) { ++ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, ++ epp, ++ FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), ++ FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), ++ policy->boost_enabled); ++ } + +- return ret; ++ return amd_pstate_set_epp(cpudata, epp); + } + + static inline int msr_cppc_enable(bool enable) +@@ -442,17 +479,23 @@ static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) + return 
static_call(amd_pstate_init_perf)(cpudata); + } + +-static void shmem_update_perf(struct amd_cpudata *cpudata, +- u32 min_perf, u32 des_perf, +- u32 max_perf, bool fast_switch) ++static int shmem_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++ u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) + { + struct cppc_perf_ctrls perf_ctrls; + ++ if (cppc_state == AMD_PSTATE_ACTIVE) { ++ int ret = shmem_set_epp(cpudata, epp); ++ ++ if (ret) ++ return ret; ++ } ++ + perf_ctrls.max_perf = max_perf; + perf_ctrls.min_perf = min_perf; + perf_ctrls.desired_perf = des_perf; + +- cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ return cppc_set_perf(cpudata->cpu, &perf_ctrls); + } + + static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) +@@ -493,14 +536,8 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + { + unsigned long max_freq; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); +- u64 prev = READ_ONCE(cpudata->cppc_req_cached); + u32 nominal_perf = READ_ONCE(cpudata->nominal_perf); +- u64 value = prev; + +- min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf, +- cpudata->max_limit_perf); +- max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, +- cpudata->max_limit_perf); + des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); + + max_freq = READ_ONCE(cpudata->max_limit_freq); +@@ -511,34 +548,18 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + des_perf = 0; + } + +- value &= ~AMD_CPPC_MIN_PERF(~0L); +- value |= AMD_CPPC_MIN_PERF(min_perf); +- +- value &= ~AMD_CPPC_DES_PERF(~0L); +- value |= AMD_CPPC_DES_PERF(des_perf); +- + /* limit the max perf when core performance boost feature is disabled */ + if (!cpudata->boost_supported) + max_perf = min_t(unsigned long, nominal_perf, max_perf); + +- value &= ~AMD_CPPC_MAX_PERF(~0L); +- value |= AMD_CPPC_MAX_PERF(max_perf); +- + if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { + trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, + cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc, +- cpudata->cpu, (value != prev), fast_switch); ++ cpudata->cpu, fast_switch); + } + +- if (value == prev) +- goto cpufreq_policy_put; +- +- WRITE_ONCE(cpudata->cppc_req_cached, value); +- +- amd_pstate_update_perf(cpudata, min_perf, des_perf, +- max_perf, fast_switch); ++ amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch); + +-cpufreq_policy_put: + cpufreq_cpu_put(policy); + } + +@@ -570,7 +591,7 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) + + static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq; ++ u32 max_limit_perf, min_limit_perf, max_perf, max_freq; + struct amd_cpudata *cpudata = policy->driver_data; + + max_perf = READ_ONCE(cpudata->highest_perf); +@@ -578,12 +599,8 @@ static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + max_limit_perf = div_u64(policy->max * max_perf, max_freq); + min_limit_perf = div_u64(policy->min * max_perf, max_freq); + +- lowest_perf = READ_ONCE(cpudata->lowest_perf); +- if (min_limit_perf < lowest_perf) +- min_limit_perf = lowest_perf; +- +- if (max_limit_perf < min_limit_perf) +- max_limit_perf = min_limit_perf; ++ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) ++ min_limit_perf = min(cpudata->nominal_perf, max_limit_perf); + + WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); + 
WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); +@@ -682,7 +699,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + if (min_perf < lowest_nonlinear_perf) + min_perf = lowest_nonlinear_perf; + +- max_perf = cap_perf; ++ max_perf = cpudata->max_limit_perf; + if (max_perf < min_perf) + max_perf = min_perf; + +@@ -704,8 +721,8 @@ static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) + + if (on) + policy->cpuinfo.max_freq = max_freq; +- else if (policy->cpuinfo.max_freq > nominal_freq * 1000) +- policy->cpuinfo.max_freq = nominal_freq * 1000; ++ else if (policy->cpuinfo.max_freq > nominal_freq) ++ policy->cpuinfo.max_freq = nominal_freq; + + policy->max = policy->cpuinfo.max_freq; + +@@ -730,8 +747,6 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) + guard(mutex)(&amd_pstate_driver_lock); + + ret = amd_pstate_cpu_boost_update(policy, state); +- WRITE_ONCE(cpudata->boost_state, !ret ? state : false); +- policy->boost_enabled = !ret ? state : false; + refresh_frequency_limits(policy); + + return ret; +@@ -752,9 +767,6 @@ static int amd_pstate_init_boost_support(struct amd_cpudata *cpudata) + goto exit_err; + } + +- /* at least one CPU supports CPB, even if others fail later on to set up */ +- current_pstate_driver->boost_enabled = true; +- + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_K7_HWCR, &boost_val); + if (ret) { + pr_err_once("failed to read initial CPU boost state!\n"); +@@ -906,29 +918,29 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + return ret; + + if (quirks && quirks->lowest_freq) +- min_freq = quirks->lowest_freq * 1000; ++ min_freq = quirks->lowest_freq; + else +- min_freq = cppc_perf.lowest_freq * 1000; ++ min_freq = cppc_perf.lowest_freq; + + if (quirks && quirks->nominal_freq) +- nominal_freq = quirks->nominal_freq ; ++ nominal_freq = quirks->nominal_freq; + else + nominal_freq = cppc_perf.nominal_freq; + + nominal_perf = READ_ONCE(cpudata->nominal_perf); + + boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); +- max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; ++ max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT); + + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); + lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, + nominal_perf); +- lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT) * 1000; ++ lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT); + +- WRITE_ONCE(cpudata->min_freq, min_freq); +- WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); +- WRITE_ONCE(cpudata->nominal_freq, nominal_freq); +- WRITE_ONCE(cpudata->max_freq, max_freq); ++ WRITE_ONCE(cpudata->min_freq, min_freq * 1000); ++ WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq * 1000); ++ WRITE_ONCE(cpudata->nominal_freq, nominal_freq * 1000); ++ WRITE_ONCE(cpudata->max_freq, max_freq * 1000); + + /** + * Below values need to be initialized correctly, otherwise driver will fail to load +@@ -938,13 +950,13 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + */ + if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) { + pr_err("min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect\n", +- min_freq, max_freq, nominal_freq * 1000); ++ min_freq, max_freq, nominal_freq); + return -EINVAL; + } + +- if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > 
nominal_freq * 1000) { ++ if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > nominal_freq) { + pr_err("lowest_nonlinear_freq(%d) value is out of range [min_freq(%d), nominal_freq(%d)]\n", +- lowest_nonlinear_freq, min_freq, nominal_freq * 1000); ++ lowest_nonlinear_freq, min_freq, nominal_freq); + return -EINVAL; + } + +@@ -1161,7 +1173,6 @@ static ssize_t show_energy_performance_available_preferences( + static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) + { +- struct amd_cpudata *cpudata = policy->driver_data; + char str_preference[21]; + ssize_t ret; + +@@ -1175,7 +1186,7 @@ static ssize_t store_energy_performance_preference( + + guard(mutex)(&amd_pstate_limits_lock); + +- ret = amd_pstate_set_energy_pref_index(cpudata, ret); ++ ret = amd_pstate_set_energy_pref_index(policy, ret); + + return ret ? ret : count; + } +@@ -1186,9 +1197,22 @@ static ssize_t show_energy_performance_preference( + struct amd_cpudata *cpudata = policy->driver_data; + int preference; + +- preference = amd_pstate_get_energy_pref_index(cpudata); +- if (preference < 0) +- return preference; ++ switch (cpudata->epp_cached) { ++ case AMD_CPPC_EPP_PERFORMANCE: ++ preference = EPP_INDEX_PERFORMANCE; ++ break; ++ case AMD_CPPC_EPP_BALANCE_PERFORMANCE: ++ preference = EPP_INDEX_BALANCE_PERFORMANCE; ++ break; ++ case AMD_CPPC_EPP_BALANCE_POWERSAVE: ++ preference = EPP_INDEX_BALANCE_POWERSAVE; ++ break; ++ case AMD_CPPC_EPP_POWERSAVE: ++ preference = EPP_INDEX_POWERSAVE; ++ break; ++ default: ++ return -EINVAL; ++ } + + return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); + } +@@ -1237,6 +1261,9 @@ static int amd_pstate_register_driver(int mode) + return ret; + } + ++ /* at least one CPU supports CPB */ ++ current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB); ++ + ret = cpufreq_register_driver(current_pstate_driver); + if (ret) { + amd_pstate_driver_cleanup(); +@@ -1448,7 +1475,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + return -ENOMEM; + + cpudata->cpu = policy->cpu; +- cpudata->epp_policy = 0; + + ret = amd_pstate_init_perf(cpudata); + if (ret) +@@ -1474,8 +1500,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + + policy->driver_data = cpudata; + +- cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata, 0); +- + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + +@@ -1486,10 +1510,13 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + * the default cpufreq governor is neither powersave nor performance. 
+ */ + if (amd_pstate_acpi_pm_profile_server() || +- amd_pstate_acpi_pm_profile_undefined()) ++ amd_pstate_acpi_pm_profile_undefined()) { + policy->policy = CPUFREQ_POLICY_PERFORMANCE; +- else ++ cpudata->epp_default = amd_pstate_get_epp(cpudata); ++ } else { + policy->policy = CPUFREQ_POLICY_POWERSAVE; ++ cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE; ++ } + + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); +@@ -1502,6 +1529,9 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + return ret; + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } ++ ret = amd_pstate_set_epp(cpudata, cpudata->epp_default); ++ if (ret) ++ return ret; + + current_pstate_driver->adjust_perf = NULL; + +@@ -1527,51 +1557,24 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) + static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u32 max_perf, min_perf; +- u64 value; +- s16 epp; ++ u32 epp; + +- max_perf = READ_ONCE(cpudata->highest_perf); +- min_perf = READ_ONCE(cpudata->lowest_perf); + amd_pstate_update_min_max_limit(policy); + +- max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, +- cpudata->max_limit_perf); +- min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf, +- cpudata->max_limit_perf); +- value = READ_ONCE(cpudata->cppc_req_cached); +- + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +- min_perf = min(cpudata->nominal_perf, max_perf); +- +- /* Initial min/max values for CPPC Performance Controls Register */ +- value &= ~AMD_CPPC_MIN_PERF(~0L); +- value |= AMD_CPPC_MIN_PERF(min_perf); +- +- value &= ~AMD_CPPC_MAX_PERF(~0L); +- value |= AMD_CPPC_MAX_PERF(max_perf); +- +- /* CPPC EPP feature require to set zero to the desire perf bit */ +- value &= ~AMD_CPPC_DES_PERF(~0L); +- value |= AMD_CPPC_DES_PERF(0); +- +- cpudata->epp_policy = cpudata->policy; ++ epp = 0; ++ else ++ epp = READ_ONCE(cpudata->epp_cached); + +- /* Get BIOS pre-defined epp value */ +- epp = amd_pstate_get_epp(cpudata, value); +- if (epp < 0) { +- /** +- * This return value can only be negative for shared_memory +- * systems where EPP register read/write not supported. 
+- */ +- return epp; ++ if (trace_amd_pstate_epp_perf_enabled()) { ++ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp, ++ cpudata->min_limit_perf, ++ cpudata->max_limit_perf, ++ policy->boost_enabled); + } + +- if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +- epp = 0; +- +- WRITE_ONCE(cpudata->cppc_req_cached, value); +- return amd_pstate_set_epp(cpudata, epp); ++ return amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, ++ cpudata->max_limit_perf, epp, false); + } + + static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) +@@ -1600,8 +1603,9 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) + return 0; + } + +-static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) ++static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) + { ++ struct amd_cpudata *cpudata = policy->driver_data; + u64 max_perf; + int ret; + +@@ -1611,17 +1615,26 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + + max_perf = READ_ONCE(cpudata->highest_perf); + +- amd_pstate_update_perf(cpudata, 0, 0, max_perf, false); +- amd_pstate_set_epp(cpudata, cpudata->epp_cached); ++ if (trace_amd_pstate_epp_perf_enabled()) { ++ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, ++ cpudata->epp_cached, ++ FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), ++ max_perf, policy->boost_enabled); ++ } ++ ++ return amd_pstate_update_perf(cpudata, 0, 0, max_perf, cpudata->epp_cached, false); + } + + static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; ++ int ret; + + pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); + +- amd_pstate_epp_reenable(cpudata); ++ ret = amd_pstate_epp_reenable(policy); ++ if (ret) ++ return ret; + cpudata->suspended = false; + + return 0; +@@ -1639,10 +1652,14 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + + guard(mutex)(&amd_pstate_limits_lock); + +- amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false); +- amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); ++ if (trace_amd_pstate_epp_perf_enabled()) { ++ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, ++ AMD_CPPC_EPP_BALANCE_POWERSAVE, ++ min_perf, min_perf, policy->boost_enabled); ++ } + +- return 0; ++ return amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, ++ AMD_CPPC_EPP_BALANCE_POWERSAVE, false); + } + + static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) +@@ -1673,7 +1690,7 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) + guard(mutex)(&amd_pstate_limits_lock); + + /* enable amd pstate from suspend state*/ +- amd_pstate_epp_reenable(cpudata); ++ amd_pstate_epp_reenable(policy); + + cpudata->suspended = false; + } +@@ -1826,6 +1843,8 @@ static int __init amd_pstate_init(void) + static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable); + static_call_update(amd_pstate_init_perf, shmem_init_perf); + static_call_update(amd_pstate_update_perf, shmem_update_perf); ++ static_call_update(amd_pstate_get_epp, shmem_get_epp); ++ static_call_update(amd_pstate_set_epp, shmem_set_epp); + } + + if (amd_pstate_prefcore) { +diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h +index cd573bc6b6db..9747e3be6cee 100644 +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -57,7 +57,6 @@ struct amd_aperf_mperf { + * @hw_prefcore: check whether HW supports preferred core featue. 
+ * Only when hw_prefcore and early prefcore param are true, + * AMD P-State driver supports preferred core featue. +- * @epp_policy: Last saved policy used to set energy-performance preference + * @epp_cached: Cached CPPC energy-performance preference value + * @policy: Cpufreq policy value + * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value +@@ -94,13 +93,11 @@ struct amd_cpudata { + bool hw_prefcore; + + /* EPP feature related attributes*/ +- s16 epp_policy; + s16 epp_cached; + u32 policy; + u64 cppc_cap1_cached; + bool suspended; + s16 epp_default; +- bool boost_state; + }; + + /* +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0002-amd-tlb-broadcast.patch b/sys-kernel/gentoo-sources-6.13/0002-amd-tlb-broadcast.patch new file mode 100644 index 0000000..b4fc866 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0002-amd-tlb-broadcast.patch @@ -0,0 +1,1350 @@ +From 379e6b90eecaf17f29691bcfcdd588d03a934b0d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:27:28 +0100 +Subject: [PATCH 02/12] amd-tlb-broadcast + +Signed-off-by: Peter Jung +--- + arch/x86/Kconfig | 2 +- + arch/x86/Kconfig.cpu | 4 + + arch/x86/hyperv/mmu.c | 1 - + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/disabled-features.h | 8 +- + arch/x86/include/asm/mmu.h | 12 + + arch/x86/include/asm/mmu_context.h | 10 +- + arch/x86/include/asm/msr-index.h | 2 + + arch/x86/include/asm/paravirt.h | 5 - + arch/x86/include/asm/paravirt_types.h | 2 - + arch/x86/include/asm/tlb.h | 138 +++++++ + arch/x86/include/asm/tlbflush.h | 69 ++++ + arch/x86/kernel/alternative.c | 10 +- + arch/x86/kernel/cpu/amd.c | 10 + + arch/x86/kernel/kvm.c | 1 - + arch/x86/kernel/paravirt.c | 6 - + arch/x86/mm/pgtable.c | 16 +- + arch/x86/mm/tlb.c | 450 ++++++++++++++++++++--- + arch/x86/xen/mmu_pv.c | 1 - + include/linux/mm_types.h | 1 + + mm/memory.c | 1 - + mm/mmap.c | 2 - + mm/swap_state.c | 1 - + mm/vma.c | 2 - + tools/arch/x86/include/asm/msr-index.h | 2 + + 25 files changed, 668 insertions(+), 89 deletions(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 757333fe82c7..3d143bd2c054 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -273,7 +273,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP +- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT ++ select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index 2a7279d80460..25c55cc17c5e 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -401,6 +401,10 @@ menuconfig PROCESSOR_SELECT + This lets you choose what x86 vendor support code your kernel + will include. 
+ ++config BROADCAST_TLB_FLUSH ++ def_bool y ++ depends on CPU_SUP_AMD && 64BIT ++ + config CPU_SUP_INTEL + default y + bool "Support Intel processors" if PROCESSOR_SELECT +diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c +index 1cc113200ff5..cbe6c71e17c1 100644 +--- a/arch/x86/hyperv/mmu.c ++++ b/arch/x86/hyperv/mmu.c +@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void) + + pr_info("Using hypercall for remote TLB flush\n"); + pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + } +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 645aa360628d..bf727839326f 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -338,6 +338,7 @@ + #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ ++#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */ + #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ + #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ + #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h +index c492bdc97b05..be8c38855068 100644 +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -129,6 +129,12 @@ + #define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) + #endif + ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++#define DISABLE_INVLPGB 0 ++#else ++#define DISABLE_INVLPGB (1 << (X86_FEATURE_INVLPGB & 31)) ++#endif ++ + /* + * Make sure to add features to the correct mask + */ +@@ -146,7 +152,7 @@ + #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ + DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) + #define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) +-#define DISABLED_MASK13 0 ++#define DISABLED_MASK13 (DISABLE_INVLPGB) + #define DISABLED_MASK14 0 + #define DISABLED_MASK15 0 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 3b496cdcb74b..8b8055a8eb9e 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -69,6 +69,18 @@ typedef struct { + u16 pkey_allocation_map; + s16 execute_only_pkey; + #endif ++ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++ /* ++ * The global ASID will be a non-zero value when the process has ++ * the same ASID across all CPUs, allowing it to make use of ++ * hardware-assisted remote TLB invalidation like AMD INVLPGB. ++ */ ++ u16 global_asid; ++ ++ /* The process is transitioning to a new global ASID number. 
*/ ++ bool asid_transition; ++#endif + } mm_context_t; + + #define INIT_MM_CONTEXT(mm) \ +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 795fdd53bd0a..2398058b6e83 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -2,7 +2,6 @@ + #ifndef _ASM_X86_MMU_CONTEXT_H + #define _ASM_X86_MMU_CONTEXT_H + +-#include + #include + #include + #include +@@ -13,6 +12,7 @@ + #include + #include + #include ++#include + + extern atomic64_t last_mm_ctx_id; + +@@ -139,6 +139,11 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm) + #define enter_lazy_tlb enter_lazy_tlb + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + ++#define mm_init_global_asid mm_init_global_asid ++extern void mm_init_global_asid(struct mm_struct *mm); ++ ++extern void mm_free_global_asid(struct mm_struct *mm); ++ + /* + * Init a new mm. Used on mm copies, like at fork() + * and on mm's that are brand-new, like at execve(). +@@ -161,6 +166,8 @@ static inline int init_new_context(struct task_struct *tsk, + mm->context.execute_only_pkey = -1; + } + #endif ++ ++ mm_init_global_asid(mm); + mm_reset_untag_mask(mm); + init_new_context_ldt(mm); + return 0; +@@ -170,6 +177,7 @@ static inline int init_new_context(struct task_struct *tsk, + static inline void destroy_context(struct mm_struct *mm) + { + destroy_context_ldt(mm); ++ mm_free_global_asid(mm); + } + + extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 61e991507353..6844ebeed377 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -25,6 +25,7 @@ + #define _EFER_SVME 12 /* Enable virtualization */ + #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ + #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ ++#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ + #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ + + #define EFER_SCE (1<<_EFER_SCE) +@@ -34,6 +35,7 @@ + #define EFER_SVME (1<<_EFER_SVME) + #define EFER_LMSLE (1<<_EFER_LMSLE) + #define EFER_FFXSR (1<<_EFER_FFXSR) ++#define EFER_TCE (1<<_EFER_TCE) + #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) + + /* +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index d4eb9e1d61b8..794ba3647c6c 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(const struct cpumask *cpumask, + PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); + } + +-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); +-} +- + static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) + { + PVOP_VCALL1(mmu.exit_mmap, mm); +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 8d4fbe1be489..13405959e4db 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -136,8 +136,6 @@ struct pv_mmu_ops { + void (*flush_tlb_multi)(const struct cpumask *cpus, + const struct flush_tlb_info *info); + +- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); +- + /* Hook for intercepting the destruction of an mm_struct. 
*/ + void (*exit_mmap)(struct mm_struct *mm); + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); +diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h +index 4d3c9d00d6b6..a74b57512761 100644 +--- a/arch/x86/include/asm/tlb.h ++++ b/arch/x86/include/asm/tlb.h +@@ -6,6 +6,9 @@ + static inline void tlb_flush(struct mmu_gather *tlb); + + #include ++#include ++#include ++#include + + static inline void tlb_flush(struct mmu_gather *tlb) + { +@@ -38,4 +41,139 @@ static inline void invlpg(unsigned long addr) + { + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + } ++enum addr_stride { ++ PTE_STRIDE = 0, ++ PMD_STRIDE = 1 ++}; ++ ++/* ++ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination ++ * of the three. For example: ++ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address ++ * - FLAG_PCID: invalidate all TLB entries matching the PCID ++ * ++ * The first is used to invalidate (kernel) mappings at a particular ++ * address across all processes. ++ * ++ * The latter invalidates all TLB entries matching a PCID. ++ */ ++#define INVLPGB_FLAG_VA BIT(0) ++#define INVLPGB_FLAG_PCID BIT(1) ++#define INVLPGB_FLAG_ASID BIT(2) ++#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) ++#define INVLPGB_FLAG_FINAL_ONLY BIT(4) ++#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) ++ ++/* The implied mode when all bits are clear: */ ++#define INVLPGB_MODE_ALL_NONGLOBALS 0UL ++ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++/* ++ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. ++ * ++ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can ++ * be done in a parallel fashion. ++ * ++ * The instruction takes the number of extra pages to invalidate, beyond the ++ * first page, while __invlpgb gets the more human readable number of pages to ++ * invalidate. ++ * ++ * The bits in rax[0:2] determine respectively which components of the address ++ * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* ++ * address in the specified range matches. ++ * ++ * Since it is desired to only flush TLB entries for the ASID that is executing ++ * the instruction (a host/hypervisor or a guest), the ASID valid bit should ++ * always be set. On a host/hypervisor, the hardware will use the ASID value ++ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will ++ * use the actual ASID value of the guest. ++ * ++ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from ++ * this CPU have completed. ++ */ ++static inline void __invlpgb(unsigned long asid, unsigned long pcid, ++ unsigned long addr, u16 nr_pages, ++ enum addr_stride stride, u8 flags) ++{ ++ u64 rax = addr | flags | INVLPGB_FLAG_ASID; ++ u32 ecx = (stride << 31) | (nr_pages - 1); ++ u32 edx = (pcid << 16) | asid; ++ ++ /* The low bits in rax are for flags. Verify addr is clean. */ ++ VM_WARN_ON_ONCE(addr & ~PAGE_MASK); ++ ++ /* INVLPGB; supported in binutils >= 2.36. */ ++ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); ++} ++ ++static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) ++{ ++ __invlpgb(asid, pcid, 0, 1, 0, flags); ++} ++ ++static inline void __tlbsync(void) ++{ ++ /* ++ * TLBSYNC waits for INVLPGB instructions originating on the same CPU ++ * to have completed. Print a warning if the task has been migrated, ++ * and might not be waiting on all the INVLPGBs issued during this TLB ++ * invalidation sequence. 
++ */ ++ cant_migrate(); ++ ++ /* TLBSYNC: supported in binutils >= 0.36. */ ++ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); ++} ++#else ++/* Some compilers (I'm looking at you clang!) simply can't do DCE */ ++static inline void __invlpgb(unsigned long asid, unsigned long pcid, ++ unsigned long addr, u16 nr_pages, ++ enum addr_stride s, u8 flags) { } ++static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } ++static inline void __tlbsync(void) { } ++#endif ++ ++static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, ++ unsigned long addr, ++ u16 nr, bool stride) ++{ ++ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; ++ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; ++ ++ __invlpgb(0, pcid, addr, nr, str, flags); ++} ++ ++/* Flush all mappings for a given PCID, not including globals. */ ++static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) ++{ ++ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_all(void) ++{ ++ /* ++ * TLBSYNC at the end needs to make sure all flushes done on the ++ * current CPU have been executed system-wide. Therefore, make ++ * sure nothing gets migrated in-between but disable preemption ++ * as it is cheaper. ++ */ ++ guard(preempt)(); ++ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); ++ __tlbsync(); ++} ++ ++/* Flush addr, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) ++{ ++ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs except globals. */ ++static inline void invlpgb_flush_all_nonglobals(void) ++{ ++ guard(preempt)(); ++ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); ++ __tlbsync(); ++} + #endif /* _ASM_X86_TLB_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 02fc2aa06e9e..0bc91488c9c2 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -6,6 +6,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -183,6 +184,9 @@ static inline void cr4_init_shadow(void) + extern unsigned long mmu_cr4_features; + extern u32 *trampoline_cr4_features; + ++/* How many pages can be invalidated with one INVLPGB. 
*/ ++extern u16 invlpgb_count_max; ++ + extern void initialize_tlbstate_and_flush(void); + + /* +@@ -231,6 +235,71 @@ void flush_tlb_one_kernel(unsigned long addr); + void flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + ++static inline bool is_dyn_asid(u16 asid) ++{ ++ return asid < TLB_NR_DYN_ASIDS; ++} ++ ++static inline bool is_global_asid(u16 asid) ++{ ++ return !is_dyn_asid(asid); ++} ++ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++static inline u16 mm_global_asid(struct mm_struct *mm) ++{ ++ u16 asid; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return 0; ++ ++ asid = smp_load_acquire(&mm->context.global_asid); ++ ++ /* mm->context.global_asid is either 0, or a global ASID */ ++ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid)); ++ ++ return asid; ++} ++ ++static inline void mm_init_global_asid(struct mm_struct *mm) ++{ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ mm->context.global_asid = 0; ++ mm->context.asid_transition = false; ++ } ++} ++ ++static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) ++{ ++ /* ++ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> ++ * finish_asid_transition() needs to observe asid_transition = true ++ * once it observes global_asid. ++ */ ++ mm->context.asid_transition = true; ++ smp_store_release(&mm->context.global_asid, asid); ++} ++ ++static inline void mm_clear_asid_transition(struct mm_struct *mm) ++{ ++ WRITE_ONCE(mm->context.asid_transition, false); ++} ++ ++static inline bool mm_in_asid_transition(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ return mm && READ_ONCE(mm->context.asid_transition); ++} ++#else ++static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; } ++static inline void mm_init_global_asid(struct mm_struct *mm) { } ++static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { } ++static inline void mm_clear_asid_transition(struct mm_struct *mm) { } ++static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; } ++#endif /* CONFIG_BROADCAST_TLB_FLUSH */ ++ + #ifdef CONFIG_PARAVIRT + #include + #endif +diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c +index 243843e44e89..c71b575bf229 100644 +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -1854,11 +1854,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) + return temp_state; + } + ++__ro_after_init struct mm_struct *poking_mm; ++__ro_after_init unsigned long poking_addr; ++ + static inline void unuse_temporary_mm(temp_mm_state_t prev_state) + { + lockdep_assert_irqs_disabled(); ++ + switch_mm_irqs_off(NULL, prev_state.mm, current); + ++ /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ ++ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); ++ + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. 
+@@ -1867,9 +1874,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) + hw_breakpoint_restore(); + } + +-__ro_after_init struct mm_struct *poking_mm; +-__ro_after_init unsigned long poking_addr; +- + static void text_poke_memcpy(void *dst, const void *src, size_t len) + { + memcpy(dst, src, len); +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 79d2e17f6582..05ca61b66461 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -29,6 +29,8 @@ + + #include "cpu.h" + ++u16 invlpgb_count_max __ro_after_init; ++ + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) + { + u32 gprs[8] = { 0 }; +@@ -1069,6 +1071,10 @@ static void init_amd(struct cpuinfo_x86 *c) + + /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); ++ ++ /* Enable Translation Cache Extension */ ++ if (cpu_has(c, X86_FEATURE_TCE)) ++ msr_set_bit(MSR_EFER, _EFER_TCE); + } + + #ifdef CONFIG_X86_32 +@@ -1135,6 +1141,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; ++ ++ /* Max number of pages INVLPGB can invalidate in one shot */ ++ if (cpu_has(c, X86_FEATURE_INVLPGB)) ++ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1; + } + + static const struct cpu_dev amd_cpu_dev = { +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index 21e9e4845354..83b7679658b1 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) + #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index fec381533555..c019771e0123 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void) + static_branch_enable(&virt_spin_lock_key); + } + +-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +- + struct static_key paravirt_steal_enabled; + struct static_key paravirt_steal_rq_enabled; + +@@ -191,7 +186,6 @@ struct paravirt_patch_template pv_ops = { + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, +- .mmu.tlb_remove_table = native_tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 5745a354a241..3dc4af1f7868 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask); + #define PGTABLE_HIGHMEM 0 + #endif + +-#ifndef CONFIG_PARAVIRT +-static inline +-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +-#endif +- + gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; + + pgtable_t pte_alloc_one(struct mm_struct *mm) +@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) + { + pagetable_pte_dtor(page_ptdesc(pte)); + paravirt_release_pte(page_to_pfn(pte)); +- paravirt_tlb_remove_table(tlb, pte); ++ tlb_remove_table(tlb, pte); + } + + #if CONFIG_PGTABLE_LEVELS > 2 +@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct 
mmu_gather *tlb, pmd_t *pmd) + tlb->need_flush_all = 1; + #endif + pagetable_pmd_dtor(ptdesc); +- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); ++ tlb_remove_table(tlb, ptdesc_page(ptdesc)); + } + + #if CONFIG_PGTABLE_LEVELS > 3 +@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) + + pagetable_pud_dtor(ptdesc); + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); +- paravirt_tlb_remove_table(tlb, virt_to_page(pud)); ++ tlb_remove_table(tlb, virt_to_page(pud)); + } + + #if CONFIG_PGTABLE_LEVELS > 4 + void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) + { + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); +- paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); ++ tlb_remove_table(tlb, virt_to_page(p4d)); + } + #endif /* CONFIG_PGTABLE_LEVELS > 4 */ + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 90a9e4740913..7505c2d94bc0 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -74,13 +74,15 @@ + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] +- * the canonical identifier for an mm ++ * the canonical identifier for an mm, dynamically allocated on each CPU ++ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] ++ * the canonical, global identifier for an mm, identical across all CPUs + * +- * kPCID - [1, TLB_NR_DYN_ASIDS] ++ * kPCID - [1, MAX_ASID_AVAILABLE] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * +- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] ++ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. +@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + return; + } + ++ /* ++ * TLB consistency for global ASIDs is maintained with hardware assisted ++ * remote TLB flushing. Global ASIDs are always up to date. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ u16 global_asid = mm_global_asid(next); ++ ++ if (global_asid) { ++ *new_asid = global_asid; ++ *need_flush = false; ++ return; ++ } ++ } ++ + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + +@@ -251,6 +267,268 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + *need_flush = true; + } + ++/* ++ * Global ASIDs are allocated for multi-threaded processes that are ++ * active on multiple CPUs simultaneously, giving each of those ++ * processes the same PCID on every CPU, for use with hardware-assisted ++ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR. ++ * ++ * These global ASIDs are held for the lifetime of the process. ++ */ ++static DEFINE_RAW_SPINLOCK(global_asid_lock); ++static u16 last_global_asid = MAX_ASID_AVAILABLE; ++static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE); ++static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE); ++static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; ++ ++/* ++ * When the search for a free ASID in the global ASID space reaches ++ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously ++ * freed global ASIDs are safe to re-use. ++ * ++ * This way the global flush only needs to happen at ASID rollover ++ * time, and not at ASID allocation time. 
++ */ ++static void reset_global_asid_space(void) ++{ ++ lockdep_assert_held(&global_asid_lock); ++ ++ invlpgb_flush_all_nonglobals(); ++ ++ /* ++ * The TLB flush above makes it safe to re-use the previously ++ * freed global ASIDs. ++ */ ++ bitmap_andnot(global_asid_used, global_asid_used, ++ global_asid_freed, MAX_ASID_AVAILABLE); ++ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE); ++ ++ /* Restart the search from the start of global ASID space. */ ++ last_global_asid = TLB_NR_DYN_ASIDS; ++} ++ ++static u16 allocate_global_asid(void) ++{ ++ u16 asid; ++ ++ lockdep_assert_held(&global_asid_lock); ++ ++ /* The previous allocation hit the edge of available address space */ ++ if (last_global_asid >= MAX_ASID_AVAILABLE - 1) ++ reset_global_asid_space(); ++ ++ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid); ++ ++ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) { ++ /* This should never happen. */ ++ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", ++ global_asid_available); ++ return 0; ++ } ++ ++ /* Claim this global ASID. */ ++ __set_bit(asid, global_asid_used); ++ last_global_asid = asid; ++ global_asid_available--; ++ return asid; ++} ++ ++/* ++ * Check whether a process is currently active on more than @threshold CPUs. ++ * This is a cheap estimation on whether or not it may make sense to assign ++ * a global ASID to this process, and use broadcast TLB invalidation. ++ */ ++static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) ++{ ++ int count = 0; ++ int cpu; ++ ++ /* This quick check should eliminate most single threaded programs. */ ++ if (cpumask_weight(mm_cpumask(mm)) <= threshold) ++ return false; ++ ++ /* Slower check to make sure. */ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* Skip the CPUs that aren't really running this process. */ ++ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) ++ continue; ++ ++ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) ++ continue; ++ ++ if (++count > threshold) ++ return true; ++ } ++ return false; ++} ++ ++/* ++ * Assign a global ASID to the current process, protecting against ++ * races between multiple threads in the process. ++ */ ++static void use_global_asid(struct mm_struct *mm) ++{ ++ u16 asid; ++ ++ guard(raw_spinlock_irqsave)(&global_asid_lock); ++ ++ /* This process is already using broadcast TLB invalidation. */ ++ if (mm_global_asid(mm)) ++ return; ++ ++ /* ++ * The last global ASID was consumed while waiting for the lock. ++ * ++ * If this fires, a more aggressive ASID reuse scheme might be ++ * needed. ++ */ ++ if (!global_asid_available) { ++ VM_WARN_ONCE(1, "Ran out of global ASIDs\n"); ++ return; ++ } ++ ++ asid = allocate_global_asid(); ++ if (!asid) ++ return; ++ ++ mm_assign_global_asid(mm, asid); ++} ++ ++void mm_free_global_asid(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return; ++ ++ if (!mm_global_asid(mm)) ++ return; ++ ++ guard(raw_spinlock_irqsave)(&global_asid_lock); ++ ++ /* The global ASID can be re-used only after flush at wrap-around. */ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++ __set_bit(mm->context.global_asid, global_asid_freed); ++ ++ mm->context.global_asid = 0; ++ global_asid_available++; ++#endif ++} ++ ++/* ++ * Is the mm transitioning from a CPU-local ASID to a global ASID? 
++ */ ++static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid) ++{ ++ u16 global_asid = mm_global_asid(mm); ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ /* Process is transitioning to a global ASID */ ++ if (global_asid && asid != global_asid) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86 ++ * systems have over 8k CPUs. Because of this potential ASID shortage, ++ * global ASIDs are handed out to processes that have frequent TLB ++ * flushes and are active on 4 or more CPUs simultaneously. ++ */ ++static void consider_global_asid(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return; ++ ++ /* Check every once in a while. */ ++ if ((current->pid & 0x1f) != (jiffies & 0x1f)) ++ return; ++ ++ /* ++ * Assign a global ASID if the process is active on ++ * 4 or more CPUs simultaneously. ++ */ ++ if (mm_active_cpus_exceeds(mm, 3)) ++ use_global_asid(mm); ++} ++ ++static void finish_asid_transition(struct flush_tlb_info *info) ++{ ++ struct mm_struct *mm = info->mm; ++ int bc_asid = mm_global_asid(mm); ++ int cpu; ++ ++ if (!mm_in_asid_transition(mm)) ++ return; ++ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* ++ * The remote CPU is context switching. Wait for that to ++ * finish, to catch the unlikely case of it switching to ++ * the target mm with an out of date ASID. ++ */ ++ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) ++ cpu_relax(); ++ ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) ++ continue; ++ ++ /* ++ * If at least one CPU is not using the global ASID yet, ++ * send a TLB flush IPI. The IPI should cause stragglers ++ * to transition soon. ++ * ++ * This can race with the CPU switching to another task; ++ * that results in a (harmless) extra IPI. ++ */ ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { ++ flush_tlb_multi(mm_cpumask(info->mm), info); ++ return; ++ } ++ } ++ ++ /* All the CPUs running this process are using the global ASID. */ ++ mm_clear_asid_transition(mm); ++} ++ ++static void broadcast_tlb_flush(struct flush_tlb_info *info) ++{ ++ bool pmd = info->stride_shift == PMD_SHIFT; ++ unsigned long asid = mm_global_asid(info->mm); ++ unsigned long addr = info->start; ++ ++ /* ++ * TLB flushes with INVLPGB are kicked off asynchronously. ++ * The inc_mm_tlb_gen() guarantees page table updates are done ++ * before these TLB flushes happen. ++ */ ++ if (info->end == TLB_FLUSH_ALL) { ++ invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (cpu_feature_enabled(X86_FEATURE_PTI)) ++ invlpgb_flush_single_pcid_nosync(user_pcid(asid)); ++ } else do { ++ unsigned long nr = 1; ++ ++ if (info->stride_shift <= PMD_SHIFT) { ++ nr = (info->end - addr) >> info->stride_shift; ++ nr = clamp_val(nr, 1, invlpgb_count_max); ++ } ++ ++ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); ++ if (cpu_feature_enabled(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); ++ ++ addr += nr << info->stride_shift; ++ } while (addr < info->end); ++ ++ finish_asid_transition(info); ++ ++ /* Wait for the INVLPGBs kicked off above to finish. */ ++ __tlbsync(); ++} ++ + /* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. 
+@@ -556,7 +834,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, + */ + if (prev == next) { + /* Not actually switching mm's */ +- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != ++ VM_WARN_ON(is_dyn_asid(prev_asid) && ++ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + /* +@@ -573,6 +852,20 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + ++ /* Check if the current mm is transitioning to a global ASID */ ++ if (mm_needs_global_asid(next, prev_asid)) { ++ next_tlb_gen = atomic64_read(&next->context.tlb_gen); ++ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); ++ goto reload_tlb; ++ } ++ ++ /* ++ * Broadcast TLB invalidation keeps this ASID up to date ++ * all the time. ++ */ ++ if (is_global_asid(prev_asid)) ++ return; ++ + /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same +@@ -607,30 +900,32 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, + cond_mitigation(tsk); + + /* +- * Stop remote flushes for the previous mm. +- * Skip kernel threads; we never send init_mm TLB flushing IPIs, +- * but the bitmap manipulation can cause cache line contention. ++ * Let nmi_uaccess_okay() and finish_asid_transition() ++ * know that CR3 is changing. + */ +- if (prev != &init_mm) { +- VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, +- mm_cpumask(prev))); +- cpumask_clear_cpu(cpu, mm_cpumask(prev)); +- } ++ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); ++ barrier(); ++ ++ /* ++ * Leave this CPU in prev's mm_cpumask. Atomic writes to ++ * mm_cpumask can be expensive under contention. The CPU ++ * will be removed lazily at TLB flush time. ++ */ ++ VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, ++ mm_cpumask(prev))); + + /* Start receiving IPIs and then read tlb_gen (and LAM below) */ +- if (next != &init_mm) ++ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); +- +- /* Let nmi_uaccess_okay() know that we're changing CR3. */ +- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); +- barrier(); + } + ++reload_tlb: + new_lam = mm_lam_cr3_mask(next); + if (need_flush) { ++ VM_WARN_ON_ONCE(is_global_asid(new_asid)); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, new_lam, true); +@@ -749,7 +1044,7 @@ static void flush_tlb_func(void *info) + const struct flush_tlb_info *f = info; + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); +- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ u64 local_tlb_gen; + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; +@@ -760,15 +1055,28 @@ static void flush_tlb_func(void *info) + if (!local) { + inc_irq_stat(irq_tlb_count); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); ++ } + +- /* Can only happen on remote CPUs */ +- if (f->mm && f->mm != loaded_mm) +- return; ++ /* The CPU was left in the mm_cpumask of the target mm. Clear it. 
*/ ++ if (f->mm && f->mm != loaded_mm) { ++ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); ++ trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); ++ return; + } + + if (unlikely(loaded_mm == &init_mm)) + return; + ++ /* Reload the ASID if transitioning into or out of a global ASID */ ++ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) { ++ switch_mm_irqs_off(NULL, loaded_mm, NULL); ++ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ } ++ ++ /* Broadcast ASIDs are always kept up to date with INVLPGB. */ ++ if (is_global_asid(loaded_mm_asid)) ++ return; ++ + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + +@@ -786,6 +1094,8 @@ static void flush_tlb_func(void *info) + return; + } + ++ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* +@@ -953,7 +1263,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ +- if (info->freed_tables) ++ if (info->freed_tables || mm_in_asid_transition(info->mm)) + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); + else + on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, +@@ -1000,6 +1310,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); + #endif + ++ /* ++ * If the number of flushes is so large that a full flush ++ * would be faster, do a full flush. ++ */ ++ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { ++ start = 0; ++ end = TLB_FLUSH_ALL; ++ } ++ + info->start = start; + info->end = end; + info->mm = mm; +@@ -1026,17 +1345,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + bool freed_tables) + { + struct flush_tlb_info *info; ++ int cpu = get_cpu(); + u64 new_tlb_gen; +- int cpu; +- +- cpu = get_cpu(); +- +- /* Should we flush just the requested range? */ +- if ((end == TLB_FLUSH_ALL) || +- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { +- start = 0; +- end = TLB_FLUSH_ALL; +- } + + /* This is also a barrier that synchronizes with switch_mm(). */ + new_tlb_gen = inc_mm_tlb_gen(mm); +@@ -1049,9 +1359,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. + */ +- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { ++ if (mm_global_asid(mm)) { ++ broadcast_tlb_flush(info); ++ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + info->trim_cpumask = should_trim_cpumask(mm); + flush_tlb_multi(mm_cpumask(mm), info); ++ consider_global_asid(mm); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + lockdep_assert_irqs_enabled(); + local_irq_disable(); +@@ -1064,7 +1377,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); + } + +- + static void do_flush_tlb_all(void *info) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); +@@ -1074,7 +1386,32 @@ static void do_flush_tlb_all(void *info) + void flush_tlb_all(void) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); +- on_each_cpu(do_flush_tlb_all, NULL, 1); ++ ++ /* First try (faster) hardware-assisted TLB invalidation. 
*/ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_flush_all(); ++ else ++ /* Fall back to the IPI-based invalidation. */ ++ on_each_cpu(do_flush_tlb_all, NULL, 1); ++} ++ ++/* Flush an arbitrarily large range of memory with INVLPGB. */ ++static void invlpgb_kernel_range_flush(struct flush_tlb_info *info) ++{ ++ unsigned long addr, nr; ++ ++ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { ++ nr = (info->end - addr) >> PAGE_SHIFT; ++ ++ /* ++ * INVLPGB has a limit on the size of ranges it can ++ * flush. Break up large flushes. ++ */ ++ nr = clamp_val(nr, 1, invlpgb_count_max); ++ ++ invlpgb_flush_addr_nosync(addr, nr); ++ } ++ __tlbsync(); + } + + static void do_kernel_range_flush(void *info) +@@ -1087,24 +1424,37 @@ static void do_kernel_range_flush(void *info) + flush_tlb_one_kernel(addr); + } + +-void flush_tlb_kernel_range(unsigned long start, unsigned long end) ++static void kernel_tlb_flush_all(struct flush_tlb_info *info) + { +- /* Balance as user space task's flush, a bit conservative */ +- if (end == TLB_FLUSH_ALL || +- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_flush_all(); ++ else + on_each_cpu(do_flush_tlb_all, NULL, 1); +- } else { +- struct flush_tlb_info *info; +- +- preempt_disable(); +- info = get_flush_tlb_info(NULL, start, end, 0, false, +- TLB_GENERATION_INVALID); ++} + ++static void kernel_tlb_flush_range(struct flush_tlb_info *info) ++{ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_kernel_range_flush(info); ++ else + on_each_cpu(do_kernel_range_flush, info, 1); ++} + +- put_flush_tlb_info(); +- preempt_enable(); +- } ++void flush_tlb_kernel_range(unsigned long start, unsigned long end) ++{ ++ struct flush_tlb_info *info; ++ ++ guard(preempt)(); ++ ++ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, ++ TLB_GENERATION_INVALID); ++ ++ if (info->end == TLB_FLUSH_ALL) ++ kernel_tlb_flush_all(info); ++ else ++ kernel_tlb_flush_range(info); ++ ++ put_flush_tlb_info(); + } + + /* +@@ -1283,7 +1633,9 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
+ */ +- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ invlpgb_flush_all_nonglobals(); ++ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + flush_tlb_multi(&batch->cpumask, info); + } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { + lockdep_assert_irqs_enabled(); +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index d078de2c952b..38971c6dcd4b 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, +- .tlb_remove_table = tlb_remove_table, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 14fc1b39c0cf..a199e299b0d4 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1402,6 +1402,7 @@ enum tlb_flush_reason { + TLB_LOCAL_SHOOTDOWN, + TLB_LOCAL_MM_SHOOTDOWN, + TLB_REMOTE_SEND_IPI, ++ TLB_REMOTE_WRONG_CPU, + NR_TLB_FLUSH_REASONS, + }; + +diff --git a/mm/memory.c b/mm/memory.c +index b6015e230822..eb5fdd558442 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1935,7 +1935,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + struct mmu_notifier_range range; + struct mmu_gather tlb; + +- lru_add_drain(); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, + address, end); + hugetlb_zap_begin(vma, &range.start, &range.end); +diff --git a/mm/mmap.c b/mm/mmap.c +index aec208f90337..d628b7900d2d 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1664,7 +1664,6 @@ void exit_mmap(struct mm_struct *mm) + goto destroy; + } + +- lru_add_drain(); + flush_cache_mm(mm); + tlb_gather_mmu_fullmm(&tlb, mm); + /* update_hiwater_rss(mm) here? but nobody should be looking */ +@@ -2107,7 +2106,6 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) + vma, new_start, length, false, true)) + return -ENOMEM; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { +diff --git a/mm/swap_state.c b/mm/swap_state.c +index e0c0321b8ff7..ca42b2be64d9 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + +- lru_add_drain(); + folio_batch_init(&folios); + for (int i = 0; i < nr; i++) { + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); +diff --git a/mm/vma.c b/mm/vma.c +index b126683397fc..bf2e91454019 100644 +--- a/mm/vma.c ++++ b/mm/vma.c +@@ -398,7 +398,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + update_hiwater_rss(mm); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, +@@ -1130,7 +1129,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, + * were isolated before we downgraded mmap_lock. 
+ */ + mas_set(mas_detach, 1); +- lru_add_drain(); + tlb_gather_mmu(&tlb, vms->vma->vm_mm); + update_hiwater_rss(vms->vma->vm_mm); + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, +diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h +index 3ae84c3b8e6d..dc1c1057f26e 100644 +--- a/tools/arch/x86/include/asm/msr-index.h ++++ b/tools/arch/x86/include/asm/msr-index.h +@@ -25,6 +25,7 @@ + #define _EFER_SVME 12 /* Enable virtualization */ + #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ + #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ ++#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ + #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ + + #define EFER_SCE (1<<_EFER_SCE) +@@ -34,6 +35,7 @@ + #define EFER_SVME (1<<_EFER_SVME) + #define EFER_LMSLE (1<<_EFER_LMSLE) + #define EFER_FFXSR (1<<_EFER_FFXSR) ++#define EFER_TCE (1<<_EFER_TCE) + #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) + + /* +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0003-bbr3.patch b/sys-kernel/gentoo-sources-6.13/0003-bbr3.patch new file mode 100644 index 0000000..889f841 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0003-bbr3.patch @@ -0,0 +1,3386 @@ +From 8e25c43b4f65d6249ffcdd8631af68e32aabe985 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:27:38 +0100 +Subject: [PATCH 03/12] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 9 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2230 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 16 files changed, 1940 insertions(+), 553 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index f88daaa76d83..b0f79a5888a2 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -368,7 +368,9 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? 
*/ ++ unused:2; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c7f42844c79a..170250145598 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index bc04599547c3..1ac0efa5a854 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -376,6 +376,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -793,6 +795,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -898,6 +909,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -987,9 +1003,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1102,6 +1123,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1124,7 +1146,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1144,10 +1170,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1158,7 +1187,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1182,8 +1213,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1249,6 +1283,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1268,6 +1310,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1280,6 +1323,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2431,7 +2489,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index db7254d52d93..38de18d921ea 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -507,12 +507,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dbf896f3146c..4702cd2f1ffc 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 6d2c97f8e9ef..ddc116ef22cb 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 554804774628..2279e6e7bc9c 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,11 +280,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } + ++static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++} ++ + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) + { +@@ -315,7 +319,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, ++ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index d74281eca14f..61aa756120ad 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3379,6 +3379,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4105,6 +4106,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..a180fa648d5e 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. 
++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. 
++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... 
*/ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... */ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. 
*/ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). ++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. 
This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
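++ * (Rounds are packet-timed: a new round starts once the ACKed data was
++ * sent after the previous round began, i.e. once rs->prior_delivered has
++ * reached bbr->next_rtt_delivered.)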
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
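++ * (Two alternating windows are kept in extra_acked[]; after full bw is
++ * reached each window spans bbr_extra_acked_win_rtts rounds, but only one
++ * round while still in startup, and the estimate is the max of the two
++ * windows.)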
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. 
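++ * For example, with the default loss_thresh of 2%, a sample whose skb was
++ * sent with tx_in_flight of 200 packets is judged too high once more than
++ * 4 of those packets are newly marked lost; with the default ecn_thresh of
++ * 50%, it is judged too high once more than half of the newly delivered
++ * packets were CE-marked.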
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
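++ * For example, with the default inflight_headroom of 15% and an
++ * inflight_hi of 100 packets, this yields an operating point of roughly
++ * 85 packets (and never less than cwnd_min_target).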
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
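++ * For example, with the default beta of 30%, each round trip with loss
++ * (while not probing for bw) cuts bw_lo and inflight_lo to 70% of their
++ * prior values, floored at the bw and volume actually delivered in the
++ * latest round (bw_latest and inflight_latest).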
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
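++ * (The ECN EWMA, ecn_alpha, is maintained by bbr_update_ecn_alpha() as
++ * alpha = (1 - gain) * alpha + gain * ce_ratio, where gain is
++ * bbr_ecn_alpha_gain, 1/16 by default.)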
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. 
If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. 
If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
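++ *
++ * For example (illustrative numbers): if full_bw is 100 Mbit/s, any sample of
++ * at least 125 Mbit/s restarts the plateau detection and raises full_bw;
++ * otherwise each new round start increments full_bw_cnt, and after 3 such
++ * rounds without ~25% growth the pipe is considered full (full_bw_now is set).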
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
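++ *
++ * Note that try_fast_path is cleared whenever cwnd and pacing must be
++ * recomputed from the model, e.g. on a phase change in bbr_set_cycle_idx(),
++ * when exiting loss recovery, or after undoing a spurious recovery.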
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; 
++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 0ee22e10fcfa..492c143aed1b 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -376,7 +376,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -387,7 +387,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1126,7 +1126,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1507,6 +1512,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3832,7 +3848,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3849,6 +3866,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3859,6 +3877,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
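++ * The flag set below is consumed by the congestion control module; for
++ * example, BBR keeps (rather than advances) its per-round delivery
++ * filter in bbr_advance_latest_delivery_signals() when this flag is set.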
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3967,6 +3990,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4041,7 +4065,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4065,6 +4089,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4084,7 +4109,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5758,13 +5783,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 789e495d3bd6..dea9123e5c5d 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -466,6 +466,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index bc95d2a5924f..d4c45ca6fe06 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1606,7 +1609,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1681,6 +1684,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2038,13 +2065,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2770,6 +2796,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2982,6 +3009,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index b412ed88ccd9..d70f8b742b21 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -699,6 +699,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0005-crypto.patch b/sys-kernel/gentoo-sources-6.13/0005-crypto.patch new file mode 100644 index 0000000..a508f49 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0005-crypto.patch @@ -0,0 +1,774 @@ +From 0b97cf6a4825ec41c53e59294964c5e94810a593 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:28:58 +0100 +Subject: [PATCH 05/12] crypto + +Signed-off-by: Peter Jung +--- + arch/x86/crypto/aes-gcm-avx10-x86_64.S | 119 ++++----- + arch/x86/crypto/aes-xts-avx-x86_64.S | 329 +++++++++++++------------ + arch/x86/crypto/aesni-intel_glue.c | 10 +- + 3 files changed, 221 insertions(+), 237 deletions(-) + +diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S +index 97e0ee515fc5..02ee11083d4f 100644 +--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S ++++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S +@@ -88,7 +88,7 @@ + + // A shuffle mask that reflects the bytes of 16-byte blocks + .Lbswap_mask: +- .octa 0x000102030405060708090a0b0c0d0e0f ++ .octa 0x000102030405060708090a0b0c0d0e0f + + // This is the GHASH reducing polynomial without its constant term, i.e. 
+ // x^128 + x^7 + x^2 + x, represented using the backwards mapping +@@ -384,8 +384,8 @@ + vpshufd $0xd3, H_CUR_XMM, %xmm0 + vpsrad $31, %xmm0, %xmm0 + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM +- vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0 +- vpxor %xmm0, H_CUR_XMM, H_CUR_XMM ++ // H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit ++ vpternlogd $0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM + + // Load the gfpoly constant. + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY +@@ -562,6 +562,32 @@ + vpxord RNDKEY0, V3, V3 + .endm + ++// Do the last AES round for four vectors of counter blocks V0-V3, XOR source ++// data with the resulting keystream, and write the result to DST and ++// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) ++.macro _aesenclast_and_xor_4x ++ // XOR the source data with the last round key, saving the result in ++ // GHASHDATA[0-3]. This reduces latency by taking advantage of the ++ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). ++ vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 ++ vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 ++ vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 ++ vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 ++ ++ // Do the last AES round. This handles the XOR with the source data ++ // too, as per the optimization described above. ++ vaesenclast GHASHDATA0, V0, GHASHDATA0 ++ vaesenclast GHASHDATA1, V1, GHASHDATA1 ++ vaesenclast GHASHDATA2, V2, GHASHDATA2 ++ vaesenclast GHASHDATA3, V3, GHASHDATA3 ++ ++ // Store the en/decrypted data to DST. ++ vmovdqu8 GHASHDATA0, 0*VL(DST) ++ vmovdqu8 GHASHDATA1, 1*VL(DST) ++ vmovdqu8 GHASHDATA2, 2*VL(DST) ++ vmovdqu8 GHASHDATA3, 3*VL(DST) ++.endm ++ + // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, + // const u32 le_ctr[4], u8 ghash_acc[16], + // const u8 *src, u8 *dst, int datalen); +@@ -640,7 +666,7 @@ + // LE_CTR contains the next set of little-endian counter blocks. + .set LE_CTR, V12 + +- // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys, ++ // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. + .set RNDKEY0, V13 +@@ -650,15 +676,10 @@ + .set RNDKEY_M7, V17 + .set RNDKEY_M6, V18 + .set RNDKEY_M5, V19 +- +- // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with +- // the corresponding block of source data. This is useful because +- // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can +- // be computed in parallel with the AES rounds. +- .set RNDKEYLAST0, V20 +- .set RNDKEYLAST1, V21 +- .set RNDKEYLAST2, V22 +- .set RNDKEYLAST3, V23 ++ .set RNDKEY_M4, V20 ++ .set RNDKEY_M3, V21 ++ .set RNDKEY_M2, V22 ++ .set RNDKEY_M1, V23 + + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These + // cannot coincide with anything used for AES encryption, since for +@@ -713,7 +734,7 @@ + // Pre-subtracting 4*VL from DATALEN saves an instruction from the main + // loop and also ensures that at least one write always occurs to + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. +- sub $4*VL, DATALEN ++ add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 + jl .Lcrypt_loop_4x_done\@ + + // Load powers of the hash key. 
+@@ -748,26 +769,15 @@ + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b +- vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 +- vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 +- vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 +- vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 +- vaesenclast RNDKEYLAST0, V0, GHASHDATA0 +- vaesenclast RNDKEYLAST1, V1, GHASHDATA1 +- vaesenclast RNDKEYLAST2, V2, GHASHDATA2 +- vaesenclast RNDKEYLAST3, V3, GHASHDATA3 +- vmovdqu8 GHASHDATA0, 0*VL(DST) +- vmovdqu8 GHASHDATA1, 1*VL(DST) +- vmovdqu8 GHASHDATA2, 2*VL(DST) +- vmovdqu8 GHASHDATA3, 3*VL(DST) +- add $4*VL, SRC +- add $4*VL, DST +- sub $4*VL, DATALEN ++ _aesenclast_and_xor_4x ++ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 ++ sub $-4*VL, DST ++ add $-4*VL, DATALEN + jl .Lghash_last_ciphertext_4x\@ + .endif + + // Cache as many additional AES round keys as possible. +-.irp i, 9,8,7,6,5 ++.irp i, 9,8,7,6,5,4,3,2,1 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i + .endr + +@@ -799,50 +809,17 @@ + _vaesenc_4x RNDKEY + 128: + +- // XOR the source data with the last round key, saving the result in +- // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the +- // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +-.if \enc +- vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 +- vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 +- vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 +- vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 +-.else +- vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0 +- vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1 +- vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2 +- vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3 +-.endif +- + // Finish the AES encryption of the counter blocks in V0-V3, interleaved + // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. +-.irp i, 9,8,7,6,5 ++.irp i, 9,8,7,6,5,4,3,2,1 ++ _ghash_step_4x (9 - \i) + _vaesenc_4x RNDKEY_M\i +- _ghash_step_4x (9 - \i) +-.endr +-.irp i, 4,3,2,1 +- vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY +- _vaesenc_4x RNDKEY +- _ghash_step_4x (9 - \i) + .endr + _ghash_step_4x 9 +- +- // Do the last AES round. This handles the XOR with the source data +- // too, as per the optimization described above. +- vaesenclast RNDKEYLAST0, V0, GHASHDATA0 +- vaesenclast RNDKEYLAST1, V1, GHASHDATA1 +- vaesenclast RNDKEYLAST2, V2, GHASHDATA2 +- vaesenclast RNDKEYLAST3, V3, GHASHDATA3 +- +- // Store the en/decrypted data to DST. +- vmovdqu8 GHASHDATA0, 0*VL(DST) +- vmovdqu8 GHASHDATA1, 1*VL(DST) +- vmovdqu8 GHASHDATA2, 2*VL(DST) +- vmovdqu8 GHASHDATA3, 3*VL(DST) +- +- add $4*VL, SRC +- add $4*VL, DST +- sub $4*VL, DATALEN ++ _aesenclast_and_xor_4x ++ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 ++ sub $-4*VL, DST ++ add $-4*VL, DATALEN + jge .Lcrypt_loop_4x\@ + + .if \enc +@@ -856,7 +833,7 @@ + .Lcrypt_loop_4x_done\@: + + // Undo the extra subtraction by 4*VL and check whether data remains. +- add $4*VL, DATALEN ++ sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 + jz .Ldone\@ + + // The data length isn't a multiple of 4*VL. Process the remaining data +@@ -940,7 +917,7 @@ + // GHASH. However, any such blocks are all-zeroes, and the values that + // they're multiplied with are also all-zeroes. Therefore they just add + // 0 * 0 = 0 to the final GHASH result, which makes no difference. 
+- vmovdqu8 (POWERS_PTR), H_POW1 ++ vmovdqu8 (POWERS_PTR), H_POW1 + .if \enc + vmovdqu8 V0, V1{%k1}{z} + .endif +diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S +index 48f97b79f7a9..8a3e23fbcf85 100644 +--- a/arch/x86/crypto/aes-xts-avx-x86_64.S ++++ b/arch/x86/crypto/aes-xts-avx-x86_64.S +@@ -80,22 +80,6 @@ + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .text + +-// Function parameters +-.set KEY, %rdi // Initially points to crypto_aes_ctx, then is +- // advanced to point to 7th-from-last round key +-.set SRC, %rsi // Pointer to next source data +-.set DST, %rdx // Pointer to next destination data +-.set LEN, %ecx // Remaining length in bytes +-.set LEN8, %cl +-.set LEN64, %rcx +-.set TWEAK, %r8 // Pointer to next tweak +- +-// %rax holds the AES key length in bytes. +-.set KEYLEN, %eax +-.set KEYLEN64, %rax +- +-// %r9-r11 are available as temporaries. +- + .macro _define_Vi i + .if VL == 16 + .set V\i, %xmm\i +@@ -112,41 +96,31 @@ + // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers + // are available, that map to the xmm, ymm, or zmm registers according + // to the selected Vector Length (VL). +- _define_Vi 0 +- _define_Vi 1 +- _define_Vi 2 +- _define_Vi 3 +- _define_Vi 4 +- _define_Vi 5 +- _define_Vi 6 +- _define_Vi 7 +- _define_Vi 8 +- _define_Vi 9 +- _define_Vi 10 +- _define_Vi 11 +- _define_Vi 12 +- _define_Vi 13 +- _define_Vi 14 +- _define_Vi 15 ++.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ++ _define_Vi \i ++.endr + .if USE_AVX10 +- _define_Vi 16 +- _define_Vi 17 +- _define_Vi 18 +- _define_Vi 19 +- _define_Vi 20 +- _define_Vi 21 +- _define_Vi 22 +- _define_Vi 23 +- _define_Vi 24 +- _define_Vi 25 +- _define_Vi 26 +- _define_Vi 27 +- _define_Vi 28 +- _define_Vi 29 +- _define_Vi 30 +- _define_Vi 31 ++.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 ++ _define_Vi \i ++.endr + .endif + ++ // Function parameters ++ .set KEY, %rdi // Initially points to crypto_aes_ctx, then is ++ // advanced to point to 7th-from-last round key ++ .set SRC, %rsi // Pointer to next source data ++ .set DST, %rdx // Pointer to next destination data ++ .set LEN, %ecx // Remaining length in bytes ++ .set LEN8, %cl ++ .set LEN64, %rcx ++ .set TWEAK, %r8 // Pointer to next tweak ++ ++ // %rax holds the AES key length in bytes. ++ .set KEYLEN, %eax ++ .set KEYLEN64, %rax ++ ++ // %r9-r11 are available as temporaries. ++ + // V0-V3 hold the data blocks during the main loop, or temporary values + // otherwise. V4-V5 hold temporary values. + +@@ -214,6 +188,7 @@ + .endm + + // Move a vector between memory and a register. ++// The register operand must be in the first 16 vector registers. + .macro _vmovdqu src, dst + .if VL < 64 + vmovdqu \src, \dst +@@ -234,11 +209,12 @@ + .endm + + // XOR two vectors together. ++// Any register operands must be in the first 16 vector registers. + .macro _vpxor src1, src2, dst +-.if USE_AVX10 +- vpxord \src1, \src2, \dst +-.else ++.if VL < 64 + vpxor \src1, \src2, \dst ++.else ++ vpxord \src1, \src2, \dst + .endif + .endm + +@@ -259,8 +235,12 @@ + vpshufd $0x13, \src, \tmp + vpaddq \src, \src, \dst + vpsrad $31, \tmp, \tmp ++.if USE_AVX10 ++ vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst ++.else + vpand GF_POLY_XMM, \tmp, \tmp + vpxor \tmp, \dst, \dst ++.endif + .endm + + // Given the XTS tweak(s) in the vector \src, compute the next vector of +@@ -369,9 +349,14 @@ + + // Do one step in computing the next set of tweaks using the VPCLMULQDQ method + // (the same method _next_tweakvec uses for VL > 16). 
This means multiplying +-// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8 +-// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, +-// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. ++// each tweak by x^(4*VL/16) independently. ++// ++// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed ++// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq ++// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves ++// instructions directly. The 128-bit right shift (vpsrldq) performs better ++// than a 64-bit right shift on Intel CPUs in the context where it is used here, ++// because it runs on a different execution port from the AES instructions. + .macro _tweak_step_pclmul i + .if \i == 0 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 +@@ -406,7 +391,7 @@ + // \i that include at least 0 through 19, then 1000 which signals the last step. + // + // This is used to interleave the computation of the next set of tweaks with the +-// AES en/decryptions, which increases performance in some cases. ++// AES en/decryptions, which increases performance in some cases. Clobbers V5. + .macro _tweak_step i + .if VL == 16 + _tweak_step_mulx \i +@@ -443,9 +428,10 @@ + // the last round needs different instructions. + // + // An alternative approach would be to roll up all the round loops. We +- // don't do that because it isn't compatible with caching the round keys +- // in registers which we do when possible (see below), and also because +- // it seems unwise to rely *too* heavily on the CPU's branch predictor. ++ // don't do that because (a) it isn't compatible with caching the round ++ // keys in registers which we do when possible (see below), (b) we ++ // interleave the AES rounds with the XTS tweak computation, and (c) it ++ // seems unwise to rely *too* heavily on the CPU's branch predictor. + lea OFFS-16(KEY, KEYLEN64, 4), KEY + + // If all 32 SIMD registers are available, cache all the round keys. +@@ -472,90 +458,94 @@ + .endif + .endm + +-// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0) +-// on the block(s) in \data using the round key(s) in \key. The register length +-// determines the number of AES blocks en/decrypted. +-.macro _vaes enc, last, key, data ++// Do a single non-last round of AES encryption (if \enc==1) or decryption (if ++// \enc==0) on the block(s) in \data using the round key(s) in \key. The ++// register length determines the number of AES blocks en/decrypted. ++.macro _vaes enc, key, data + .if \enc +-.if \last +- vaesenclast \key, \data, \data +-.else + vaesenc \key, \data, \data +-.endif +-.else +-.if \last +- vaesdeclast \key, \data, \data + .else + vaesdec \key, \data, \data + .endif ++.endm ++ ++// Same as _vaes, but does the last round. ++.macro _vaeslast enc, key, data ++.if \enc ++ vaesenclast \key, \data, \data ++.else ++ vaesdeclast \key, \data, \data + .endif + .endm + +-// Do a single round of AES en/decryption on the block(s) in \data, using the +-// same key for all block(s). The round key is loaded from the appropriate +-// register or memory location for round \i. May clobber V4. +-.macro _vaes_1x enc, last, i, xmm_suffix, data ++// Do a single non-last round of AES en/decryption on the block(s) in \data, ++// using the same key for all block(s). The round key is loaded from the ++// appropriate register or memory location for round \i. May clobber \tmp. 
++.macro _vaes_1x enc, i, xmm_suffix, data, tmp + .if USE_AVX10 +- _vaes \enc, \last, KEY\i\xmm_suffix, \data ++ _vaes \enc, KEY\i\xmm_suffix, \data + .else + .ifnb \xmm_suffix +- _vaes \enc, \last, (\i-7)*16(KEY), \data ++ _vaes \enc, (\i-7)*16(KEY), \data + .else +- _vbroadcast128 (\i-7)*16(KEY), V4 +- _vaes \enc, \last, V4, \data ++ _vbroadcast128 (\i-7)*16(KEY), \tmp ++ _vaes \enc, \tmp, \data + .endif + .endif + .endm + +-// Do a single round of AES en/decryption on the blocks in registers V0-V3, +-// using the same key for all blocks. The round key is loaded from the ++// Do a single non-last round of AES en/decryption on the blocks in registers ++// V0-V3, using the same key for all blocks. The round key is loaded from the + // appropriate register or memory location for round \i. In addition, does two +-// steps of the computation of the next set of tweaks. May clobber V4. +-.macro _vaes_4x enc, last, i ++// steps of the computation of the next set of tweaks. May clobber V4 and V5. ++.macro _vaes_4x enc, i + .if USE_AVX10 + _tweak_step (2*(\i-5)) +- _vaes \enc, \last, KEY\i, V0 +- _vaes \enc, \last, KEY\i, V1 ++ _vaes \enc, KEY\i, V0 ++ _vaes \enc, KEY\i, V1 + _tweak_step (2*(\i-5) + 1) +- _vaes \enc, \last, KEY\i, V2 +- _vaes \enc, \last, KEY\i, V3 ++ _vaes \enc, KEY\i, V2 ++ _vaes \enc, KEY\i, V3 + .else + _vbroadcast128 (\i-7)*16(KEY), V4 + _tweak_step (2*(\i-5)) +- _vaes \enc, \last, V4, V0 +- _vaes \enc, \last, V4, V1 ++ _vaes \enc, V4, V0 ++ _vaes \enc, V4, V1 + _tweak_step (2*(\i-5) + 1) +- _vaes \enc, \last, V4, V2 +- _vaes \enc, \last, V4, V3 ++ _vaes \enc, V4, V2 ++ _vaes \enc, V4, V3 + .endif + .endm + + // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, + // then XOR with \tweak again) of the block(s) in \data. To process a single + // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of +-// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. +-.macro _aes_crypt enc, xmm_suffix, tweak, data ++// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp. ++.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp + _xor3 KEY0\xmm_suffix, \tweak, \data + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ +- _vaes_1x \enc, 0, 1, \xmm_suffix, \data +- _vaes_1x \enc, 0, 2, \xmm_suffix, \data ++ _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp ++ _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp + .Laes192\@: +- _vaes_1x \enc, 0, 3, \xmm_suffix, \data +- _vaes_1x \enc, 0, 4, \xmm_suffix, \data ++ _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp ++ _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp + .Laes128\@: +- _vaes_1x \enc, 0, 5, \xmm_suffix, \data +- _vaes_1x \enc, 0, 6, \xmm_suffix, \data +- _vaes_1x \enc, 0, 7, \xmm_suffix, \data +- _vaes_1x \enc, 0, 8, \xmm_suffix, \data +- _vaes_1x \enc, 0, 9, \xmm_suffix, \data +- _vaes_1x \enc, 0, 10, \xmm_suffix, \data +- _vaes_1x \enc, 0, 11, \xmm_suffix, \data +- _vaes_1x \enc, 0, 12, \xmm_suffix, \data +- _vaes_1x \enc, 0, 13, \xmm_suffix, \data +- _vaes_1x \enc, 1, 14, \xmm_suffix, \data +- _vpxor \tweak, \data, \data ++.irp i, 5,6,7,8,9,10,11,12,13 ++ _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp ++.endr ++.if USE_AVX10 ++ vpxord KEY14\xmm_suffix, \tweak, \tmp ++.else ++.ifnb \xmm_suffix ++ vpxor 7*16(KEY), \tweak, \tmp ++.else ++ _vbroadcast128 7*16(KEY), \tmp ++ vpxor \tweak, \tmp, \tmp ++.endif ++.endif ++ _vaeslast \enc, \tmp, \data + .endm + + .macro _aes_xts_crypt enc +@@ -581,7 +571,7 @@ + // Compute the first set of tweaks TWEAK[0-3]. 
+ _compute_first_set_of_tweaks + +- sub $4*VL, LEN ++ add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 + jl .Lhandle_remainder\@ + + .Lmain_loop\@: +@@ -589,10 +579,10 @@ + + // XOR each source block with its tweak and the zero-th round key. + .if USE_AVX10 +- vmovdqu8 0*VL(SRC), V0 +- vmovdqu8 1*VL(SRC), V1 +- vmovdqu8 2*VL(SRC), V2 +- vmovdqu8 3*VL(SRC), V3 ++ _vmovdqu 0*VL(SRC), V0 ++ _vmovdqu 1*VL(SRC), V1 ++ _vmovdqu 2*VL(SRC), V2 ++ _vmovdqu 3*VL(SRC), V3 + vpternlogd $0x96, TWEAK0, KEY0, V0 + vpternlogd $0x96, TWEAK1, KEY0, V1 + vpternlogd $0x96, TWEAK2, KEY0, V2 +@@ -612,28 +602,43 @@ + je .Laes192\@ + // Do all the AES rounds on the data blocks, interleaved with + // the computation of the next set of tweaks. +- _vaes_4x \enc, 0, 1 +- _vaes_4x \enc, 0, 2 ++ _vaes_4x \enc, 1 ++ _vaes_4x \enc, 2 + .Laes192\@: +- _vaes_4x \enc, 0, 3 +- _vaes_4x \enc, 0, 4 ++ _vaes_4x \enc, 3 ++ _vaes_4x \enc, 4 + .Laes128\@: +- _vaes_4x \enc, 0, 5 +- _vaes_4x \enc, 0, 6 +- _vaes_4x \enc, 0, 7 +- _vaes_4x \enc, 0, 8 +- _vaes_4x \enc, 0, 9 +- _vaes_4x \enc, 0, 10 +- _vaes_4x \enc, 0, 11 +- _vaes_4x \enc, 0, 12 +- _vaes_4x \enc, 0, 13 +- _vaes_4x \enc, 1, 14 +- +- // XOR in the tweaks again. +- _vpxor TWEAK0, V0, V0 +- _vpxor TWEAK1, V1, V1 +- _vpxor TWEAK2, V2, V2 +- _vpxor TWEAK3, V3, V3 ++.irp i, 5,6,7,8,9,10,11,12,13 ++ _vaes_4x \enc, \i ++.endr ++ // Do the last AES round, then XOR the results with the tweaks again. ++ // Reduce latency by doing the XOR before the vaesenclast, utilizing the ++ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) ++ // (and likewise for vaesdeclast). ++.if USE_AVX10 ++ _tweak_step 18 ++ _tweak_step 19 ++ vpxord TWEAK0, KEY14, V4 ++ vpxord TWEAK1, KEY14, V5 ++ _vaeslast \enc, V4, V0 ++ _vaeslast \enc, V5, V1 ++ vpxord TWEAK2, KEY14, V4 ++ vpxord TWEAK3, KEY14, V5 ++ _vaeslast \enc, V4, V2 ++ _vaeslast \enc, V5, V3 ++.else ++ _vbroadcast128 7*16(KEY), V4 ++ _tweak_step 18 // uses V5 ++ _tweak_step 19 // uses V5 ++ vpxor TWEAK0, V4, V5 ++ _vaeslast \enc, V5, V0 ++ vpxor TWEAK1, V4, V5 ++ _vaeslast \enc, V5, V1 ++ vpxor TWEAK2, V4, V5 ++ vpxor TWEAK3, V4, V4 ++ _vaeslast \enc, V5, V2 ++ _vaeslast \enc, V4, V3 ++.endif + + // Store the destination blocks. + _vmovdqu V0, 0*VL(DST) +@@ -644,9 +649,9 @@ + // Finish computing the next set of tweaks. + _tweak_step 1000 + +- add $4*VL, SRC +- add $4*VL, DST +- sub $4*VL, LEN ++ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 ++ sub $-4*VL, DST ++ add $-4*VL, LEN + jge .Lmain_loop\@ + + // Check for the uncommon case where the data length isn't a multiple of +@@ -670,7 +675,7 @@ + jl .Lvec_at_a_time_done\@ + .Lvec_at_a_time\@: + _vmovdqu (SRC), V0 +- _aes_crypt \enc, , TWEAK0, V0 ++ _aes_crypt \enc, , TWEAK0, V0, tmp=V1 + _vmovdqu V0, (DST) + _next_tweakvec TWEAK0, V0, V1, TWEAK0 + add $VL, SRC +@@ -687,7 +692,7 @@ + jl .Lblock_at_a_time_done\@ + .Lblock_at_a_time\@: + vmovdqu (SRC), %xmm0 +- _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 ++ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 + vmovdqu %xmm0, (DST) + _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM + add $16, SRC +@@ -715,7 +720,7 @@ + // Do it now by advancing the tweak and decrypting the last full block. + _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM + vmovdqu (SRC), %xmm0 +- _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0 ++ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 + .endif + + .if USE_AVX10 +@@ -758,47 +763,49 @@ + vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 + .endif + // En/decrypt again and store the last full block. 
+- _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 ++ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 + vmovdqu %xmm0, (DST) + jmp .Ldone\@ + .endm + + // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + // u8 iv[AES_BLOCK_SIZE]); ++// ++// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes ++// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10. + SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) +- vmovdqu (%rsi), %xmm0 +- vpxor (%rdi), %xmm0, %xmm0 +- movl 480(%rdi), %eax // AES key length +- lea -16(%rdi, %rax, 4), %rdi +- cmp $24, %eax ++ .set TWEAK_KEY, %rdi ++ .set IV, %rsi ++ .set KEYLEN, %eax ++ .set KEYLEN64, %rax ++ ++ vmovdqu (IV), %xmm0 ++ vpxor (TWEAK_KEY), %xmm0, %xmm0 ++ movl 480(TWEAK_KEY), KEYLEN ++ lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY ++ cmp $24, KEYLEN + jl .Lencrypt_iv_aes128 + je .Lencrypt_iv_aes192 +- vaesenc -6*16(%rdi), %xmm0, %xmm0 +- vaesenc -5*16(%rdi), %xmm0, %xmm0 ++ vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0 ++ vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0 + .Lencrypt_iv_aes192: +- vaesenc -4*16(%rdi), %xmm0, %xmm0 +- vaesenc -3*16(%rdi), %xmm0, %xmm0 ++ vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0 ++ vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0 + .Lencrypt_iv_aes128: +- vaesenc -2*16(%rdi), %xmm0, %xmm0 +- vaesenc -1*16(%rdi), %xmm0, %xmm0 +- vaesenc 0*16(%rdi), %xmm0, %xmm0 +- vaesenc 1*16(%rdi), %xmm0, %xmm0 +- vaesenc 2*16(%rdi), %xmm0, %xmm0 +- vaesenc 3*16(%rdi), %xmm0, %xmm0 +- vaesenc 4*16(%rdi), %xmm0, %xmm0 +- vaesenc 5*16(%rdi), %xmm0, %xmm0 +- vaesenc 6*16(%rdi), %xmm0, %xmm0 +- vaesenclast 7*16(%rdi), %xmm0, %xmm0 +- vmovdqu %xmm0, (%rsi) ++.irp i, -2,-1,0,1,2,3,4,5,6 ++ vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0 ++.endr ++ vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0 ++ vmovdqu %xmm0, (IV) + RET + SYM_FUNC_END(aes_xts_encrypt_iv) + + // Below are the actual AES-XTS encryption and decryption functions, + // instantiated from the above macro. They all have the following prototype: + // +-// void (*xts_asm_func)(const struct crypto_aes_ctx *key, +-// const u8 *src, u8 *dst, unsigned int len, +-// u8 tweak[AES_BLOCK_SIZE]); ++// void (*xts_crypt_func)(const struct crypto_aes_ctx *key, ++// const u8 *src, u8 *dst, int len, ++// u8 tweak[AES_BLOCK_SIZE]); + // + // |key| is the data key. |tweak| contains the next tweak; the encryption of + // the original IV with the tweak key was already done. This function supports +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index fbf43482e1f5..11e95fc62636 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -505,7 +505,7 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, + typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); + typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, +- const u8 *src, u8 *dst, unsigned int len, ++ const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]); + + /* This handles cases where the source and/or destination span pages. 
*/ +@@ -624,14 +624,14 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + } + + static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, +- const u8 *src, u8 *dst, unsigned int len, ++ const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]) + { + aesni_xts_enc(key, dst, src, len, tweak); + } + + static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, +- const u8 *src, u8 *dst, unsigned int len, ++ const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]) + { + aesni_xts_dec(key, dst, src, len, tweak); +@@ -790,10 +790,10 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + \ + asmlinkage void \ + aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ +- u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ ++ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ + asmlinkage void \ + aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ +- u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ ++ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ + \ + static int xts_encrypt_##suffix(struct skcipher_request *req) \ + { \ +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0007-itmt-core-ranking.patch b/sys-kernel/gentoo-sources-6.13/0007-itmt-core-ranking.patch new file mode 100644 index 0000000..eda5a21 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0007-itmt-core-ranking.patch @@ -0,0 +1,365 @@ +From 226e2a915189fff660383f067038534fe0346694 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:31:52 +0100 +Subject: [PATCH 07/12] itmt-core-ranking + +Signed-off-by: Peter Jung +--- + arch/x86/include/asm/topology.h | 4 +- + arch/x86/kernel/itmt.c | 81 ++++++++++++++------------------- + arch/x86/kernel/smpboot.c | 8 +--- + kernel/sched/fair.c | 42 +++++++++++++---- + kernel/sched/sched.h | 1 - + kernel/sched/topology.c | 15 +----- + 6 files changed, 69 insertions(+), 82 deletions(-) + +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index fd41103ad342..63bab25a4896 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -250,7 +250,7 @@ extern bool x86_topology_update; + #include + + DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); +-extern unsigned int __read_mostly sysctl_sched_itmt_enabled; ++extern bool __read_mostly sysctl_sched_itmt_enabled; + + /* Interface to set priority of a cpu */ + void sched_set_itmt_core_prio(int prio, int core_cpu); +@@ -263,7 +263,7 @@ void sched_clear_itmt_support(void); + + #else /* CONFIG_SCHED_MC_PRIO */ + +-#define sysctl_sched_itmt_enabled 0 ++#define sysctl_sched_itmt_enabled false + static inline void sched_set_itmt_core_prio(int prio, int core_cpu) + { + } +diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c +index 51b805c727fc..9cea1fc36c18 100644 +--- a/arch/x86/kernel/itmt.c ++++ b/arch/x86/kernel/itmt.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -34,49 +35,38 @@ static bool __read_mostly sched_itmt_capable; + * of higher turbo frequency for cpus supporting Intel Turbo Boost Max + * Technology 3.0. 
+ * +- * It can be set via /proc/sys/kernel/sched_itmt_enabled ++ * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled + */ +-unsigned int __read_mostly sysctl_sched_itmt_enabled; ++bool __read_mostly sysctl_sched_itmt_enabled; + +-static int sched_itmt_update_handler(const struct ctl_table *table, int write, +- void *buffer, size_t *lenp, loff_t *ppos) ++static ssize_t sched_itmt_enabled_write(struct file *filp, ++ const char __user *ubuf, ++ size_t cnt, loff_t *ppos) + { +- unsigned int old_sysctl; +- int ret; ++ ssize_t result; ++ bool orig; + +- mutex_lock(&itmt_update_mutex); ++ guard(mutex)(&itmt_update_mutex); + +- if (!sched_itmt_capable) { +- mutex_unlock(&itmt_update_mutex); +- return -EINVAL; +- } +- +- old_sysctl = sysctl_sched_itmt_enabled; +- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ orig = sysctl_sched_itmt_enabled; ++ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); + +- if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { ++ if (sysctl_sched_itmt_enabled != orig) { + x86_topology_update = true; + rebuild_sched_domains(); + } + +- mutex_unlock(&itmt_update_mutex); +- +- return ret; ++ return result; + } + +-static struct ctl_table itmt_kern_table[] = { +- { +- .procname = "sched_itmt_enabled", +- .data = &sysctl_sched_itmt_enabled, +- .maxlen = sizeof(unsigned int), +- .mode = 0644, +- .proc_handler = sched_itmt_update_handler, +- .extra1 = SYSCTL_ZERO, +- .extra2 = SYSCTL_ONE, +- }, ++static const struct file_operations dfs_sched_itmt_fops = { ++ .read = debugfs_read_file_bool, ++ .write = sched_itmt_enabled_write, ++ .open = simple_open, ++ .llseek = default_llseek, + }; + +-static struct ctl_table_header *itmt_sysctl_header; ++static struct dentry *dfs_sched_itmt; + + /** + * sched_set_itmt_support() - Indicate platform supports ITMT +@@ -97,16 +87,18 @@ static struct ctl_table_header *itmt_sysctl_header; + */ + int sched_set_itmt_support(void) + { +- mutex_lock(&itmt_update_mutex); ++ guard(mutex)(&itmt_update_mutex); + +- if (sched_itmt_capable) { +- mutex_unlock(&itmt_update_mutex); ++ if (sched_itmt_capable) + return 0; +- } + +- itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table); +- if (!itmt_sysctl_header) { +- mutex_unlock(&itmt_update_mutex); ++ dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled", ++ 0644, ++ arch_debugfs_dir, ++ &sysctl_sched_itmt_enabled, ++ &dfs_sched_itmt_fops); ++ if (IS_ERR_OR_NULL(dfs_sched_itmt)) { ++ dfs_sched_itmt = NULL; + return -ENOMEM; + } + +@@ -117,8 +109,6 @@ int sched_set_itmt_support(void) + x86_topology_update = true; + rebuild_sched_domains(); + +- mutex_unlock(&itmt_update_mutex); +- + return 0; + } + +@@ -134,18 +124,15 @@ int sched_set_itmt_support(void) + */ + void sched_clear_itmt_support(void) + { +- mutex_lock(&itmt_update_mutex); ++ guard(mutex)(&itmt_update_mutex); + +- if (!sched_itmt_capable) { +- mutex_unlock(&itmt_update_mutex); ++ if (!sched_itmt_capable) + return; +- } ++ + sched_itmt_capable = false; + +- if (itmt_sysctl_header) { +- unregister_sysctl_table(itmt_sysctl_header); +- itmt_sysctl_header = NULL; +- } ++ debugfs_remove(dfs_sched_itmt); ++ dfs_sched_itmt = NULL; + + if (sysctl_sched_itmt_enabled) { + /* disable sched_itmt if we are no longer ITMT capable */ +@@ -153,8 +140,6 @@ void sched_clear_itmt_support(void) + x86_topology_update = true; + rebuild_sched_domains(); + } +- +- mutex_unlock(&itmt_update_mutex); + } + + int arch_asym_cpu_priority(int cpu) +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c 
+index f1fac08fdef2..ef63b1c0b491 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -482,12 +482,6 @@ static int x86_core_flags(void) + return cpu_core_flags() | x86_sched_itmt_flags(); + } + #endif +-#ifdef CONFIG_SCHED_SMT +-static int x86_smt_flags(void) +-{ +- return cpu_smt_flags(); +-} +-#endif + #ifdef CONFIG_SCHED_CLUSTER + static int x86_cluster_flags(void) + { +@@ -510,7 +504,7 @@ static void __init build_sched_topology(void) + + #ifdef CONFIG_SCHED_SMT + x86_topology[i++] = (struct sched_domain_topology_level){ +- cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) ++ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) + }; + #endif + #ifdef CONFIG_SCHED_CLUSTER +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cdb81cb0812c..232e2695a2cd 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9866,6 +9866,8 @@ struct sg_lb_stats { + unsigned int group_weight; + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ ++ unsigned int asym_prefer_cpu; /* Group CPU with highest asym priority */ ++ int highest_asym_prio; /* Asym priority of asym_prefer_cpu */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING +@@ -10195,7 +10197,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group + (sgs->group_weight - sgs->idle_cpus != 1)) + return false; + +- return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); ++ return sched_asym(env->sd, env->dst_cpu, sgs->asym_prefer_cpu); + } + + /* One group has more than one SMT CPU while the other group does not */ +@@ -10276,6 +10278,17 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++static inline void ++update_sg_pick_asym_prefer(struct sg_lb_stats *sgs, int cpu) ++{ ++ int asym_prio = arch_asym_cpu_priority(cpu); ++ ++ if (asym_prio > sgs->highest_asym_prio) { ++ sgs->asym_prefer_cpu = cpu; ++ sgs->highest_asym_prio = asym_prio; ++ } ++} ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10292,11 +10305,13 @@ static inline void update_sg_lb_stats(struct lb_env *env, + bool *sg_overloaded, + bool *sg_overutilized) + { +- int i, nr_running, local_group; ++ int i, nr_running, local_group, sd_flags = env->sd->flags; ++ bool balancing_at_rd = !env->sd->parent; + + memset(sgs, 0, sizeof(*sgs)); + + local_group = group == sds->local; ++ sgs->highest_asym_prio = INT_MIN; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { + struct rq *rq = cpu_rq(i); +@@ -10310,16 +10325,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, + nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + +- if (nr_running > 1) +- *sg_overloaded = 1; ++ if (sd_flags & SD_ASYM_PACKING) ++ update_sg_pick_asym_prefer(sgs, i); + + if (cpu_overutilized(i)) + *sg_overutilized = 1; + +-#ifdef CONFIG_NUMA_BALANCING +- sgs->nr_numa_running += rq->nr_numa_running; +- sgs->nr_preferred_running += rq->nr_preferred_running; +-#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +@@ -10329,10 +10340,21 @@ static inline void update_sg_lb_stats(struct lb_env *env, + continue; + } + ++ /* Overload indicator is only updated at root domain */ ++ if (balancing_at_rd && nr_running > 1) ++ *sg_overloaded = 1; ++ ++#ifdef CONFIG_NUMA_BALANCING ++ /* Only fbq_classify_group() uses this to classify NUMA groups */ ++ if (sd_flags & SD_NUMA) { ++ sgs->nr_numa_running += rq->nr_numa_running; ++ sgs->nr_preferred_running += rq->nr_preferred_running; ++ } ++#endif + if (local_group) + continue; + +- if (env->sd->flags & SD_ASYM_CPUCAPACITY) { ++ if (sd_flags & SD_ASYM_CPUCAPACITY) { + /* Check for a misfit task on the cpu */ + if (sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; +@@ -10427,7 +10449,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, + + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ +- return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); ++ return sched_asym_prefer(busiest->asym_prefer_cpu, sgs->asym_prefer_cpu); + + case group_misfit_task: + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 18f3955ddb8f..6a9efb0fd86f 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2056,7 +2056,6 @@ struct sched_group { + unsigned int group_weight; + unsigned int cores; + struct sched_group_capacity *sgc; +- int asym_prefer_cpu; /* CPU of highest priority in group */ + int flags; + + /* +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 9748a4c8d668..59b8157cb114 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1302,7 +1302,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + WARN_ON(!sg); + + do { +- int cpu, cores = 0, max_cpu = -1; ++ int cpu, cores = 0; + + sg->group_weight = cpumask_weight(sched_group_span(sg)); + +@@ -1314,19 +1314,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + #endif + } + sg->cores = cores; +- +- if (!(sd->flags & SD_ASYM_PACKING)) +- goto next; +- +- for_each_cpu(cpu, sched_group_span(sg)) { +- if (max_cpu < 0) +- max_cpu = cpu; +- else if (sched_asym_prefer(cpu, max_cpu)) +- max_cpu = cpu; +- } +- sg->asym_prefer_cpu = max_cpu; +- +-next: + sg = sg->next; + } while (sg != sd->groups); + +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0008-ntsync.patch b/sys-kernel/gentoo-sources-6.13/0008-ntsync.patch new file mode 100644 index 0000000..3819a18 --- /dev/null +++ 
b/sys-kernel/gentoo-sources-6.13/0008-ntsync.patch @@ -0,0 +1,3050 @@ +From 5d635a3b91cbeba5def2a1a1bf1fd64b4a511923 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:32:02 +0100 +Subject: [PATCH 08/12] ntsync + +Signed-off-by: Peter Jung +--- + Documentation/userspace-api/index.rst | 1 + + Documentation/userspace-api/ntsync.rst | 385 +++++ + MAINTAINERS | 9 + + drivers/misc/Kconfig | 1 - + drivers/misc/ntsync.c | 1001 +++++++++++- + include/uapi/linux/ntsync.h | 42 +- + tools/testing/selftests/Makefile | 1 + + .../selftests/drivers/ntsync/.gitignore | 1 + + .../testing/selftests/drivers/ntsync/Makefile | 7 + + tools/testing/selftests/drivers/ntsync/config | 1 + + .../testing/selftests/drivers/ntsync/ntsync.c | 1343 +++++++++++++++++ + 11 files changed, 2773 insertions(+), 19 deletions(-) + create mode 100644 Documentation/userspace-api/ntsync.rst + create mode 100644 tools/testing/selftests/drivers/ntsync/.gitignore + create mode 100644 tools/testing/selftests/drivers/ntsync/Makefile + create mode 100644 tools/testing/selftests/drivers/ntsync/config + create mode 100644 tools/testing/selftests/drivers/ntsync/ntsync.c + +diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst +index 274cc7546efc..9c1b15cd89ab 100644 +--- a/Documentation/userspace-api/index.rst ++++ b/Documentation/userspace-api/index.rst +@@ -63,6 +63,7 @@ Everything else + vduse + futex2 + perf_ring_buffer ++ ntsync + + .. only:: subproject and html + +diff --git a/Documentation/userspace-api/ntsync.rst b/Documentation/userspace-api/ntsync.rst +new file mode 100644 +index 000000000000..25e7c4aef968 +--- /dev/null ++++ b/Documentation/userspace-api/ntsync.rst +@@ -0,0 +1,385 @@ ++=================================== ++NT synchronization primitive driver ++=================================== ++ ++This page documents the user-space API for the ntsync driver. ++ ++ntsync is a support driver for emulation of NT synchronization ++primitives by user-space NT emulators. It exists because implementation ++in user-space, using existing tools, cannot match Windows performance ++while offering accurate semantics. It is implemented entirely in ++software, and does not drive any hardware device. ++ ++This interface is meant as a compatibility tool only, and should not ++be used for general synchronization. Instead use generic, versatile ++interfaces such as futex(2) and poll(2). ++ ++Synchronization primitives ++========================== ++ ++The ntsync driver exposes three types of synchronization primitives: ++semaphores, mutexes, and events. ++ ++A semaphore holds a single volatile 32-bit counter, and a static 32-bit ++integer denoting the maximum value. It is considered signaled (that is, ++can be acquired without contention, or will wake up a waiting thread) ++when the counter is nonzero. The counter is decremented by one when a ++wait is satisfied. Both the initial and maximum count are established ++when the semaphore is created. ++ ++A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit ++identifier denoting its owner. A mutex is considered signaled when its ++owner is zero (indicating that it is not owned). The recursion count is ++incremented when a wait is satisfied, and ownership is set to the given ++identifier. ++ ++A mutex also holds an internal flag denoting whether its previous owner ++has died; such a mutex is said to be abandoned. 
Owner death is not ++tracked automatically based on thread death, but rather must be ++communicated using ``NTSYNC_IOC_MUTEX_KILL``. An abandoned mutex is ++inherently considered unowned. ++ ++Except for the "unowned" semantics of zero, the actual value of the ++owner identifier is not interpreted by the ntsync driver at all. The ++intended use is to store a thread identifier; however, the ntsync ++driver does not actually validate that a calling thread provides ++consistent or unique identifiers. ++ ++An event is similar to a semaphore with a maximum count of one. It holds ++a volatile boolean state denoting whether it is signaled or not. There ++are two types of events, auto-reset and manual-reset. An auto-reset ++event is designaled when a wait is satisfied; a manual-reset event is ++not. The event type is specified when the event is created. ++ ++Unless specified otherwise, all operations on an object are atomic and ++totally ordered with respect to other operations on the same object. ++ ++Objects are represented by files. When all file descriptors to an ++object are closed, that object is deleted. ++ ++Char device ++=========== ++ ++The ntsync driver creates a single char device /dev/ntsync. Each file ++description opened on the device represents a unique instance intended ++to back an individual NT virtual machine. Objects created by one ntsync ++instance may only be used with other objects created by the same ++instance. ++ ++ioctl reference ++=============== ++ ++All operations on the device are done through ioctls. There are four ++structures used in ioctl calls:: ++ ++ struct ntsync_sem_args { ++ __u32 count; ++ __u32 max; ++ }; ++ ++ struct ntsync_mutex_args { ++ __u32 owner; ++ __u32 count; ++ }; ++ ++ struct ntsync_event_args { ++ __u32 signaled; ++ __u32 manual; ++ }; ++ ++ struct ntsync_wait_args { ++ __u64 timeout; ++ __u64 objs; ++ __u32 count; ++ __u32 owner; ++ __u32 index; ++ __u32 alert; ++ __u32 flags; ++ __u32 pad; ++ }; ++ ++Depending on the ioctl, members of the structure may be used as input, ++output, or not at all. ++ ++The ioctls on the device file are as follows: ++ ++.. c:macro:: NTSYNC_IOC_CREATE_SEM ++ ++ Create a semaphore object. Takes a pointer to struct ++ :c:type:`ntsync_sem_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``count`` ++ - Initial count of the semaphore. ++ * - ``max`` ++ - Maximum count of the semaphore. ++ ++ Fails with ``EINVAL`` if ``count`` is greater than ``max``. ++ On success, returns a file descriptor the created semaphore. ++ ++.. c:macro:: NTSYNC_IOC_CREATE_MUTEX ++ ++ Create a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``count`` ++ - Initial recursion count of the mutex. ++ * - ``owner`` ++ - Initial owner of the mutex. ++ ++ If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is ++ zero and ``count`` is nonzero, the function fails with ``EINVAL``. ++ On success, returns a file descriptor the created mutex. ++ ++.. c:macro:: NTSYNC_IOC_CREATE_EVENT ++ ++ Create an event object. Takes a pointer to struct ++ :c:type:`ntsync_event_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``signaled`` ++ - If nonzero, the event is initially signaled, otherwise ++ nonsignaled. ++ * - ``manual`` ++ - If nonzero, the event is a manual-reset event, otherwise ++ auto-reset. ++ ++ On success, returns a file descriptor the created event. ++ ++The ioctls on the individual objects are as follows: ++ ++.. 
c:macro:: NTSYNC_IOC_SEM_POST ++ ++ Post to a semaphore object. Takes a pointer to a 32-bit integer, ++ which on input holds the count to be added to the semaphore, and on ++ output contains its previous count. ++ ++ If adding to the semaphore's current count would raise the latter ++ past the semaphore's maximum count, the ioctl fails with ++ ``EOVERFLOW`` and the semaphore is not affected. If raising the ++ semaphore's count causes it to become signaled, eligible threads ++ waiting on this semaphore will be woken and the semaphore's count ++ decremented appropriately. ++ ++.. c:macro:: NTSYNC_IOC_MUTEX_UNLOCK ++ ++ Release a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``owner`` ++ - Specifies the owner trying to release this mutex. ++ * - ``count`` ++ - On output, contains the previous recursion count. ++ ++ If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner`` ++ is not the current owner of the mutex, the ioctl fails with ++ ``EPERM``. ++ ++ The mutex's count will be decremented by one. If decrementing the ++ mutex's count causes it to become zero, the mutex is marked as ++ unowned and signaled, and eligible threads waiting on it will be ++ woken as appropriate. ++ ++.. c:macro:: NTSYNC_IOC_SET_EVENT ++ ++ Signal an event object. Takes a pointer to a 32-bit integer, which on ++ output contains the previous state of the event. ++ ++ Eligible threads will be woken, and auto-reset events will be ++ designaled appropriately. ++ ++.. c:macro:: NTSYNC_IOC_RESET_EVENT ++ ++ Designal an event object. Takes a pointer to a 32-bit integer, which ++ on output contains the previous state of the event. ++ ++.. c:macro:: NTSYNC_IOC_PULSE_EVENT ++ ++ Wake threads waiting on an event object while leaving it in an ++ unsignaled state. Takes a pointer to a 32-bit integer, which on ++ output contains the previous state of the event. ++ ++ A pulse operation can be thought of as a set followed by a reset, ++ performed as a single atomic operation. If two threads are waiting on ++ an auto-reset event which is pulsed, only one will be woken. If two ++ threads are waiting a manual-reset event which is pulsed, both will ++ be woken. However, in both cases, the event will be unsignaled ++ afterwards, and a simultaneous read operation will always report the ++ event as unsignaled. ++ ++.. c:macro:: NTSYNC_IOC_READ_SEM ++ ++ Read the current state of a semaphore object. Takes a pointer to ++ struct :c:type:`ntsync_sem_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``count`` ++ - On output, contains the current count of the semaphore. ++ * - ``max`` ++ - On output, contains the maximum count of the semaphore. ++ ++.. c:macro:: NTSYNC_IOC_READ_MUTEX ++ ++ Read the current state of a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``owner`` ++ - On output, contains the current owner of the mutex, or zero ++ if the mutex is not currently owned. ++ * - ``count`` ++ - On output, contains the current recursion count of the mutex. ++ ++ If the mutex is marked as abandoned, the function fails with ++ ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to ++ zero. ++ ++.. c:macro:: NTSYNC_IOC_READ_EVENT ++ ++ Read the current state of an event object. Takes a pointer to struct ++ :c:type:`ntsync_event_args`, which is used as follows: ++ ++ .. 
list-table:: ++ ++ * - ``signaled`` ++ - On output, contains the current state of the event. ++ * - ``manual`` ++ - On output, contains 1 if the event is a manual-reset event, ++ and 0 otherwise. ++ ++.. c:macro:: NTSYNC_IOC_KILL_OWNER ++ ++ Mark a mutex as unowned and abandoned if it is owned by the given ++ owner. Takes an input-only pointer to a 32-bit integer denoting the ++ owner. If the owner is zero, the ioctl fails with ``EINVAL``. If the ++ owner does not own the mutex, the function fails with ``EPERM``. ++ ++ Eligible threads waiting on the mutex will be woken as appropriate ++ (and such waits will fail with ``EOWNERDEAD``, as described below). ++ ++.. c:macro:: NTSYNC_IOC_WAIT_ANY ++ ++ Poll on any of a list of objects, atomically acquiring at most one. ++ Takes a pointer to struct :c:type:`ntsync_wait_args`, which is ++ used as follows: ++ ++ .. list-table:: ++ ++ * - ``timeout`` ++ - Absolute timeout in nanoseconds. If ``NTSYNC_WAIT_REALTIME`` ++ is set, the timeout is measured against the REALTIME clock; ++ otherwise it is measured against the MONOTONIC clock. If the ++ timeout is equal to or earlier than the current time, the ++ function returns immediately without sleeping. If ``timeout`` ++ is U64_MAX, the function will sleep until an object is ++ signaled, and will not fail with ``ETIMEDOUT``. ++ * - ``objs`` ++ - Pointer to an array of ``count`` file descriptors ++ (specified as an integer so that the structure has the same ++ size regardless of architecture). If any object is ++ invalid, the function fails with ``EINVAL``. ++ * - ``count`` ++ - Number of objects specified in the ``objs`` array. ++ If greater than ``NTSYNC_MAX_WAIT_COUNT``, the function fails ++ with ``EINVAL``. ++ * - ``owner`` ++ - Mutex owner identifier. If any object in ``objs`` is a mutex, ++ the ioctl will attempt to acquire that mutex on behalf of ++ ``owner``. If ``owner`` is zero, the ioctl fails with ++ ``EINVAL``. ++ * - ``index`` ++ - On success, contains the index (into ``objs``) of the object ++ which was signaled. If ``alert`` was signaled instead, ++ this contains ``count``. ++ * - ``alert`` ++ - Optional event object file descriptor. If nonzero, this ++ specifies an "alert" event object which, if signaled, will ++ terminate the wait. If nonzero, the identifier must point to a ++ valid event. ++ * - ``flags`` ++ - Zero or more flags. Currently the only flag is ++ ``NTSYNC_WAIT_REALTIME``, which causes the timeout to be ++ measured against the REALTIME clock instead of MONOTONIC. ++ * - ``pad`` ++ - Unused, must be set to zero. ++ ++ This function attempts to acquire one of the given objects. If unable ++ to do so, it sleeps until an object becomes signaled, subsequently ++ acquiring it, or the timeout expires. In the latter case the ioctl ++ fails with ``ETIMEDOUT``. The function only acquires one object, even ++ if multiple objects are signaled. ++ ++ A semaphore is considered to be signaled if its count is nonzero, and ++ is acquired by decrementing its count by one. A mutex is considered ++ to be signaled if it is unowned or if its owner matches the ``owner`` ++ argument, and is acquired by incrementing its recursion count by one ++ and setting its owner to the ``owner`` argument. An auto-reset event ++ is acquired by designaling it; a manual-reset event is not affected ++ by acquisition. ++ ++ Acquisition is atomic and totally ordered with respect to other ++ operations on the same object. 
If two wait operations (with different ++ ``owner`` identifiers) are queued on the same mutex, only one is ++ signaled. If two wait operations are queued on the same semaphore, ++ and a value of one is posted to it, only one is signaled. ++ ++ If an abandoned mutex is acquired, the ioctl fails with ++ ``EOWNERDEAD``. Although this is a failure return, the function may ++ otherwise be considered successful. The mutex is marked as owned by ++ the given owner (with a recursion count of 1) and as no longer ++ abandoned, and ``index`` is still set to the index of the mutex. ++ ++ The ``alert`` argument is an "extra" event which can terminate the ++ wait, independently of all other objects. ++ ++ It is valid to pass the same object more than once, including by ++ passing the same event in the ``objs`` array and in ``alert``. If a ++ wakeup occurs due to that object being signaled, ``index`` is set to ++ the lowest index corresponding to that object. ++ ++ The function may fail with ``EINTR`` if a signal is received. ++ ++.. c:macro:: NTSYNC_IOC_WAIT_ALL ++ ++ Poll on a list of objects, atomically acquiring all of them. Takes a ++ pointer to struct :c:type:`ntsync_wait_args`, which is used ++ identically to ``NTSYNC_IOC_WAIT_ANY``, except that ``index`` is ++ always filled with zero on success if not woken via alert. ++ ++ This function attempts to simultaneously acquire all of the given ++ objects. If unable to do so, it sleeps until all objects become ++ simultaneously signaled, subsequently acquiring them, or the timeout ++ expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and no ++ objects are modified. ++ ++ Objects may become signaled and subsequently designaled (through ++ acquisition by other threads) while this thread is sleeping. Only ++ once all objects are simultaneously signaled does the ioctl acquire ++ them and return. The entire acquisition is atomic and totally ordered ++ with respect to other operations on any of the given objects. ++ ++ If an abandoned mutex is acquired, the ioctl fails with ++ ``EOWNERDEAD``. Similarly to ``NTSYNC_IOC_WAIT_ANY``, all objects are ++ nevertheless marked as acquired. Note that if multiple mutex objects ++ are specified, there is no way to know which were marked as ++ abandoned. ++ ++ As with "any" waits, the ``alert`` argument is an "extra" event which ++ can terminate the wait. Critically, however, an "all" wait will ++ succeed if all members in ``objs`` are signaled, *or* if ``alert`` is ++ signaled. In the latter case ``index`` will be set to ``count``. As ++ with "any" waits, if both conditions are filled, the former takes ++ priority, and objects in ``objs`` will be acquired. ++ ++ Unlike ``NTSYNC_IOC_WAIT_ANY``, it is not valid to pass the same ++ object more than once, nor is it valid to pass the same object in ++ ``objs`` and in ``alert``. If this is attempted, the function fails ++ with ``EINVAL``. 
+diff --git a/MAINTAINERS b/MAINTAINERS +index 0fa7c5728f1e..efecb59adfe6 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -16709,6 +16709,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git + F: Documentation/filesystems/ntfs3.rst + F: fs/ntfs3/ + ++NTSYNC SYNCHRONIZATION PRIMITIVE DRIVER ++M: Elizabeth Figura ++L: wine-devel@winehq.org ++S: Supported ++F: Documentation/userspace-api/ntsync.rst ++F: drivers/misc/ntsync.c ++F: include/uapi/linux/ntsync.h ++F: tools/testing/selftests/drivers/ntsync/ ++ + NUBUS SUBSYSTEM + M: Finn Thain + L: linux-m68k@lists.linux-m68k.org +diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig +index 09cbe3f0ab1e..fb772bfe27c3 100644 +--- a/drivers/misc/Kconfig ++++ b/drivers/misc/Kconfig +@@ -517,7 +517,6 @@ config OPEN_DICE + + config NTSYNC + tristate "NT synchronization primitive emulation" +- depends on BROKEN + help + This module provides kernel support for emulation of Windows NT + synchronization primitives. It is not a hardware driver. +diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c +index 4954553b7baa..586b86243e1d 100644 +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -6,11 +6,17 @@ + */ + + #include ++#include + #include + #include ++#include ++#include + #include + #include ++#include + #include ++#include ++#include + #include + #include + #include +@@ -19,6 +25,8 @@ + + enum ntsync_type { + NTSYNC_TYPE_SEM, ++ NTSYNC_TYPE_MUTEX, ++ NTSYNC_TYPE_EVENT, + }; + + /* +@@ -30,10 +38,13 @@ enum ntsync_type { + * + * Both rely on struct file for reference counting. Individual + * ntsync_obj objects take a reference to the device when created. ++ * Wait operations take a reference to each object being waited on for ++ * the duration of the wait. + */ + + struct ntsync_obj { + spinlock_t lock; ++ int dev_locked; + + enum ntsync_type type; + +@@ -46,22 +57,344 @@ struct ntsync_obj { + __u32 count; + __u32 max; + } sem; ++ struct { ++ __u32 count; ++ pid_t owner; ++ bool ownerdead; ++ } mutex; ++ struct { ++ bool manual; ++ bool signaled; ++ } event; + } u; ++ ++ /* ++ * any_waiters is protected by the object lock, but all_waiters is ++ * protected by the device wait_all_lock. ++ */ ++ struct list_head any_waiters; ++ struct list_head all_waiters; ++ ++ /* ++ * Hint describing how many tasks are queued on this object in a ++ * wait-all operation. ++ * ++ * Any time we do a wake, we may need to wake "all" waiters as well as ++ * "any" waiters. In order to atomically wake "all" waiters, we must ++ * lock all of the objects, and that means grabbing the wait_all_lock ++ * below (and, due to lock ordering rules, before locking this object). ++ * However, wait-all is a rare operation, and grabbing the wait-all ++ * lock for every wake would create unnecessary contention. ++ * Therefore we first check whether all_hint is zero, and, if it is, ++ * we skip trying to wake "all" waiters. ++ * ++ * Since wait requests must originate from user-space threads, we're ++ * limited here by PID_MAX_LIMIT, so there's no risk of overflow. ++ */ ++ atomic_t all_hint; ++}; ++ ++struct ntsync_q_entry { ++ struct list_head node; ++ struct ntsync_q *q; ++ struct ntsync_obj *obj; ++ __u32 index; ++}; ++ ++struct ntsync_q { ++ struct task_struct *task; ++ __u32 owner; ++ ++ /* ++ * Protected via atomic_try_cmpxchg(). Only the thread that wins the ++ * compare-and-swap may actually change object states and wake this ++ * task. 
++ */ ++ atomic_t signaled; ++ ++ bool all; ++ bool ownerdead; ++ __u32 count; ++ struct ntsync_q_entry entries[]; + }; + + struct ntsync_device { ++ /* ++ * Wait-all operations must atomically grab all objects, and be totally ++ * ordered with respect to each other and wait-any operations. ++ * If one thread is trying to acquire several objects, another thread ++ * cannot touch the object at the same time. ++ * ++ * This device-wide lock is used to serialize wait-for-all ++ * operations, and operations on an object that is involved in a ++ * wait-for-all. ++ */ ++ struct mutex wait_all_lock; ++ + struct file *file; + }; + ++/* ++ * Single objects are locked using obj->lock. ++ * ++ * Multiple objects are 'locked' while holding dev->wait_all_lock. ++ * In this case however, individual objects are not locked by holding ++ * obj->lock, but by setting obj->dev_locked. ++ * ++ * This means that in order to lock a single object, the sequence is slightly ++ * more complicated than usual. Specifically it needs to check obj->dev_locked ++ * after acquiring obj->lock, if set, it needs to drop the lock and acquire ++ * dev->wait_all_lock in order to serialize against the multi-object operation. ++ */ ++ ++static void dev_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev == dev); ++ spin_lock(&obj->lock); ++ /* ++ * By setting obj->dev_locked inside obj->lock, it is ensured that ++ * anyone holding obj->lock must see the value. ++ */ ++ obj->dev_locked = 1; ++ spin_unlock(&obj->lock); ++} ++ ++static void dev_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev == dev); ++ spin_lock(&obj->lock); ++ obj->dev_locked = 0; ++ spin_unlock(&obj->lock); ++} ++ ++static void obj_lock(struct ntsync_obj *obj) ++{ ++ struct ntsync_device *dev = obj->dev; ++ ++ for (;;) { ++ spin_lock(&obj->lock); ++ if (likely(!obj->dev_locked)) ++ break; ++ ++ spin_unlock(&obj->lock); ++ mutex_lock(&dev->wait_all_lock); ++ spin_lock(&obj->lock); ++ /* ++ * obj->dev_locked should be set and released under the same ++ * wait_all_lock section, since we now own this lock, it should ++ * be clear. 
++ */ ++ lockdep_assert(!obj->dev_locked); ++ spin_unlock(&obj->lock); ++ mutex_unlock(&dev->wait_all_lock); ++ } ++} ++ ++static void obj_unlock(struct ntsync_obj *obj) ++{ ++ spin_unlock(&obj->lock); ++} ++ ++static bool ntsync_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ bool all; ++ ++ obj_lock(obj); ++ all = atomic_read(&obj->all_hint); ++ if (unlikely(all)) { ++ obj_unlock(obj); ++ mutex_lock(&dev->wait_all_lock); ++ dev_lock_obj(dev, obj); ++ } ++ ++ return all; ++} ++ ++static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, bool all) ++{ ++ if (all) { ++ dev_unlock_obj(dev, obj); ++ mutex_unlock(&dev->wait_all_lock); ++ } else { ++ obj_unlock(obj); ++ } ++} ++ ++#define ntsync_assert_held(obj) \ ++ lockdep_assert((lockdep_is_held(&(obj)->lock) != LOCK_STATE_NOT_HELD) || \ ++ ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ ++ (obj)->dev_locked)) ++ ++static bool is_signaled(struct ntsync_obj *obj, __u32 owner) ++{ ++ ntsync_assert_held(obj); ++ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ return !!obj->u.sem.count; ++ case NTSYNC_TYPE_MUTEX: ++ if (obj->u.mutex.owner && obj->u.mutex.owner != owner) ++ return false; ++ return obj->u.mutex.count < UINT_MAX; ++ case NTSYNC_TYPE_EVENT: ++ return obj->u.event.signaled; ++ } ++ ++ WARN(1, "bad object type %#x\n", obj->type); ++ return false; ++} ++ ++/* ++ * "locked_obj" is an optional pointer to an object which is already locked and ++ * should not be locked again. This is necessary so that changing an object's ++ * state and waking it can be a single atomic operation. ++ */ ++static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, ++ struct ntsync_obj *locked_obj) ++{ ++ __u32 count = q->count; ++ bool can_wake = true; ++ int signaled = -1; ++ __u32 i; ++ ++ lockdep_assert_held(&dev->wait_all_lock); ++ if (locked_obj) ++ lockdep_assert(locked_obj->dev_locked); ++ ++ for (i = 0; i < count; i++) { ++ if (q->entries[i].obj != locked_obj) ++ dev_lock_obj(dev, q->entries[i].obj); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (!is_signaled(q->entries[i].obj, q->owner)) { ++ can_wake = false; ++ break; ++ } ++ } ++ ++ if (can_wake && atomic_try_cmpxchg(&q->signaled, &signaled, 0)) { ++ for (i = 0; i < count; i++) { ++ struct ntsync_obj *obj = q->entries[i].obj; ++ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ obj->u.sem.count--; ++ break; ++ case NTSYNC_TYPE_MUTEX: ++ if (obj->u.mutex.ownerdead) ++ q->ownerdead = true; ++ obj->u.mutex.ownerdead = false; ++ obj->u.mutex.count++; ++ obj->u.mutex.owner = q->owner; ++ break; ++ case NTSYNC_TYPE_EVENT: ++ if (!obj->u.event.manual) ++ obj->u.event.signaled = false; ++ break; ++ } ++ } ++ wake_up_process(q->task); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (q->entries[i].obj != locked_obj) ++ dev_unlock_obj(dev, q->entries[i].obj); ++ } ++} ++ ++static void try_wake_all_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ struct ntsync_q_entry *entry; ++ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev_locked); ++ ++ list_for_each_entry(entry, &obj->all_waiters, node) ++ try_wake_all(dev, entry->q, obj); ++} ++ ++static void try_wake_any_sem(struct ntsync_obj *sem) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(sem); ++ lockdep_assert(sem->type == NTSYNC_TYPE_SEM); ++ ++ list_for_each_entry(entry, &sem->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (!sem->u.sem.count) ++ break; ++ ++ if 
(atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ sem->u.sem.count--; ++ wake_up_process(q->task); ++ } ++ } ++} ++ ++static void try_wake_any_mutex(struct ntsync_obj *mutex) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(mutex); ++ lockdep_assert(mutex->type == NTSYNC_TYPE_MUTEX); ++ ++ list_for_each_entry(entry, &mutex->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (mutex->u.mutex.count == UINT_MAX) ++ break; ++ if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner) ++ continue; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ if (mutex->u.mutex.ownerdead) ++ q->ownerdead = true; ++ mutex->u.mutex.ownerdead = false; ++ mutex->u.mutex.count++; ++ mutex->u.mutex.owner = q->owner; ++ wake_up_process(q->task); ++ } ++ } ++} ++ ++static void try_wake_any_event(struct ntsync_obj *event) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(event); ++ lockdep_assert(event->type == NTSYNC_TYPE_EVENT); ++ ++ list_for_each_entry(entry, &event->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (!event->u.event.signaled) ++ break; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ if (!event->u.event.manual) ++ event->u.event.signaled = false; ++ wake_up_process(q->task); ++ } ++ } ++} ++ + /* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. + */ +-static int post_sem_state(struct ntsync_obj *sem, __u32 count) ++static int release_sem_state(struct ntsync_obj *sem, __u32 count) + { + __u32 sum; + +- lockdep_assert_held(&sem->lock); ++ ntsync_assert_held(sem); + + if (check_add_overflow(sem->u.sem.count, count, &sum) || + sum > sem->u.sem.max) +@@ -71,11 +404,13 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) + return 0; + } + +-static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) ++static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) + { ++ struct ntsync_device *dev = sem->dev; + __u32 __user *user_args = argp; + __u32 prev_count; + __u32 args; ++ bool all; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) +@@ -84,12 +419,17 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) + if (sem->type != NTSYNC_TYPE_SEM) + return -EINVAL; + +- spin_lock(&sem->lock); ++ all = ntsync_lock_obj(dev, sem); + + prev_count = sem->u.sem.count; +- ret = post_sem_state(sem, args); ++ ret = release_sem_state(sem, args); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, sem); ++ try_wake_any_sem(sem); ++ } + +- spin_unlock(&sem->lock); ++ ntsync_unlock_obj(dev, sem, all); + + if (!ret && put_user(prev_count, user_args)) + ret = -EFAULT; +@@ -97,13 +437,229 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) + return ret; + } + +-static int ntsync_obj_release(struct inode *inode, struct file *file) ++/* ++ * Actually change the mutex state, returning -EPERM if not the owner. 
++ */ ++static int unlock_mutex_state(struct ntsync_obj *mutex, ++ const struct ntsync_mutex_args *args) + { +- struct ntsync_obj *obj = file->private_data; ++ ntsync_assert_held(mutex); ++ ++ if (mutex->u.mutex.owner != args->owner) ++ return -EPERM; ++ ++ if (!--mutex->u.mutex.count) ++ mutex->u.mutex.owner = 0; ++ return 0; ++} ++ ++static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_device *dev = mutex->dev; ++ struct ntsync_mutex_args args; ++ __u32 prev_count; ++ bool all; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ if (!args.owner) ++ return -EINVAL; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ prev_count = mutex->u.mutex.count; ++ ret = unlock_mutex_state(mutex, &args); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, mutex); ++ try_wake_any_mutex(mutex); ++ } ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ if (!ret && put_user(prev_count, &user_args->count)) ++ ret = -EFAULT; ++ ++ return ret; ++} ++ ++/* ++ * Actually change the mutex state to mark its owner as dead, ++ * returning -EPERM if not the owner. ++ */ ++static int kill_mutex_state(struct ntsync_obj *mutex, __u32 owner) ++{ ++ ntsync_assert_held(mutex); ++ ++ if (mutex->u.mutex.owner != owner) ++ return -EPERM; ++ ++ mutex->u.mutex.ownerdead = true; ++ mutex->u.mutex.owner = 0; ++ mutex->u.mutex.count = 0; ++ return 0; ++} ++ ++static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_device *dev = mutex->dev; ++ __u32 owner; ++ bool all; ++ int ret; ++ ++ if (get_user(owner, (__u32 __user *)argp)) ++ return -EFAULT; ++ if (!owner) ++ return -EINVAL; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ ret = kill_mutex_state(mutex, owner); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, mutex); ++ try_wake_any_mutex(mutex); ++ } ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ return ret; ++} ++ ++static int ntsync_event_set(struct ntsync_obj *event, void __user *argp, bool pulse) ++{ ++ struct ntsync_device *dev = event->dev; ++ __u32 prev_state; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ prev_state = event->u.event.signaled; ++ event->u.event.signaled = true; ++ if (all) ++ try_wake_all_obj(dev, event); ++ try_wake_any_event(event); ++ if (pulse) ++ event->u.event.signaled = false; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (put_user(prev_state, (__u32 __user *)argp)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_device *dev = event->dev; ++ __u32 prev_state; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ prev_state = event->u.event.signaled; ++ event->u.event.signaled = false; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (put_user(prev_state, (__u32 __user *)argp)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp) ++{ ++ struct ntsync_sem_args __user *user_args = argp; ++ struct ntsync_device *dev = sem->dev; ++ struct ntsync_sem_args args; ++ bool all; ++ ++ if (sem->type != NTSYNC_TYPE_SEM) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, sem); ++ ++ args.count = sem->u.sem.count; ++ 
args.max = sem->u.sem.max; ++ ++ ntsync_unlock_obj(dev, sem, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return 0; ++} + ++static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_device *dev = mutex->dev; ++ struct ntsync_mutex_args args; ++ bool all; ++ int ret; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ args.count = mutex->u.mutex.count; ++ args.owner = mutex->u.mutex.owner; ++ ret = mutex->u.mutex.ownerdead ? -EOWNERDEAD : 0; ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return ret; ++} ++ ++static int ntsync_event_read(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_event_args __user *user_args = argp; ++ struct ntsync_device *dev = event->dev; ++ struct ntsync_event_args args; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ args.manual = event->u.event.manual; ++ args.signaled = event->u.event.signaled; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return 0; ++} ++ ++static void ntsync_free_obj(struct ntsync_obj *obj) ++{ + fput(obj->dev->file); + kfree(obj); ++} + ++static int ntsync_obj_release(struct inode *inode, struct file *file) ++{ ++ ntsync_free_obj(file->private_data); + return 0; + } + +@@ -114,8 +670,24 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, + void __user *argp = (void __user *)parm; + + switch (cmd) { +- case NTSYNC_IOC_SEM_POST: +- return ntsync_sem_post(obj, argp); ++ case NTSYNC_IOC_SEM_RELEASE: ++ return ntsync_sem_release(obj, argp); ++ case NTSYNC_IOC_SEM_READ: ++ return ntsync_sem_read(obj, argp); ++ case NTSYNC_IOC_MUTEX_UNLOCK: ++ return ntsync_mutex_unlock(obj, argp); ++ case NTSYNC_IOC_MUTEX_KILL: ++ return ntsync_mutex_kill(obj, argp); ++ case NTSYNC_IOC_MUTEX_READ: ++ return ntsync_mutex_read(obj, argp); ++ case NTSYNC_IOC_EVENT_SET: ++ return ntsync_event_set(obj, argp, false); ++ case NTSYNC_IOC_EVENT_RESET: ++ return ntsync_event_reset(obj, argp); ++ case NTSYNC_IOC_EVENT_PULSE: ++ return ntsync_event_set(obj, argp, true); ++ case NTSYNC_IOC_EVENT_READ: ++ return ntsync_event_read(obj, argp); + default: + return -ENOIOCTLCMD; + } +@@ -140,6 +712,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, + obj->dev = dev; + get_file(dev->file); + spin_lock_init(&obj->lock); ++ INIT_LIST_HEAD(&obj->any_waiters); ++ INIT_LIST_HEAD(&obj->all_waiters); ++ atomic_set(&obj->all_hint, 0); + + return obj; + } +@@ -165,7 +740,6 @@ static int ntsync_obj_get_fd(struct ntsync_obj *obj) + + static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) + { +- struct ntsync_sem_args __user *user_args = argp; + struct ntsync_sem_args args; + struct ntsync_obj *sem; + int fd; +@@ -182,12 +756,398 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) + sem->u.sem.count = args.count; + sem->u.sem.max = args.max; + fd = ntsync_obj_get_fd(sem); +- if (fd < 0) { +- kfree(sem); +- return fd; ++ if (fd < 0) ++ ntsync_free_obj(sem); ++ ++ return fd; ++} ++ ++static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_mutex_args args; ++ struct ntsync_obj *mutex; ++ int fd; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ 
return -EFAULT; ++ ++ if (!args.owner != !args.count) ++ return -EINVAL; ++ ++ mutex = ntsync_alloc_obj(dev, NTSYNC_TYPE_MUTEX); ++ if (!mutex) ++ return -ENOMEM; ++ mutex->u.mutex.count = args.count; ++ mutex->u.mutex.owner = args.owner; ++ fd = ntsync_obj_get_fd(mutex); ++ if (fd < 0) ++ ntsync_free_obj(mutex); ++ ++ return fd; ++} ++ ++static int ntsync_create_event(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_event_args args; ++ struct ntsync_obj *event; ++ int fd; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ event = ntsync_alloc_obj(dev, NTSYNC_TYPE_EVENT); ++ if (!event) ++ return -ENOMEM; ++ event->u.event.manual = args.manual; ++ event->u.event.signaled = args.signaled; ++ fd = ntsync_obj_get_fd(event); ++ if (fd < 0) ++ ntsync_free_obj(event); ++ ++ return fd; ++} ++ ++static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) ++{ ++ struct file *file = fget(fd); ++ struct ntsync_obj *obj; ++ ++ if (!file) ++ return NULL; ++ ++ if (file->f_op != &ntsync_obj_fops) { ++ fput(file); ++ return NULL; ++ } ++ ++ obj = file->private_data; ++ if (obj->dev != dev) { ++ fput(file); ++ return NULL; + } + +- return put_user(fd, &user_args->sem); ++ return obj; ++} ++ ++static void put_obj(struct ntsync_obj *obj) ++{ ++ fput(obj->file); ++} ++ ++static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args) ++{ ++ ktime_t timeout = ns_to_ktime(args->timeout); ++ clockid_t clock = CLOCK_MONOTONIC; ++ ktime_t *timeout_ptr; ++ int ret = 0; ++ ++ timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout); ++ ++ if (args->flags & NTSYNC_WAIT_REALTIME) ++ clock = CLOCK_REALTIME; ++ ++ do { ++ if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (atomic_read(&q->signaled) != -1) { ++ ret = 0; ++ break; ++ } ++ ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock); ++ } while (ret < 0); ++ __set_current_state(TASK_RUNNING); ++ ++ return ret; ++} ++ ++/* ++ * Allocate and initialize the ntsync_q structure, but do not queue us yet. ++ */ ++static int setup_wait(struct ntsync_device *dev, ++ const struct ntsync_wait_args *args, bool all, ++ struct ntsync_q **ret_q) ++{ ++ int fds[NTSYNC_MAX_WAIT_COUNT + 1]; ++ const __u32 count = args->count; ++ struct ntsync_q *q; ++ __u32 total_count; ++ __u32 i, j; ++ ++ if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME)) ++ return -EINVAL; ++ ++ if (args->count > NTSYNC_MAX_WAIT_COUNT) ++ return -EINVAL; ++ ++ total_count = count; ++ if (args->alert) ++ total_count++; ++ ++ if (copy_from_user(fds, u64_to_user_ptr(args->objs), ++ array_size(count, sizeof(*fds)))) ++ return -EFAULT; ++ if (args->alert) ++ fds[count] = args->alert; ++ ++ q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL); ++ if (!q) ++ return -ENOMEM; ++ q->task = current; ++ q->owner = args->owner; ++ atomic_set(&q->signaled, -1); ++ q->all = all; ++ q->ownerdead = false; ++ q->count = count; ++ ++ for (i = 0; i < total_count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = get_obj(dev, fds[i]); ++ ++ if (!obj) ++ goto err; ++ ++ if (all) { ++ /* Check that the objects are all distinct. 
*/ ++ for (j = 0; j < i; j++) { ++ if (obj == q->entries[j].obj) { ++ put_obj(obj); ++ goto err; ++ } ++ } ++ } ++ ++ entry->obj = obj; ++ entry->q = q; ++ entry->index = i; ++ } ++ ++ *ret_q = q; ++ return 0; ++ ++err: ++ for (j = 0; j < i; j++) ++ put_obj(q->entries[j].obj); ++ kfree(q); ++ return -EINVAL; ++} ++ ++static void try_wake_any_obj(struct ntsync_obj *obj) ++{ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ try_wake_any_sem(obj); ++ break; ++ case NTSYNC_TYPE_MUTEX: ++ try_wake_any_mutex(obj); ++ break; ++ case NTSYNC_TYPE_EVENT: ++ try_wake_any_event(obj); ++ break; ++ } ++} ++ ++static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_wait_args args; ++ __u32 i, total_count; ++ struct ntsync_q *q; ++ int signaled; ++ bool all; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ ret = setup_wait(dev, &args, false, &q); ++ if (ret < 0) ++ return ret; ++ ++ total_count = args.count; ++ if (args.alert) ++ total_count++; ++ ++ /* queue ourselves */ ++ ++ for (i = 0; i < total_count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ all = ntsync_lock_obj(dev, obj); ++ list_add_tail(&entry->node, &obj->any_waiters); ++ ntsync_unlock_obj(dev, obj, all); ++ } ++ ++ /* ++ * Check if we are already signaled. ++ * ++ * Note that the API requires that normal objects are checked before ++ * the alert event. Hence we queue the alert event last, and check ++ * objects in order. ++ */ ++ ++ for (i = 0; i < total_count; i++) { ++ struct ntsync_obj *obj = q->entries[i].obj; ++ ++ if (atomic_read(&q->signaled) != -1) ++ break; ++ ++ all = ntsync_lock_obj(dev, obj); ++ try_wake_any_obj(obj); ++ ntsync_unlock_obj(dev, obj, all); ++ } ++ ++ /* sleep */ ++ ++ ret = ntsync_schedule(q, &args); ++ ++ /* and finally, unqueue */ ++ ++ for (i = 0; i < total_count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ all = ntsync_lock_obj(dev, obj); ++ list_del(&entry->node); ++ ntsync_unlock_obj(dev, obj, all); ++ ++ put_obj(obj); ++ } ++ ++ signaled = atomic_read(&q->signaled); ++ if (signaled != -1) { ++ struct ntsync_wait_args __user *user_args = argp; ++ ++ /* even if we caught a signal, we need to communicate success */ ++ ret = q->ownerdead ? -EOWNERDEAD : 0; ++ ++ if (put_user(signaled, &user_args->index)) ++ ret = -EFAULT; ++ } else if (!ret) { ++ ret = -ETIMEDOUT; ++ } ++ ++ kfree(q); ++ return ret; ++} ++ ++static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_wait_args args; ++ struct ntsync_q *q; ++ int signaled; ++ __u32 i; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ ret = setup_wait(dev, &args, true, &q); ++ if (ret < 0) ++ return ret; ++ ++ /* queue ourselves */ ++ ++ mutex_lock(&dev->wait_all_lock); ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ atomic_inc(&obj->all_hint); ++ ++ /* ++ * obj->all_waiters is protected by dev->wait_all_lock rather ++ * than obj->lock, so there is no need to acquire obj->lock ++ * here. 
++ */ ++ list_add_tail(&entry->node, &obj->all_waiters); ++ } ++ if (args.alert) { ++ struct ntsync_q_entry *entry = &q->entries[args.count]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ dev_lock_obj(dev, obj); ++ list_add_tail(&entry->node, &obj->any_waiters); ++ dev_unlock_obj(dev, obj); ++ } ++ ++ /* check if we are already signaled */ ++ ++ try_wake_all(dev, q, NULL); ++ ++ mutex_unlock(&dev->wait_all_lock); ++ ++ /* ++ * Check if the alert event is signaled, making sure to do so only ++ * after checking if the other objects are signaled. ++ */ ++ ++ if (args.alert) { ++ struct ntsync_obj *obj = q->entries[args.count].obj; ++ ++ if (atomic_read(&q->signaled) == -1) { ++ bool all = ntsync_lock_obj(dev, obj); ++ try_wake_any_obj(obj); ++ ntsync_unlock_obj(dev, obj, all); ++ } ++ } ++ ++ /* sleep */ ++ ++ ret = ntsync_schedule(q, &args); ++ ++ /* and finally, unqueue */ ++ ++ mutex_lock(&dev->wait_all_lock); ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ /* ++ * obj->all_waiters is protected by dev->wait_all_lock rather ++ * than obj->lock, so there is no need to acquire it here. ++ */ ++ list_del(&entry->node); ++ ++ atomic_dec(&obj->all_hint); ++ ++ put_obj(obj); ++ } ++ ++ mutex_unlock(&dev->wait_all_lock); ++ ++ if (args.alert) { ++ struct ntsync_q_entry *entry = &q->entries[args.count]; ++ struct ntsync_obj *obj = entry->obj; ++ bool all; ++ ++ all = ntsync_lock_obj(dev, obj); ++ list_del(&entry->node); ++ ntsync_unlock_obj(dev, obj, all); ++ ++ put_obj(obj); ++ } ++ ++ signaled = atomic_read(&q->signaled); ++ if (signaled != -1) { ++ struct ntsync_wait_args __user *user_args = argp; ++ ++ /* even if we caught a signal, we need to communicate success */ ++ ret = q->ownerdead ? 
-EOWNERDEAD : 0; ++ ++ if (put_user(signaled, &user_args->index)) ++ ret = -EFAULT; ++ } else if (!ret) { ++ ret = -ETIMEDOUT; ++ } ++ ++ kfree(q); ++ return ret; + } + + static int ntsync_char_open(struct inode *inode, struct file *file) +@@ -198,6 +1158,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) + if (!dev) + return -ENOMEM; + ++ mutex_init(&dev->wait_all_lock); ++ + file->private_data = dev; + dev->file = file; + return nonseekable_open(inode, file); +@@ -219,8 +1181,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, + void __user *argp = (void __user *)parm; + + switch (cmd) { ++ case NTSYNC_IOC_CREATE_EVENT: ++ return ntsync_create_event(dev, argp); ++ case NTSYNC_IOC_CREATE_MUTEX: ++ return ntsync_create_mutex(dev, argp); + case NTSYNC_IOC_CREATE_SEM: + return ntsync_create_sem(dev, argp); ++ case NTSYNC_IOC_WAIT_ALL: ++ return ntsync_wait_all(dev, argp); ++ case NTSYNC_IOC_WAIT_ANY: ++ return ntsync_wait_any(dev, argp); + default: + return -ENOIOCTLCMD; + } +@@ -238,6 +1208,7 @@ static struct miscdevice ntsync_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = NTSYNC_NAME, + .fops = &ntsync_fops, ++ .mode = 0666, // Setting file permissions to 0666 + }; + + module_misc_device(ntsync_misc); +diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h +index dcfa38fdc93c..6d06793512b1 100644 +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -11,13 +11,49 @@ + #include + + struct ntsync_sem_args { +- __u32 sem; + __u32 count; + __u32 max; + }; + +-#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) ++struct ntsync_mutex_args { ++ __u32 owner; ++ __u32 count; ++}; ++ ++struct ntsync_event_args { ++ __u32 manual; ++ __u32 signaled; ++}; ++ ++#define NTSYNC_WAIT_REALTIME 0x1 ++ ++struct ntsync_wait_args { ++ __u64 timeout; ++ __u64 objs; ++ __u32 count; ++ __u32 index; ++ __u32 flags; ++ __u32 owner; ++ __u32 alert; ++ __u32 pad; ++}; ++ ++#define NTSYNC_MAX_WAIT_COUNT 64 ++ ++#define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) ++#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) ++#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) ++#define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) ++#define NTSYNC_IOC_CREATE_EVENT _IOW ('N', 0x87, struct ntsync_event_args) + +-#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) ++#define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) ++#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) ++#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) ++#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) ++#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) ++#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) ++#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) ++#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) ++#define NTSYNC_IOC_EVENT_READ _IOR ('N', 0x8d, struct ntsync_event_args) + + #endif +diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile +index 2401e973c359..a8c9648e5adc 100644 +--- a/tools/testing/selftests/Makefile ++++ b/tools/testing/selftests/Makefile +@@ -18,6 +18,7 @@ TARGETS += devices/error_logs + TARGETS += devices/probe + TARGETS += dmabuf-heaps + TARGETS += drivers/dma-buf ++TARGETS += drivers/ntsync + TARGETS += drivers/s390x/uvdevice + TARGETS += drivers/net + TARGETS += drivers/net/bonding +diff --git a/tools/testing/selftests/drivers/ntsync/.gitignore 
b/tools/testing/selftests/drivers/ntsync/.gitignore +new file mode 100644 +index 000000000000..848573a3d3ea +--- /dev/null ++++ b/tools/testing/selftests/drivers/ntsync/.gitignore +@@ -0,0 +1 @@ ++ntsync +diff --git a/tools/testing/selftests/drivers/ntsync/Makefile b/tools/testing/selftests/drivers/ntsync/Makefile +new file mode 100644 +index 000000000000..dbf2b055c0b2 +--- /dev/null ++++ b/tools/testing/selftests/drivers/ntsync/Makefile +@@ -0,0 +1,7 @@ ++# SPDX-LICENSE-IDENTIFIER: GPL-2.0-only ++TEST_GEN_PROGS := ntsync ++ ++CFLAGS += $(KHDR_INCLUDES) ++LDLIBS += -lpthread ++ ++include ../../lib.mk +diff --git a/tools/testing/selftests/drivers/ntsync/config b/tools/testing/selftests/drivers/ntsync/config +new file mode 100644 +index 000000000000..60539c826d06 +--- /dev/null ++++ b/tools/testing/selftests/drivers/ntsync/config +@@ -0,0 +1 @@ ++CONFIG_WINESYNC=y +diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c +new file mode 100644 +index 000000000000..3aad311574c4 +--- /dev/null ++++ b/tools/testing/selftests/drivers/ntsync/ntsync.c +@@ -0,0 +1,1343 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Various unit tests for the "ntsync" synchronization primitive driver. ++ * ++ * Copyright (C) 2021-2022 Elizabeth Figura ++ */ ++ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++#include ++#include ++#include "../../kselftest_harness.h" ++ ++static int read_sem_state(int sem, __u32 *count, __u32 *max) ++{ ++ struct ntsync_sem_args args; ++ int ret; ++ ++ memset(&args, 0xcc, sizeof(args)); ++ ret = ioctl(sem, NTSYNC_IOC_SEM_READ, &args); ++ *count = args.count; ++ *max = args.max; ++ return ret; ++} ++ ++#define check_sem_state(sem, count, max) \ ++ ({ \ ++ __u32 __count, __max; \ ++ int ret = read_sem_state((sem), &__count, &__max); \ ++ EXPECT_EQ(0, ret); \ ++ EXPECT_EQ((count), __count); \ ++ EXPECT_EQ((max), __max); \ ++ }) ++ ++static int release_sem(int sem, __u32 *count) ++{ ++ return ioctl(sem, NTSYNC_IOC_SEM_RELEASE, count); ++} ++ ++static int read_mutex_state(int mutex, __u32 *count, __u32 *owner) ++{ ++ struct ntsync_mutex_args args; ++ int ret; ++ ++ memset(&args, 0xcc, sizeof(args)); ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &args); ++ *count = args.count; ++ *owner = args.owner; ++ return ret; ++} ++ ++#define check_mutex_state(mutex, count, owner) \ ++ ({ \ ++ __u32 __count, __owner; \ ++ int ret = read_mutex_state((mutex), &__count, &__owner); \ ++ EXPECT_EQ(0, ret); \ ++ EXPECT_EQ((count), __count); \ ++ EXPECT_EQ((owner), __owner); \ ++ }) ++ ++static int unlock_mutex(int mutex, __u32 owner, __u32 *count) ++{ ++ struct ntsync_mutex_args args; ++ int ret; ++ ++ args.owner = owner; ++ args.count = 0xdeadbeef; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_UNLOCK, &args); ++ *count = args.count; ++ return ret; ++} ++ ++static int read_event_state(int event, __u32 *signaled, __u32 *manual) ++{ ++ struct ntsync_event_args args; ++ int ret; ++ ++ memset(&args, 0xcc, sizeof(args)); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_READ, &args); ++ *signaled = args.signaled; ++ *manual = args.manual; ++ return ret; ++} ++ ++#define check_event_state(event, signaled, manual) \ ++ ({ \ ++ __u32 __signaled, __manual; \ ++ int ret = read_event_state((event), &__signaled, &__manual); \ ++ EXPECT_EQ(0, ret); \ ++ EXPECT_EQ((signaled), __signaled); \ ++ EXPECT_EQ((manual), __manual); \ ++ }) ++ ++static int wait_objs(int fd, unsigned long request, __u32 count, ++ const int *objs, __u32 owner, int alert, __u32 
*index) ++{ ++ struct ntsync_wait_args args = {0}; ++ struct timespec timeout; ++ int ret; ++ ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ ++ args.timeout = timeout.tv_sec * 1000000000 + timeout.tv_nsec; ++ args.count = count; ++ args.objs = (uintptr_t)objs; ++ args.owner = owner; ++ args.index = 0xdeadbeef; ++ args.alert = alert; ++ ret = ioctl(fd, request, &args); ++ *index = args.index; ++ return ret; ++} ++ ++static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) ++{ ++ return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, count, objs, owner, 0, index); ++} ++ ++static int wait_all(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) ++{ ++ return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, count, objs, owner, 0, index); ++} ++ ++static int wait_any_alert(int fd, __u32 count, const int *objs, ++ __u32 owner, int alert, __u32 *index) ++{ ++ return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, ++ count, objs, owner, alert, index); ++} ++ ++static int wait_all_alert(int fd, __u32 count, const int *objs, ++ __u32 owner, int alert, __u32 *index) ++{ ++ return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, ++ count, objs, owner, alert, index); ++} ++ ++TEST(semaphore_state) ++{ ++ struct ntsync_sem_args sem_args; ++ struct timespec timeout; ++ __u32 count, index; ++ int fd, ret, sem; ++ ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 3; ++ sem_args.max = 2; ++ sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_EQ(-1, sem); ++ EXPECT_EQ(EINVAL, errno); ++ ++ sem_args.count = 2; ++ sem_args.max = 2; ++ sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, sem); ++ check_sem_state(sem, 2, 2); ++ ++ count = 0; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, count); ++ check_sem_state(sem, 2, 2); ++ ++ count = 1; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOVERFLOW, errno); ++ check_sem_state(sem, 2, 2); ++ ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(sem, 1, 2); ++ ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(sem, 0, 2); ++ ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ count = 3; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOVERFLOW, errno); ++ check_sem_state(sem, 0, 2); ++ ++ count = 2; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ check_sem_state(sem, 2, 2); ++ ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(0, ret); ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(0, ret); ++ ++ count = 1; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ check_sem_state(sem, 1, 2); ++ ++ count = ~0u; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOVERFLOW, errno); ++ check_sem_state(sem, 1, 2); ++ ++ close(sem); ++ ++ close(fd); ++} ++ ++TEST(mutex_state) ++{ ++ struct ntsync_mutex_args mutex_args; ++ __u32 owner, count, index; ++ struct timespec timeout; ++ int fd, ret, mutex; ++ ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = 0; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_EQ(-1, mutex); ++ EXPECT_EQ(EINVAL, errno); ++ ++ mutex_args.owner = 0; ++ mutex_args.count 
= 2; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_EQ(-1, mutex); ++ EXPECT_EQ(EINVAL, errno); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = 2; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); ++ check_mutex_state(mutex, 2, 123); ++ ++ ret = unlock_mutex(mutex, 0, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ ret = unlock_mutex(mutex, 456, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EPERM, errno); ++ check_mutex_state(mutex, 2, 123); ++ ++ ret = unlock_mutex(mutex, 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, count); ++ check_mutex_state(mutex, 1, 123); ++ ++ ret = unlock_mutex(mutex, 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, count); ++ check_mutex_state(mutex, 0, 0); ++ ++ ret = unlock_mutex(mutex, 123, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EPERM, errno); ++ ++ ret = wait_any(fd, 1, &mutex, 456, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 1, 456); ++ ++ ret = wait_any(fd, 1, &mutex, 456, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 2, 456); ++ ++ ret = unlock_mutex(mutex, 456, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, count); ++ check_mutex_state(mutex, 1, 456); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ owner = 0; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ owner = 123; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EPERM, errno); ++ check_mutex_state(mutex, 1, 456); ++ ++ owner = 456; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(0, ret); ++ ++ memset(&mutex_args, 0xcc, sizeof(mutex_args)); ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, mutex_args.count); ++ EXPECT_EQ(0, mutex_args.owner); ++ ++ memset(&mutex_args, 0xcc, sizeof(mutex_args)); ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, mutex_args.count); ++ EXPECT_EQ(0, mutex_args.owner); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 1, 123); ++ ++ owner = 123; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(0, ret); ++ ++ memset(&mutex_args, 0xcc, sizeof(mutex_args)); ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, mutex_args.count); ++ EXPECT_EQ(0, mutex_args.owner); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 1, 123); ++ ++ close(mutex); ++ ++ mutex_args.owner = 0; ++ mutex_args.count = 0; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); ++ check_mutex_state(mutex, 0, 0); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 1, 123); ++ ++ close(mutex); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = ~0u; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); ++ check_mutex_state(mutex, ~0u, 123); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ close(mutex); ++ 
++ close(fd); ++} ++ ++TEST(manual_event_state) ++{ ++ struct ntsync_event_args event_args; ++ __u32 index, signaled; ++ int fd, event, ret; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ event_args.manual = 1; ++ event_args.signaled = 0; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ check_event_state(event, 0, 1); ++ ++ signaled = 0xdeadbeef; ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 1, 1); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 1, 1); ++ ++ ret = wait_any(fd, 1, &event, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_event_state(event, 1, 1); ++ ++ signaled = 0xdeadbeef; ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 0, 1); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 0, 1); ++ ++ ret = wait_any(fd, 1, &event, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 0, 1); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 0, 1); ++ ++ close(event); ++ ++ close(fd); ++} ++ ++TEST(auto_event_state) ++{ ++ struct ntsync_event_args event_args; ++ __u32 index, signaled; ++ int fd, event, ret; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ event_args.manual = 0; ++ event_args.signaled = 1; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ check_event_state(event, 1, 0); ++ ++ signaled = 0xdeadbeef; ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 1, 0); ++ ++ ret = wait_any(fd, 1, &event, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_event_state(event, 0, 0); ++ ++ signaled = 0xdeadbeef; ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 0, 0); ++ ++ ret = wait_any(fd, 1, &event, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 0, 0); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 0, 0); ++ ++ close(event); ++ ++ close(fd); ++} ++ ++TEST(test_wait_any) ++{ ++ int objs[NTSYNC_MAX_WAIT_COUNT + 1], fd, ret; ++ struct ntsync_mutex_args mutex_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ __u32 owner, index, count, i; ++ struct timespec timeout; ++ ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 2; ++ sem_args.max = 3; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ 
EXPECT_LE(0, objs[0]); ++ ++ mutex_args.owner = 0; ++ mutex_args.count = 0; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 0, 0); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 0, 0); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ count = 1; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); ++ ++ ret = wait_any(fd, 2, objs, 456, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ owner = 123; ++ ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_any(fd, 2, objs, 456, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(1, index); ++ ++ ret = wait_any(fd, 2, objs, 456, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, index); ++ ++ close(objs[1]); ++ ++ /* test waiting on the same object twice */ ++ ++ count = 2; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ ++ objs[1] = objs[0]; ++ ret = wait_any(fd, 2, objs, 456, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 1, 3); ++ ++ ret = wait_any(fd, 0, NULL, 456, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ for (i = 1; i < NTSYNC_MAX_WAIT_COUNT + 1; ++i) ++ objs[i] = objs[0]; ++ ++ ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT + 1, objs, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ ret = wait_any(fd, -1, objs, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ close(objs[0]); ++ ++ close(fd); ++} ++ ++TEST(test_wait_all) ++{ ++ struct ntsync_event_args event_args = {0}; ++ struct ntsync_mutex_args mutex_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ __u32 owner, index, count; ++ int objs[2], fd, ret; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 2; ++ sem_args.max = 3; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ mutex_args.owner = 0; ++ mutex_args.count = 0; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ ret = wait_all(fd, 2, objs, 456, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(-1, 
ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); ++ ++ count = 3; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 2, 3); ++ check_mutex_state(objs[1], 3, 123); ++ ++ owner = 123; ++ ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ close(objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = true; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 0, 3); ++ check_event_state(objs[1], 1, 1); ++ ++ close(objs[1]); ++ ++ /* test waiting on the same object twice */ ++ objs[1] = objs[0]; ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ close(objs[0]); ++ ++ close(fd); ++} ++ ++struct wake_args { ++ int fd; ++ int obj; ++}; ++ ++struct wait_args { ++ int fd; ++ unsigned long request; ++ struct ntsync_wait_args *args; ++ int ret; ++ int err; ++}; ++ ++static void *wait_thread(void *arg) ++{ ++ struct wait_args *args = arg; ++ ++ args->ret = ioctl(args->fd, args->request, args->args); ++ args->err = errno; ++ return NULL; ++} ++ ++static __u64 get_abs_timeout(unsigned int ms) ++{ ++ struct timespec timeout; ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ return (timeout.tv_sec * 1000000000) + timeout.tv_nsec + (ms * 1000000); ++} ++ ++static int wait_for_thread(pthread_t thread, unsigned int ms) ++{ ++ struct timespec timeout; ++ ++ clock_gettime(CLOCK_REALTIME, &timeout); ++ timeout.tv_nsec += ms * 1000000; ++ timeout.tv_sec += (timeout.tv_nsec / 1000000000); ++ timeout.tv_nsec %= 1000000000; ++ return pthread_timedjoin_np(thread, NULL, &timeout); ++} ++ ++TEST(wake_any) ++{ ++ struct ntsync_event_args event_args = {0}; ++ struct ntsync_mutex_args mutex_args = {0}; ++ struct ntsync_wait_args wait_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ struct wait_args thread_args; ++ __u32 count, index, signaled; ++ int objs[2], fd, ret; ++ pthread_t thread; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 0; ++ sem_args.max = 3; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = 1; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); ++ ++ /* test waking the semaphore */ ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.objs = (uintptr_t)objs; ++ wait_args.count = 2; ++ wait_args.owner = 456; ++ wait_args.index = 0xdeadbeef; ++ thread_args.fd = fd; ++ thread_args.args = &wait_args; ++ thread_args.request = NTSYNC_IOC_WAIT_ANY; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ count = 1; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ check_sem_state(objs[0], 0, 3); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(0, wait_args.index); ++ ++ /* test waking the 
mutex */ ++ ++ /* first grab it again for owner 123 */ ++ ret = wait_any(fd, 1, &objs[1], 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.owner = 456; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = unlock_mutex(objs[1], 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, count); ++ ++ ret = pthread_tryjoin_np(thread, NULL); ++ EXPECT_EQ(EBUSY, ret); ++ ++ ret = unlock_mutex(objs[1], 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, mutex_args.count); ++ check_mutex_state(objs[1], 1, 456); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ close(objs[1]); ++ ++ /* test waking events */ ++ ++ event_args.manual = false; ++ event_args.signaled = false; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(objs[1], 0, 0); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(objs[1], 0, 0); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ close(objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = false; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(objs[1], 1, 1); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(objs[1], 0, 1); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ /* delete an object while it's being waited on */ ++ ++ wait_args.timeout = get_abs_timeout(200); ++ wait_args.owner = 123; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ 
EXPECT_EQ(ETIMEDOUT, ret); ++ ++ close(objs[0]); ++ close(objs[1]); ++ ++ ret = wait_for_thread(thread, 200); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(-1, thread_args.ret); ++ EXPECT_EQ(ETIMEDOUT, thread_args.err); ++ ++ close(fd); ++} ++ ++TEST(wake_all) ++{ ++ struct ntsync_event_args manual_event_args = {0}; ++ struct ntsync_event_args auto_event_args = {0}; ++ struct ntsync_mutex_args mutex_args = {0}; ++ struct ntsync_wait_args wait_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ struct wait_args thread_args; ++ __u32 count, index, signaled; ++ int objs[4], fd, ret; ++ pthread_t thread; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 0; ++ sem_args.max = 3; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = 1; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); ++ ++ manual_event_args.manual = true; ++ manual_event_args.signaled = true; ++ objs[2] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &manual_event_args); ++ EXPECT_LE(0, objs[2]); ++ ++ auto_event_args.manual = false; ++ auto_event_args.signaled = true; ++ objs[3] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &auto_event_args); ++ EXPECT_EQ(0, objs[3]); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.objs = (uintptr_t)objs; ++ wait_args.count = 4; ++ wait_args.owner = 456; ++ thread_args.fd = fd; ++ thread_args.args = &wait_args; ++ thread_args.request = NTSYNC_IOC_WAIT_ALL; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ count = 1; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ ++ ret = pthread_tryjoin_np(thread, NULL); ++ EXPECT_EQ(EBUSY, ret); ++ ++ check_sem_state(objs[0], 1, 3); ++ ++ ret = wait_any(fd, 1, &objs[0], 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = unlock_mutex(objs[1], 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, count); ++ ++ ret = pthread_tryjoin_np(thread, NULL); ++ EXPECT_EQ(EBUSY, ret); ++ ++ check_mutex_state(objs[1], 0, 0); ++ ++ ret = ioctl(objs[2], NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ ++ count = 2; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ check_sem_state(objs[0], 2, 3); ++ ++ ret = ioctl(objs[3], NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ ++ ret = ioctl(objs[2], NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ ++ ret = ioctl(objs[3], NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 456); ++ check_event_state(objs[2], 1, 1); ++ check_event_state(objs[3], 0, 0); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ ++ /* delete an object while it's being waited on */ ++ ++ wait_args.timeout = get_abs_timeout(200); ++ wait_args.owner = 123; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ close(objs[0]); ++ close(objs[1]); ++ close(objs[2]); ++ close(objs[3]); ++ ++ ret = wait_for_thread(thread, 200); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(-1, thread_args.ret); ++ EXPECT_EQ(ETIMEDOUT, thread_args.err); ++ ++ 
close(fd); ++} ++ ++TEST(alert_any) ++{ ++ struct ntsync_event_args event_args = {0}; ++ struct ntsync_wait_args wait_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ __u32 index, count, signaled; ++ struct wait_args thread_args; ++ int objs[2], event, fd, ret; ++ pthread_t thread; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 0; ++ sem_args.max = 2; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ sem_args.count = 1; ++ sem_args.max = 2; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = true; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ ret = wait_any_alert(fd, 0, NULL, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_any_alert(fd, 0, NULL, 123, event, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, index); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, index); ++ ++ /* test wakeup via alert */ ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.objs = (uintptr_t)objs; ++ wait_args.count = 2; ++ wait_args.owner = 123; ++ wait_args.index = 0xdeadbeef; ++ wait_args.alert = event; ++ thread_args.fd = fd; ++ thread_args.args = &wait_args; ++ thread_args.request = NTSYNC_IOC_WAIT_ANY; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(2, wait_args.index); ++ ++ close(event); ++ ++ /* test with an auto-reset event */ ++ ++ event_args.manual = false; ++ event_args.signaled = true; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ count = 1; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, index); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ close(event); ++ ++ close(objs[0]); ++ close(objs[1]); ++ ++ close(fd); ++} ++ ++TEST(alert_all) ++{ ++ struct ntsync_event_args event_args = {0}; ++ struct ntsync_wait_args wait_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ struct wait_args thread_args; ++ __u32 index, count, signaled; ++ int objs[2], event, fd, ret; ++ pthread_t thread; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 2; ++ sem_args.max = 2; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ sem_args.count = 1; ++ sem_args.max = 2; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = true; ++ event = 
ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, index); ++ ++ /* test wakeup via alert */ ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.objs = (uintptr_t)objs; ++ wait_args.count = 2; ++ wait_args.owner = 123; ++ wait_args.index = 0xdeadbeef; ++ wait_args.alert = event; ++ thread_args.fd = fd; ++ thread_args.args = &wait_args; ++ thread_args.request = NTSYNC_IOC_WAIT_ALL; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(2, wait_args.index); ++ ++ close(event); ++ ++ /* test with an auto-reset event */ ++ ++ event_args.manual = false; ++ event_args.signaled = true; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ count = 2; ++ ret = release_sem(objs[1], &count); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, index); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ close(event); ++ ++ close(objs[0]); ++ close(objs[1]); ++ ++ close(fd); ++} ++ ++#define STRESS_LOOPS 10000 ++#define STRESS_THREADS 4 ++ ++static unsigned int stress_counter; ++static int stress_device, stress_start_event, stress_mutex; ++ ++static void *stress_thread(void *arg) ++{ ++ struct ntsync_wait_args wait_args = {0}; ++ __u32 index, count, i; ++ int ret; ++ ++ wait_args.timeout = UINT64_MAX; ++ wait_args.count = 1; ++ wait_args.objs = (uintptr_t)&stress_start_event; ++ wait_args.owner = gettid(); ++ wait_args.index = 0xdeadbeef; ++ ++ ioctl(stress_device, NTSYNC_IOC_WAIT_ANY, &wait_args); ++ ++ wait_args.objs = (uintptr_t)&stress_mutex; ++ ++ for (i = 0; i < STRESS_LOOPS; ++i) { ++ ioctl(stress_device, NTSYNC_IOC_WAIT_ANY, &wait_args); ++ ++ ++stress_counter; ++ ++ unlock_mutex(stress_mutex, wait_args.owner, &count); ++ } ++ ++ return NULL; ++} ++ ++TEST(stress_wait) ++{ ++ struct ntsync_event_args event_args; ++ struct ntsync_mutex_args mutex_args; ++ pthread_t threads[STRESS_THREADS]; ++ __u32 signaled, i; ++ int ret; ++ ++ stress_device = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, stress_device); ++ ++ mutex_args.owner = 0; ++ mutex_args.count = 0; ++ stress_mutex = ioctl(stress_device, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, stress_mutex); ++ ++ event_args.manual = 1; ++ event_args.signaled = 0; ++ stress_start_event = ioctl(stress_device, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, stress_start_event); ++ ++ for (i = 0; i < STRESS_THREADS; ++i) ++ pthread_create(&threads[i], NULL, stress_thread, NULL); ++ ++ ret = ioctl(stress_start_event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ for (i = 0; i < STRESS_THREADS; ++i) { ++ ret = pthread_join(threads[i], NULL); ++ EXPECT_EQ(0, ret); ++ } ++ ++ EXPECT_EQ(STRESS_LOOPS * STRESS_THREADS, 
stress_counter); ++ ++ close(stress_start_event); ++ close(stress_mutex); ++ close(stress_device); ++} ++ ++TEST_HARNESS_MAIN +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0009-perf-per-core.patch b/sys-kernel/gentoo-sources-6.13/0009-perf-per-core.patch new file mode 100644 index 0000000..c3dc64f --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0009-perf-per-core.patch @@ -0,0 +1,898 @@ +From 7de62a7c4da5a2b267f3faacc8d50eb24fdfd89e Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:32:14 +0100 +Subject: [PATCH 09/12] perf-per-core + +Signed-off-by: Peter Jung +--- + Documentation/arch/x86/topology.rst | 4 + + arch/x86/events/rapl.c | 415 ++++++++++++++++---------- + arch/x86/include/asm/processor.h | 1 + + arch/x86/include/asm/topology.h | 1 + + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/topology_common.c | 1 + + 6 files changed, 273 insertions(+), 150 deletions(-) + +diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst +index 7352ab89a55a..c12837e61bda 100644 +--- a/Documentation/arch/x86/topology.rst ++++ b/Documentation/arch/x86/topology.rst +@@ -135,6 +135,10 @@ Thread-related topology information in the kernel: + The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo + "core_id." + ++ - topology_logical_core_id(); ++ ++ The logical core ID to which a thread belongs. ++ + + + System topology examples +diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c +index a8defc813c36..d3bb3865c1b1 100644 +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -39,6 +39,10 @@ + * event: rapl_energy_psys + * perf code: 0x5 + * ++ * core counter: consumption of a single physical core ++ * event: rapl_energy_core (power_core PMU) ++ * perf code: 0x1 ++ * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * +@@ -70,18 +74,22 @@ MODULE_LICENSE("GPL"); + /* + * RAPL energy status counters + */ +-enum perf_rapl_events { ++enum perf_rapl_pkg_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + +- PERF_RAPL_MAX, +- NR_RAPL_DOMAINS = PERF_RAPL_MAX, ++ PERF_RAPL_PKG_EVENTS_MAX, ++ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, + }; + +-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { ++#define PERF_RAPL_CORE 0 /* single core */ ++#define PERF_RAPL_CORE_EVENTS_MAX 1 ++#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX ++ ++static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", +@@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "psys", + }; + ++static const char *const rapl_core_domain_name __initconst = "core"; ++ + /* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved +@@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ + * considered as either pkg-scope or die-scope, and we are considering + * them as die-scope. 
+ */ +-#define rapl_pmu_is_pkg_scope() \ ++#define rapl_pkg_pmu_is_pkg_scope() \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + +@@ -129,7 +139,8 @@ struct rapl_pmu { + struct rapl_pmus { + struct pmu pmu; + unsigned int nr_rapl_pmu; +- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); ++ unsigned int cntr_mask; ++ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + + enum rapl_unit_quirk { +@@ -139,44 +150,43 @@ enum rapl_unit_quirk { + }; + + struct rapl_model { +- struct perf_msr *rapl_msrs; +- unsigned long events; ++ struct perf_msr *rapl_pkg_msrs; ++ struct perf_msr *rapl_core_msrs; ++ unsigned long pkg_events; ++ unsigned long core_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ +-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +-static struct rapl_pmus *rapl_pmus; +-static unsigned int rapl_cntr_mask; ++static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static int rapl_core_hw_unit __read_mostly; ++static struct rapl_pmus *rapl_pmus_pkg; ++static struct rapl_pmus *rapl_pmus_core; + static u64 rapl_timer_ms; +-static struct perf_msr *rapl_msrs; ++static struct rapl_model *rapl_model; + + /* +- * Helper functions to get the correct topology macros according to the ++ * Helper function to get the correct topology id according to the + * RAPL PMU scope. + */ +-static inline unsigned int get_rapl_pmu_idx(int cpu) +-{ +- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : +- topology_logical_die_id(cpu); +-} +- +-static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu) +-{ +- return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) : +- topology_die_cpumask(cpu); +-} +- +-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) ++static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) + { +- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- + /* +- * The unsigned check also catches the '-1' return value for non +- * existent mappings in the topology map. ++ * Returns unsigned int, which converts the '-1' return value ++ * (for non-existent mappings in topology map) to UINT_MAX, so ++ * the error check in the caller is simplified. + */ +- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; ++ switch (scope) { ++ case PERF_PMU_SCOPE_PKG: ++ return topology_logical_package_id(cpu); ++ case PERF_PMU_SCOPE_DIE: ++ return topology_logical_die_id(cpu); ++ case PERF_PMU_SCOPE_CORE: ++ return topology_logical_core_id(cpu); ++ default: ++ return -EINVAL; ++ } + } + + static inline u64 rapl_read_counter(struct perf_event *event) +@@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event) + return raw; + } + +-static inline u64 rapl_scale(u64 v, int cfg) ++static inline u64 rapl_scale(u64 v, struct perf_event *event) + { +- if (cfg > NR_RAPL_DOMAINS) { +- pr_warn("Invalid domain %d, failed to scale data\n", cfg); +- return v; +- } ++ int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; ++ ++ if (event->pmu->scope == PERF_PMU_SCOPE_CORE) ++ hw_unit = rapl_core_hw_unit; ++ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). 
+ * Watts = Joules/Time delta + */ +- return v << (32 - rapl_hw_unit[cfg - 1]); ++ return v << (32 - hw_unit); + } + + static u64 rapl_event_update(struct perf_event *event) +@@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event) + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + +- sdelta = rapl_scale(delta, event->hw.config); ++ sdelta = rapl_scale(delta, event); + + local64_add(sdelta, &event->count); + +@@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) + + static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) + { +- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); ++ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct perf_event *event; + unsigned long flags; + +- if (!pmu->n_active) ++ if (!rapl_pmu->n_active) + return HRTIMER_NORESTART; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + +- list_for_each_entry(event, &pmu->active_list, active_entry) ++ list_for_each_entry(event, &rapl_pmu->active_list, active_entry) + rapl_event_update(event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + +- hrtimer_forward_now(hrtimer, pmu->timer_interval); ++ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); + + return HRTIMER_RESTART; + } + +-static void rapl_hrtimer_init(struct rapl_pmu *pmu) ++static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) + { +- struct hrtimer *hr = &pmu->hrtimer; ++ struct hrtimer *hr = &rapl_pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; + } + +-static void __rapl_pmu_event_start(struct rapl_pmu *pmu, ++static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, + struct perf_event *event) + { + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +@@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + + event->hw.state = 0; + +- list_add_tail(&event->active_entry, &pmu->active_list); ++ list_add_tail(&event->active_entry, &rapl_pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +- pmu->n_active++; +- if (pmu->n_active == 1) +- rapl_start_hrtimer(pmu); ++ rapl_pmu->n_active++; ++ if (rapl_pmu->n_active == 1) ++ rapl_start_hrtimer(rapl_pmu); + } + + static void rapl_pmu_event_start(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); +- __rapl_pmu_event_start(pmu, event); +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); ++ __rapl_pmu_event_start(rapl_pmu, event); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static void rapl_pmu_event_stop(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { +- WARN_ON_ONCE(pmu->n_active <= 0); +- pmu->n_active--; +- if (pmu->n_active == 0) +- hrtimer_cancel(&pmu->hrtimer); ++ WARN_ON_ONCE(rapl_pmu->n_active <= 0); ++ rapl_pmu->n_active--; ++ if (rapl_pmu->n_active == 0) ++ 
hrtimer_cancel(&rapl_pmu->hrtimer); + + list_del(&event->active_entry); + +@@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) + hwc->state |= PERF_HES_UPTODATE; + } + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static int rapl_pmu_event_add(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) +- __rapl_pmu_event_start(pmu, event); ++ __rapl_pmu_event_start(rapl_pmu, event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + + return 0; + } +@@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) + static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; +- int bit, ret = 0; +- struct rapl_pmu *pmu; ++ int bit, rapl_pmus_scope, ret = 0; ++ struct rapl_pmu *rapl_pmu; ++ unsigned int rapl_pmu_idx; ++ struct rapl_pmus *rapl_pmus; + +- /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus->pmu.type) +- return -ENOENT; ++ /* unsupported modes and filters */ ++ if (event->attr.sample_period) /* no sampling */ ++ return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) +@@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event) + if (event->cpu < 0) + return -EINVAL; + +- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) ++ rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); ++ if (!rapl_pmus) ++ return -EINVAL; ++ rapl_pmus_scope = rapl_pmus->pmu.scope; ++ ++ if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { ++ /* only look at RAPL package events */ ++ if (event->attr.type != rapl_pmus_pkg->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; ++ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { ++ /* only look at RAPL core events */ ++ if (event->attr.type != rapl_pmus_core->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; ++ } else + return -EINVAL; +- +- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); +- bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_cntr_mask & (1 << bit))) ++ if (!(rapl_pmus->cntr_mask & (1 << bit))) + return -EINVAL; + +- /* unsupported modes and filters */ +- if (event->attr.sample_period) /* no sampling */ ++ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + return -EINVAL; +- + /* must be done before validate_group */ +- pmu = cpu_to_rapl_pmu(event->cpu); +- if (!pmu) ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ if (!rapl_pmu) + return -EINVAL; +- event->pmu_private = pmu; +- event->hw.event_base = rapl_msrs[bit].msr; ++ ++ event->pmu_private = rapl_pmu; + event->hw.config = cfg; + event->hw.idx = bit; + 
+@@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); ++RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01"); + + RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); ++RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules"); + + /* + * we compute in 0.23 nJ increments regardless of MSR +@@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 + RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); ++RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10"); + + /* + * There are no default events, but we need to create +@@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = { + NULL, + }; + ++static const struct attribute_group *rapl_core_attr_groups[] = { ++ &rapl_pmu_format_group, ++ &rapl_pmu_events_group, ++ NULL, ++}; ++ + static struct attribute *rapl_events_cores[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_cores_unit), +@@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = { + .attrs = rapl_events_psys, + }; + ++static struct attribute *rapl_events_core[] = { ++ EVENT_PTR(rapl_core), ++ EVENT_PTR(rapl_core_unit), ++ EVENT_PTR(rapl_core_scale), ++ NULL, ++}; ++ ++static struct attribute_group rapl_events_core_group = { ++ .name = "events", ++ .attrs = rapl_events_core, ++}; ++ + static bool test_msr(int idx, void *data) + { + return test_bit(idx, (unsigned long *) data); +@@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { + }; + + /* +- * Force to PERF_RAPL_MAX size due to: +- * - perf_msr_probe(PERF_RAPL_MAX) ++ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: ++ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) + * - want to use same event codes across both architectures + */ +-static struct perf_msr amd_rapl_msrs[] = { ++static struct perf_msr amd_rapl_pkg_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, +@@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = { + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_check_hw_unit(struct rapl_model *rm) ++static struct perf_msr amd_rapl_core_msrs[] = { ++ [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group, ++ test_msr, false, RAPL_MSR_MASK }, ++}; ++ ++static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ +- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) ++ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; +- for (i = 0; i < NR_RAPL_DOMAINS; i++) +- rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 
0x1FULL; ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) ++ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + +- switch (rm->unit_quirk) { ++ rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ ++ switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See +@@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + case RAPL_UNIT_QUIRK_INTEL_HSW: +- rapl_hw_unit[PERF_RAPL_RAM] = 16; ++ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; + break; + /* SPR uses a fixed energy unit for Psys domain. */ + case RAPL_UNIT_QUIRK_INTEL_SPR: +- rapl_hw_unit[PERF_RAPL_PSYS] = 0; ++ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; + break; + default: + break; + } + +- + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter +@@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; +- if (rapl_hw_unit[0] < 32) { ++ if (rapl_pkg_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); +- rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); ++ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); + } + return 0; + } +@@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + static void __init rapl_advertise(void) + { + int i; ++ int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); ++ ++ if (rapl_pmus_core) ++ num_counters += hweight32(rapl_pmus_core->cntr_mask); + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_cntr_mask), rapl_timer_ms); ++ num_counters, rapl_timer_ms); + +- for (i = 0; i < NR_RAPL_DOMAINS; i++) { +- if (rapl_cntr_mask & (1 << i)) { ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { ++ if (rapl_pmus_pkg->cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", +- rapl_domain_names[i], rapl_hw_unit[i]); ++ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } + } ++ ++ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE))) ++ pr_info("hw unit of domain %s 2^-%d Joules\n", ++ rapl_core_domain_name, rapl_core_hw_unit); + } + +-static void cleanup_rapl_pmus(void) ++static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) + { + int i; + + for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) +- kfree(rapl_pmus->pmus[i]); ++ kfree(rapl_pmus->rapl_pmu[i]); + kfree(rapl_pmus); + } + +@@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = { + NULL, + }; + +-static int __init init_rapl_pmu(void) ++static const struct attribute_group *rapl_core_attr_update[] = { ++ &rapl_events_core_group, ++ NULL, ++}; ++ ++static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) + { +- struct rapl_pmu *pmu; ++ struct rapl_pmu *rapl_pmu; + int idx; + + for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { +- pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); +- if (!pmu) ++ rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL); ++ if (!rapl_pmu) + goto free; + +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); ++ raw_spin_lock_init(&rapl_pmu->lock); ++ INIT_LIST_HEAD(&rapl_pmu->active_list); ++ rapl_pmu->pmu = &rapl_pmus->pmu; ++ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(rapl_pmu); + +- rapl_pmus->pmus[idx] = 
pmu; ++ rapl_pmus->rapl_pmu[idx] = rapl_pmu; + } + + return 0; + free: + for (; idx > 0; idx--) +- kfree(rapl_pmus->pmus[idx - 1]); ++ kfree(rapl_pmus->rapl_pmu[idx - 1]); + return -ENOMEM; + } + +-static int __init init_rapl_pmus(void) ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, ++ const struct attribute_group **rapl_attr_groups, ++ const struct attribute_group **rapl_attr_update) + { + int nr_rapl_pmu = topology_max_packages(); +- int rapl_pmu_scope = PERF_PMU_SCOPE_PKG; ++ struct rapl_pmus *rapl_pmus; + +- if (!rapl_pmu_is_pkg_scope()) { +- nr_rapl_pmu *= topology_max_dies_per_package(); +- rapl_pmu_scope = PERF_PMU_SCOPE_DIE; +- } ++ /* ++ * rapl_pmu_scope must be either PKG, DIE or CORE ++ */ ++ if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) ++ nr_rapl_pmu *= topology_max_dies_per_package(); ++ else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE) ++ nr_rapl_pmu *= topology_num_cores_per_package(); ++ else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG) ++ return -EINVAL; + +- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); ++ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + ++ *rapl_pmus_ptr = rapl_pmus; ++ + rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; +@@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + +- return init_rapl_pmu(); ++ return init_rapl_pmu(rapl_pmus); + } + + static struct rapl_model model_snb = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_snbep = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsw = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsx = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_knl = { +- .events = BIT(PERF_RAPL_PKG) | ++ .pkg_events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_skl = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_spr = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + 
BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_spr_msrs, ++ .rapl_pkg_msrs = intel_rapl_spr_msrs, + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .pkg_events = BIT(PERF_RAPL_PKG), ++ .core_events = BIT(PERF_RAPL_CORE), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, +- .rapl_msrs = amd_rapl_msrs, ++ .rapl_pkg_msrs = amd_rapl_pkg_msrs, ++ .rapl_core_msrs = amd_rapl_core_msrs, + }; + + static const struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- struct rapl_model *rm; ++ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; + int ret; + ++ if (rapl_pkg_pmu_is_pkg_scope()) ++ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; ++ + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; + +- rm = (struct rapl_model *) id->driver_data; +- +- rapl_msrs = rm->rapl_msrs; ++ rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rm->events); +- +- ret = rapl_check_hw_unit(rm); ++ ret = rapl_check_hw_unit(); + if (ret) + return ret; + +- ret = init_rapl_pmus(); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, ++ rapl_attr_update); + if (ret) + return ret; + +- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); ++ rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, ++ PERF_RAPL_PKG_EVENTS_MAX, false, ++ (void *) &rapl_model->pkg_events); ++ ++ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out; + ++ if (rapl_model->core_events) { ++ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, ++ rapl_core_attr_groups, ++ rapl_core_attr_update); ++ if (ret) { ++ pr_warn("power-core PMU initialization failed (%d)\n", ret); ++ goto core_init_failed; ++ } ++ ++ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, ++ PERF_RAPL_CORE_EVENTS_MAX, false, ++ (void *) &rapl_model->core_events); ++ ++ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1); ++ if (ret) { ++ pr_warn("power-core PMU registration failed (%d)\n", ret); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ } ++ ++core_init_failed: + rapl_advertise(); + return 0; + + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + return ret; + } + module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { +- perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(); ++ if (rapl_pmus_core) { ++ perf_pmu_unregister(&rapl_pmus_core->pmu); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ perf_pmu_unregister(&rapl_pmus_pkg->pmu); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + } + module_exit(intel_rapl_exit); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 20e6009381ed..c0cd10182e90 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -98,6 +98,7 @@ struct cpuinfo_topology { + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; ++ u32 logical_core_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index 63bab25a4896..ec134b719144 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ 
-143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); + #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) + #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) + #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) ++#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) + #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) + #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) + #define topology_ppin(cpu) (cpu_data(cpu).ppin) +diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c +index 10719aba6276..cacfd3f6abef 100644 +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); ++ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); +diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c +index 8277c64f88db..b5a5e1411469 100644 +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); ++ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); + } + + /* Package relative core ID */ +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0010-pksm.patch b/sys-kernel/gentoo-sources-6.13/0010-pksm.patch new file mode 100644 index 0000000..2ec1324 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0010-pksm.patch @@ -0,0 +1,433 @@ +From 9c28765934eafaff8d73a642512b2b6118aea976 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:32:25 +0100 +Subject: [PATCH 10/12] pksm + +Signed-off-by: Peter Jung +--- + arch/alpha/kernel/syscalls/syscall.tbl | 3 + + arch/arm/tools/syscall.tbl | 3 + + arch/m68k/kernel/syscalls/syscall.tbl | 3 + + arch/microblaze/kernel/syscalls/syscall.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + + arch/parisc/kernel/syscalls/syscall.tbl | 3 + + arch/powerpc/kernel/syscalls/syscall.tbl | 3 + + arch/s390/kernel/syscalls/syscall.tbl | 3 + + arch/sh/kernel/syscalls/syscall.tbl | 3 + + arch/sparc/kernel/syscalls/syscall.tbl | 3 + + arch/x86/entry/syscalls/syscall_32.tbl | 3 + + arch/x86/entry/syscalls/syscall_64.tbl | 3 + + arch/xtensa/kernel/syscalls/syscall.tbl | 3 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 9 +- + kernel/sys.c | 138 ++++++++++++++++++ + kernel/sys_ni.c | 3 + + scripts/syscall.tbl | 3 + + .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + + .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + + 22 files changed, 206 insertions(+), 1 deletion(-) + +diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl +index c59d53d6d3f3..121696f903e8 100644 +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -506,3 
+506,6 @@ + 574 common getxattrat sys_getxattrat + 575 common listxattrat sys_listxattrat + 576 common removexattrat sys_removexattrat ++577 common process_ksm_enable sys_process_ksm_enable ++578 common process_ksm_disable sys_process_ksm_disable ++579 common process_ksm_status sys_process_ksm_status +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index 49eeb2ad8dbd..1ce4d983b5b2 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -481,3 +481,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl +index f5ed71f1910d..17e865370d37 100644 +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -466,3 +466,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl +index 680f568b77f2..64740e895587 100644 +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -472,3 +472,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl +index 0b9b7e25b69a..bfafb91a2eda 100644 +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -405,3 +405,6 @@ + 464 n32 getxattrat sys_getxattrat + 465 n32 listxattrat sys_listxattrat + 466 n32 removexattrat sys_removexattrat ++467 n32 process_ksm_enable sys_process_ksm_enable ++468 n32 process_ksm_disable sys_process_ksm_disable ++469 n32 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl +index c844cd5cda62..39d446aeac64 100644 +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -381,3 +381,6 @@ + 464 n64 getxattrat sys_getxattrat + 465 n64 listxattrat sys_listxattrat + 466 n64 removexattrat sys_removexattrat ++467 n64 process_ksm_enable sys_process_ksm_enable ++468 n64 process_ksm_disable sys_process_ksm_disable ++469 n64 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl +index 349b8aad1159..61536c55715a 100644 +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -454,3 +454,6 @@ + 464 o32 getxattrat sys_getxattrat + 465 o32 listxattrat sys_listxattrat + 466 o32 removexattrat sys_removexattrat ++467 o32 process_ksm_enable sys_process_ksm_enable ++468 o32 process_ksm_disable sys_process_ksm_disable ++469 o32 process_ksm_status sys_process_ksm_status +diff --git a/arch/parisc/kernel/syscalls/syscall.tbl 
b/arch/parisc/kernel/syscalls/syscall.tbl +index d9fc94c86965..85dca5afcf06 100644 +--- a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl +index d8b4ab78bef0..57aa958c1b97 100644 +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -557,3 +557,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl +index e9115b4d8b63..2afc778f2d17 100644 +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -469,3 +469,6 @@ + 464 common getxattrat sys_getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl +index c8cad33bf250..dfe06a84d902 100644 +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -470,3 +470,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl +index 727f99d333b3..4c43b0d2d09f 100644 +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -512,3 +512,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 4d0fb2fba7e2..a63252b84261 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -472,3 +472,6 @@ + 464 i386 getxattrat sys_getxattrat + 465 i386 listxattrat sys_listxattrat + 466 i386 removexattrat sys_removexattrat ++467 i386 process_ksm_enable sys_process_ksm_enable ++468 i386 process_ksm_disable sys_process_ksm_disable ++469 i386 process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index 5eb708bff1c7..b5fe77405938 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -390,6 +390,9 @@ + 464 common 
getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl +index 37effc1b134e..5c944f0dcc20 100644 +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -437,3 +437,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index c6333204d451..00400d99eef3 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -831,6 +831,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 88dc393c2bca..34d73f16b478 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -850,8 +850,15 @@ __SYSCALL(__NR_listxattrat, sys_listxattrat) + #define __NR_removexattrat 466 + __SYSCALL(__NR_removexattrat, sys_removexattrat) + ++#define __NR_process_ksm_enable 467 ++__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) ++#define __NR_process_ksm_disable 468 ++__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) ++#define __NR_process_ksm_status 469 ++__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) ++ + #undef __NR_syscalls +-#define __NR_syscalls 467 ++#define __NR_syscalls 470 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/sys.c b/kernel/sys.c +index c4c701c6f0b4..8806d113f5db 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2816,6 +2816,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + return error; + } + ++#ifdef CONFIG_KSM ++enum pkc_action { ++ PKSM_ENABLE = 0, ++ PKSM_DISABLE, ++ PKSM_STATUS, ++}; ++ ++static long do_process_ksm_control(int pidfd, enum pkc_action action) ++{ ++ long ret; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ ++ task = pidfd_get_task(pidfd, &f_flags); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. 
*/ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ switch (action) { ++ case PKSM_ENABLE: ++ ret = ksm_enable_merge_any(mm); ++ break; ++ case PKSM_DISABLE: ++ ret = ksm_disable_merge_any(mm); ++ break; ++ case PKSM_STATUS: ++ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++out: ++ return ret; ++} ++#endif /* CONFIG_KSM */ ++ ++SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_ENABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_DISABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_STATUS); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t process_ksm_enable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_enable); ++} ++static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); ++ ++static ssize_t process_ksm_disable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_disable); ++} ++static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); ++ ++static ssize_t process_ksm_status_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_status); ++} ++static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); ++ ++static struct attribute *process_ksm_sysfs_attrs[] = { ++ &process_ksm_enable_attr.attr, ++ &process_ksm_disable_attr.attr, ++ &process_ksm_status_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group process_ksm_sysfs_attr_group = { ++ .attrs = process_ksm_sysfs_attrs, ++ .name = "process_ksm", ++}; ++ ++static int __init process_ksm_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); ++} ++subsys_initcall(process_ksm_sysfs_init); ++#endif /* CONFIG_KSM */ ++ + SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) + { +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index c00a86931f8c..d82213d68522 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(process_ksm_enable); ++COND_SYSCALL(process_ksm_disable); ++COND_SYSCALL(process_ksm_status); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); + COND_SYSCALL(get_mempolicy); +diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl +index ebbdb3c42e9f..b19b6bfe5cd4 100644 +--- a/scripts/syscall.tbl ++++ b/scripts/syscall.tbl +@@ -407,3 +407,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common 
removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +index d8b4ab78bef0..57aa958c1b97 100644 +--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +@@ -557,3 +557,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +index e9115b4d8b63..2afc778f2d17 100644 +--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +@@ -469,3 +469,6 @@ + 464 common getxattrat sys_getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0012-zstd.patch b/sys-kernel/gentoo-sources-6.13/0012-zstd.patch new file mode 100644 index 0000000..df7f814 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0012-zstd.patch @@ -0,0 +1,23530 @@ +From 0b468cb06e1605b1cdb08b8c16d6d775ce653cf2 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:33:03 +0100 +Subject: [PATCH 12/12] zstd + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 86 +- + include/linux/zstd_errors.h | 30 +- + include/linux/zstd_lib.h | 1123 ++++-- + lib/zstd/Makefile | 3 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 150 + + lib/zstd/common/bitstream.h | 155 +- + lib/zstd/common/compiler.h | 151 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 37 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 13 +- + lib/zstd/common/error_private.h | 88 +- + lib/zstd/common/fse.h | 103 +- + lib/zstd/common/fse_decompress.c | 132 +- + lib/zstd/common/huf.h | 240 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 45 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 153 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 13 +- + lib/zstd/compress/hist.h | 10 +- + lib/zstd/compress/huf_compress.c | 441 ++- + lib/zstd/compress/zstd_compress.c | 3289 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 621 +++- + lib/zstd/compress/zstd_compress_literals.c | 157 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 21 +- + lib/zstd/compress/zstd_compress_sequences.h | 16 +- + lib/zstd/compress/zstd_compress_superblock.c | 394 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 222 +- + lib/zstd/compress/zstd_double_fast.c | 245 +- + lib/zstd/compress/zstd_double_fast.h | 27 +- + 
lib/zstd/compress/zstd_fast.c | 703 +++- + lib/zstd/compress/zstd_fast.h | 16 +- + lib/zstd/compress/zstd_lazy.c | 840 +++-- + lib/zstd/compress/zstd_lazy.h | 195 +- + lib/zstd/compress/zstd_ldm.c | 102 +- + lib/zstd/compress/zstd_ldm.h | 17 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 571 +-- + lib/zstd/compress/zstd_opt.h | 55 +- + lib/zstd/compress/zstd_preSplit.c | 239 ++ + lib/zstd/compress/zstd_preSplit.h | 34 + + lib/zstd/decompress/huf_decompress.c | 887 +++-- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 377 +- + lib/zstd/decompress/zstd_decompress_block.c | 724 ++-- + lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 19 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 75 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 60 files changed, 8747 insertions(+), 4380 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + create mode 100644 lib/zstd/compress/zstd_preSplit.c + create mode 100644 lib/zstd/compress/zstd_preSplit.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index b2c7cf310c8f..d7be07c887e7 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -160,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters; + zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size); + ++typedef ZSTD_CCtx zstd_cctx; ++typedef ZSTD_cParameter zstd_cparameter; ++ ++/** ++ * zstd_cctx_set_param() - sets a compression parameter ++ * @cctx: The context. Must have been initialized with zstd_init_cctx(). ++ * @param: The parameter to set. ++ * @value: The value to set the parameter to. ++ * ++ * Return: Zero or an error, which can be checked using zstd_is_error(). ++ */ ++size_t zstd_cctx_set_param(zstd_cctx *cctx, zstd_cparameter param, int value); ++ + + /** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level +@@ -175,8 +188,6 @@ zstd_compression_parameters zstd_get_cparams(int level, + + /* ====== Single-pass Compression ====== */ + +-typedef ZSTD_CCtx zstd_cctx; +- + /** + * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx + * @parameters: The compression parameters to be used. +@@ -190,6 +201,20 @@ typedef ZSTD_CCtx zstd_cctx; + */ + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters); + ++/** ++ * zstd_cctx_workspace_bound_with_ext_seq_prod() - max memory needed to ++ * initialize a zstd_cctx when using the block-level external sequence ++ * producer API. ++ * @parameters: The compression parameters to be used. ++ * ++ * If multiple compression parameters might be used, the caller must call ++ * this function for each set of parameters and use the maximum size. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cctx(). ++ */ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *parameters); ++ + /** + * zstd_init_cctx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. 
It must outlive +@@ -424,6 +449,16 @@ typedef ZSTD_CStream zstd_cstream; + */ + size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams); + ++/** ++ * zstd_cstream_workspace_bound_with_ext_seq_prod() - memory needed to initialize ++ * a zstd_cstream when using the block-level external sequence producer API. ++ * @cparams: The compression parameters to be used for compression. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cstream(). ++ */ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *cparams); ++ + /** + * zstd_init_cstream() - initialize a zstd streaming compression context + * @parameters The zstd parameters to use for compression. +@@ -583,6 +618,18 @@ size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, + */ + size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + ++/** ++ * zstd_register_sequence_producer() - exposes the zstd library function ++ * ZSTD_registerSequenceProducer(). This is used for the block-level external ++ * sequence producer API. See upstream zstd.h for detailed documentation. ++ */ ++typedef ZSTD_sequenceProducer_F zstd_sequence_producer_f; ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++); ++ + /** + * struct zstd_frame_params - zstd frame parameters stored in the frame header + * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not +@@ -596,7 +643,7 @@ size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + * + * See zstd_lib.h. + */ +-typedef ZSTD_frameHeader zstd_frame_header; ++typedef ZSTD_FrameHeader zstd_frame_header; + + /** + * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame +@@ -611,4 +658,35 @@ typedef ZSTD_frameHeader zstd_frame_header; + size_t zstd_get_frame_header(zstd_frame_header *params, const void *src, + size_t src_size); + ++/** ++ * struct zstd_sequence - a sequence of literals or a match ++ * ++ * @offset: The offset of the match ++ * @litLength: The literal length of the sequence ++ * @matchLength: The match length of the sequence ++ * @rep: Represents which repeat offset is used ++ */ ++typedef ZSTD_Sequence zstd_sequence; ++ ++/** ++ * zstd_compress_sequences_and_literals() - compress an array of zstd_sequence and literals ++ * ++ * @cctx: The zstd compression context. ++ * @dst: The buffer to compress the data into. ++ * @dst_capacity: The size of the destination buffer. ++ * @in_seqs: The array of zstd_sequence to compress. ++ * @in_seqs_size: The number of sequences in in_seqs. ++ * @literals: The literals associated to the sequences to be compressed. ++ * @lit_size: The size of the literals in the literals buffer. ++ * @lit_capacity: The size of the literals buffer. ++ * @decompressed_size: The size of the input data ++ * ++ * Return: The compressed size or an error, which can be checked using ++ * zstd_is_error(). 
++ */ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size); ++ + #endif /* LINUX_ZSTD_H */ +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..c307fb011132 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,13 +13,18 @@ + #define ZSTD_ERRORS_H_398273423 + + +-/*===== dependency =====*/ +-#include /* size_t */ ++/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ ++#define ZSTDERRORLIB_VISIBLE + ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif + +-/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +49,18 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_cannotProduce_uncompressedBlock = 49, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,18 +68,18 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +-/*! 
ZSTD_getErrorCode() : +- convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, +- which can be used to compare with enum list published above */ +-ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); + ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..e295d4125dde 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,47 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ +-#include /* INT_MAX */ ++ ++/* ====== Dependencies ======*/ + #include /* size_t */ + ++#include /* list of errors */ ++#if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) ++#include /* INT_MAX */ ++#endif /* ZSTD_STATIC_LINKING_ONLY */ ++ + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. ++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) || defined(__IAR_SYSTEMS_ICC__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +90,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 7 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -103,11 +128,12 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + + + /* ************************************* +-* Simple API ++* Simple Core API + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. 
+ * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -115,47 +141,55 @@ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + int compressionLevel); + + /*! ZSTD_decompress() : +- * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. +- * `dstCapacity` is an upper bound of originalSize to regenerate. +- * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. +- * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), +- * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ ++ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. ++ * Multiple compressed frames can be decompressed at once with this method. ++ * The result will be the concatenation of all decompressed frames, back to back. ++ * `dstCapacity` is an upper bound of originalSize to regenerate. ++ * First frame's decompressed size can be extracted using ZSTD_getFrameContentSize(). ++ * If maximum upper bound isn't known, prefer using streaming mode to decompress data. ++ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), ++ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + ++ ++/*====== Decompression helper functions ======*/ ++ + /*! ZSTD_getFrameContentSize() : requires v1.3.0+ +- * `src` should point to the start of a ZSTD encoded frame. +- * `srcSize` must be at least as large as the frame header. +- * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. +- * @return : - decompressed size of `src` frame content, if known +- * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined +- * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) +- * note 1 : a 0 return value means the frame is valid but "empty". +- * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. +- * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. +- * In which case, it's necessary to use streaming mode to decompress data. +- * Optionally, application can rely on some implicit limit, +- * as ZSTD_decompress() only needs an upper bound of decompressed size. +- * (For example, data could be necessarily cut into blocks <= 16 KB). +- * note 3 : decompressed size is always present when compression is completed using single-pass functions, +- * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). +- * note 4 : decompressed size can be very large (64-bits value), +- * potentially larger than what local system can handle as a single memory segment. +- * In which case, it's necessary to use streaming mode to decompress data. +- * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. +- * Always ensure return value fits within application's authorized limits. +- * Each application can set its own limits. +- * note 6 : This function replaces ZSTD_getDecompressedSize() */ ++ * `src` should point to the start of a ZSTD encoded frame. ++ * `srcSize` must be at least as large as the frame header. 
++ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. ++ * @return : - decompressed size of `src` frame content, if known ++ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined ++ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) ++ * note 1 : a 0 return value means the frame is valid but "empty". ++ * When invoking this method on a skippable frame, it will return 0. ++ * note 2 : decompressed size is an optional field, it may not be present (typically in streaming mode). ++ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * Optionally, application can rely on some implicit limit, ++ * as ZSTD_decompress() only needs an upper bound of decompressed size. ++ * (For example, data could be necessarily cut into blocks <= 16 KB). ++ * note 3 : decompressed size is always present when compression is completed using single-pass functions, ++ * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). ++ * note 4 : decompressed size can be very large (64-bits value), ++ * potentially larger than what local system can handle as a single memory segment. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. ++ * Always ensure return value fits within application's authorized limits. ++ * Each application can set its own limits. ++ * note 6 : This function replaces ZSTD_getDecompressedSize() */ + #define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) + #define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) + ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +-/*! ZSTD_getDecompressedSize() : +- * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). ++/*! ZSTD_getDecompressedSize() (obsolete): ++ * This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") + ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ +@@ -163,18 +197,50 @@ ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, +- * or an error code if input is invalid */ ++ * or an error code if input is invalid ++ * Note 1: this method is called _find*() because it's not enough to read the header, ++ * it may have to scan through the frame's content, to reach its end. ++ * Note 2: this method also works with Skippable Frames. In which case, ++ * it returns the size of the complete skippable frame, ++ * which is always equal to its content size + 8 bytes for headers. */ + ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +-/*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +-ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +-ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +-ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +-ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +-ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ ++/*====== Compression helper functions ======*/ ++ ++/*! ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()`, or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize >= ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). ++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize is too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++ ++ ++/*====== Error helper functions ======*/ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ ++ZSTDLIB_API unsigned ZSTD_isError(size_t result); /*!< tells if a `size_t` function result is an error code */ ++ZSTDLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); /* convert a result into an error code, which can be compared to error enum list */ ++ZSTDLIB_API const char* ZSTD_getErrorName(size_t result); /*!< provides readable string from a function result */ ++ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ ++ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ ++ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ + + + /* ************************************* +@@ -182,25 +248,25 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres + ***************************************/ + /*= Compression context + * When compressing many times, +- * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. 
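As a concrete illustration of the helpers above, ZSTD_COMPRESSBOUND() can size a static buffer at compile time, and ZSTD_isError()/ZSTD_getErrorName() are the standard way to test any size_t result. The sketch below is illustrative only; the helper name compress_message and the 1 KB limit are assumptions for the example.

#include <stdio.h>
#include <string.h>
#include <zstd.h>

#define MSG_MAX 1024
/* Compile-time worst-case bound, usable for static allocation as described above. */
static char compressedBuf[ZSTD_COMPRESSBOUND(MSG_MAX)];

/* Returns the compressed size stored in compressedBuf, or 0 on any failure (sketch). */
size_t compress_message(const char* msg)
{
    size_t const msgLen = strlen(msg) + 1;
    if (msgLen > MSG_MAX) return 0;
    size_t const r = ZSTD_compress(compressedBuf, sizeof(compressedBuf), msg, msgLen, 1);
    if (ZSTD_isError(r)) {
        /* Any zstd function returning size_t can be checked the same way. */
        fprintf(stderr, "zstd: %s\n", ZSTD_getErrorName(r));
        return 0;
    }
    return r;
}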
+- * This will make workload friendlier for system's memory. ++ * it is recommended to allocate a compression context just once, ++ * and reuse it for each successive compression operation. ++ * This will make the workload easier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +- * Note 2 : In multi-threaded environments, +- * use one different context per thread for parallel execution. ++ * Note 2: For parallel execution in multi-threaded environments, ++ * use one different context per thread . + */ + typedef struct ZSTD_CCtx_s ZSTD_CCtx; + ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +-ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ ++ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* compatible with NULL pointer */ + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, +- * they will all be reset. Only `compressionLevel` remains. ++ * they will all be reset. Only @compressionLevel remains. + */ + ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -210,7 +276,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +286,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +302,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +390,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". 
*/ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximately targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,15 +482,18 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences +- * ZSTD_c_useBlockSplitter ++ * ZSTD_c_blockSplitterLevel ++ * ZSTD_c_splitAfterSequences + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -421,7 +503,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +512,12 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016, ++ ZSTD_c_experimentalParam20=1017 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +580,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +589,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. 
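A hedged sketch of the advanced one-shot path described in this hunk: parameters (including ZSTD_c_targetCBlockSize, stable only in v1.5.6+ as noted above) are pushed into the context with ZSTD_CCtx_setParameter(), then ZSTD_compress2() starts a new frame. The helper name compress_advanced and the chosen parameter values are illustrative, not part of zstd.

#include <stdlib.h>
#include <zstd.h>

/* Compress with sticky parameters via the advanced API (sketch). */
size_t compress_advanced(const void* src, size_t srcSize, void** dstOut)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    if (cctx == NULL) return 0;

    size_t const cap = ZSTD_compressBound(srcSize);
    void* const dst = malloc(cap);
    if (dst == NULL) { ZSTD_freeCCtx(cctx); return 0; }

    /* Sticky parameters: they apply to this and all subsequent frames on this cctx.
     * Return values are ignored here for brevity; real code should check them with ZSTD_isError(). */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
    /* Stable since v1.5.6 per the note above; aim for roughly 1400-byte compressed blocks. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 1400);

    size_t const cSize = ZSTD_compress2(cctx, dst, cap, src, srcSize);
    ZSTD_freeCCtx(cctx);
    if (ZSTD_isError(cSize)) { free(dst); return 0; }
    *dstOut = dst;
    return cSize;
}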
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity`), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +632,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +697,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. ++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +793,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. ++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() or ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +826,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported.
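To make the ZSTD_compressStream2() contract above concrete, the following sketch streams one FILE* into another, switching to ZSTD_e_end on the last chunk, and resets the context if a call fails, as the error note above recommends. compress_file is a hypothetical helper, not a zstd API.

#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

/* Stream src file into dst file with ZSTD_compressStream2() (sketch). */
int compress_file(FILE* fin, FILE* fout, int level)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    size_t const inCap  = ZSTD_CStreamInSize();   /* recommended chunk sizes */
    size_t const outCap = ZSTD_CStreamOutSize();
    void* const inBuf  = malloc(inCap);
    void* const outBuf = malloc(outCap);
    int ret = 1;
    if (!cctx || !inBuf || !outBuf) goto cleanup;

    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);

    for (;;) {
        size_t const readSz = fread(inBuf, 1, inCap, fin);
        int const lastChunk = (readSz < inCap);
        ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
        ZSTD_inBuffer input = { inBuf, readSz, 0 };
        int finished = 0;
        while (!finished) {
            ZSTD_outBuffer output = { outBuf, outCap, 0 };
            size_t const remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
            if (ZSTD_isError(remaining)) {
                /* After an error the cctx state is undefined; reset before any reuse. */
                ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
                goto cleanup;
            }
            fwrite(outBuf, 1, output.pos, fout);
            /* On the last chunk, loop until the frame is fully flushed (remaining == 0);
             * otherwise, stop once the whole input chunk has been consumed. */
            finished = lastChunk ? (remaining == 0) : (input.pos == input.size);
        }
        if (lastChunk) { ret = 0; break; }
    }

cleanup:
    ZSTD_freeCCtx(cctx);
    free(inBuf); free(outBuf);
    return ret;
}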
+- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +834,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +857,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be re-employed multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -768,16 +867,21 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * The function will update both `pos` fields. + * If `input.pos < input.size`, some input has not been consumed. + * It's up to the caller to present again remaining data. ++* + * The function tries to flush all data decoded immediately, respecting output buffer size. + * If `output.pos < output.size`, decoder has flushed everything it could. +-* But if `output.pos == output.size`, there might be some data left within internal buffers., ++* ++* However, when `output.pos == output.size`, it's more difficult to know. ++* If @return > 0, the frame is not complete, meaning ++* either there is still some data left to flush within internal buffers, ++* or there is more input to read to complete the frame (or both). + * In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. + * Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. + * @return : 0 when a frame is completely decoded and fully flushed, + * or an error code, which can be tested using ZSTD_isError(), + * or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : + * the return value is a suggested next input size (just a hint for better latency) +-* that will never request more than the remaining frame size. ++* that will never request more than the remaining content of the compressed frame. + * *******************************************************************************/ + + typedef ZSTD_DCtx ZSTD_DStream; /*< DCtx and DStream are now effectively same object (>= v1.3.0) */ +@@ -788,13 +892,38 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! 
ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder flushed internal output buffer. ++ * - `output.pos == output.size`, unflushed data potentially present in the internal buffers, ++ * check ZSTD_decompressStream() @return value, ++ * if > 0, invoke it again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1042,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1054,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1068,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". 
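The ZSTD_decompressStream() usage pattern documented above corresponds to a loop like this hedged sketch (decompress_file is a made-up helper): keep feeding input and draining output, and treat a final return value of 0 as a completely decoded and flushed frame.

#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

/* Decompress a stream of zstd frames from fin to fout (sketch). */
int decompress_file(FILE* fin, FILE* fout)
{
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    size_t const inCap  = ZSTD_DStreamInSize();
    size_t const outCap = ZSTD_DStreamOutSize();
    void* const inBuf  = malloc(inCap);
    void* const outBuf = malloc(outCap);
    size_t lastRet = 0;
    int ret = 1;
    if (!dctx || !inBuf || !outBuf) goto cleanup;

    for (;;) {
        size_t const readSz = fread(inBuf, 1, inCap, fin);
        if (readSz == 0) break;
        ZSTD_inBuffer input = { inBuf, readSz, 0 };
        while (input.pos < input.size) {
            ZSTD_outBuffer output = { outBuf, outCap, 0 };
            /* 0 == frame done and flushed; > 0 == more to decode or flush; error otherwise. */
            lastRet = ZSTD_decompressStream(dctx, &output, &input);
            if (ZSTD_isError(lastRet)) goto cleanup;
            fwrite(outBuf, 1, output.pos, fout);
        }
    }
    /* Input ended: the last frame must have been reported complete (return value 0). */
    if (lastRet == 0) ret = 0;

cleanup:
    ZSTD_freeDCtx(dctx);
    free(inBuf); free(outBuf);
    return ret;
}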
+- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1079,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1106,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1123,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1149,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). 
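As a brief sketch of the dictionary workflow described here, a prepared CDict builds the tables once and can then be referenced for every subsequent frame. The helper compress_with_cdict and the dictionary buffer it receives are assumptions for the example; it returns 0 if context or dictionary creation fails, otherwise the ZSTD_compress2() result (test with ZSTD_isError()).

#include <stdlib.h>
#include <zstd.h>

/* Compress one input against a prepared dictionary (sketch). */
size_t compress_with_cdict(const void* dictBuf, size_t dictSize,
                           const void* src, size_t srcSize,
                           void* dst, size_t dstCapacity)
{
    /* Build the dictionary tables once; the CDict can serve many contexts and frames. */
    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, 3 /* level */);
    ZSTD_CCtx*  const cctx  = ZSTD_createCCtx();
    size_t cSize = 0;
    if (cdict && cctx) {
        /* The reference is sticky: it stays active until reset or replaced. */
        ZSTD_CCtx_refCDict(cctx, cdict);
        cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
    }
    ZSTD_freeCCtx(cctx);
    ZSTD_freeCDict(cdict);
    return cSize;
}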
+ * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +@@ -1051,6 +1189,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); + ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); + ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + ++ + #endif /* ZSTD_H_235446 */ + + +@@ -1066,29 +1205,12 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) + #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + ++ + /* This can be overridden externally to hide static symbols. */ + #ifndef ZSTDLIB_STATIC_API + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1245,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1146,7 +1269,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + + /* Advanced parameter bounds */ +-#define ZSTD_TARGETCBLOCKSIZE_MIN 64 ++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ + #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX + #define ZSTD_SRCSIZEHINT_MIN 0 + #define ZSTD_SRCSIZEHINT_MAX INT_MAX +@@ -1188,7 +1311,7 @@ typedef struct { + * + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external +- * sequence provider's perspective. 
For example, ZSTD_compressSequences() does not ++ * sequence provider perspective. For example, ZSTD_compressSequences() does not + * use this 'rep' field at all (as of now). + */ + } ZSTD_Sequence; +@@ -1293,17 +1416,18 @@ typedef enum { + } ZSTD_literalCompressionMode_e; + + typedef enum { +- /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final +- * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable +- * or ZSTD_ps_disable allow for a force enable/disable the feature. ++ /* Note: This enum controls features which are conditionally beneficial. ++ * Zstd can take a decision on whether or not to enable the feature (ZSTD_ps_auto), ++ * but setting the switch to ZSTD_ps_enable or ZSTD_ps_disable force enable/disable the feature. + */ + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ + ZSTD_ps_enable = 1, /* Force-enable the feature */ + ZSTD_ps_disable = 2 /* Do not use the feature */ +-} ZSTD_paramSwitch_e; ++} ZSTD_ParamSwitch_e; ++#define ZSTD_paramSwitch_e ZSTD_ParamSwitch_e /* old name */ + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1345,34 +1469,130 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, + ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); + + /*! ZSTD_frameHeaderSize() : +- * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. ++ * srcSize must be large enough, aka >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_FrameType_e; ++#define ZSTD_frameType_e ZSTD_FrameType_e /* old name */ ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_FrameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; /* for ZSTD_skippableFrame, contains the skippable magic variant [0-15] */ ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_FrameHeader; ++#define ZSTD_frameHeader ZSTD_FrameHeader /* old name */ ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header into `zfhPtr`, or requires larger `srcSize`. ++ * @return : 0 => header is complete, `zfhPtr` is correctly filled, ++ * >0 => `srcSize` is too small, @return value is the wanted `srcSize` amount, `zfhPtr` is not filled, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize); ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. 
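The ZSTD_FrameHeader structure and ZSTD_getFrameHeader() introduced above are static-linking-only; the sketch below assumes ZSTD_STATIC_LINKING_ONLY is defined before including the header and uses the older ZSTD_frameHeader spelling, which this hunk keeps as an alias. inspect_frame is an invented helper, not part of zstd.

#define ZSTD_STATIC_LINKING_ONLY
#include <stdio.h>
#include <zstd.h>

/* Print basic information about the first frame in (src, srcSize) (sketch). */
int inspect_frame(const void* src, size_t srcSize)
{
    ZSTD_frameHeader zfh;   /* old spelling, kept as an alias of ZSTD_FrameHeader */
    size_t const r = ZSTD_getFrameHeader(&zfh, src, srcSize);
    if (ZSTD_isError(r)) return -1;                          /* not a valid zstd frame */
    if (r > 0) { printf("need %zu header bytes\n", r); return -1; }

    printf("frame type    : %s\n", zfh.frameType == ZSTD_skippableFrame ? "skippable" : "zstd");
    if (zfh.frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN)
        printf("content size  : unknown (use streaming decompression)\n");
    else
        printf("content size  : %llu\n", zfh.frameContentSize);
    printf("window size   : %llu\n", zfh.windowSize);
    printf("dictionary id : %u\n", zfh.dictID);
    printf("checksum flag : %u\n", zfh.checksumFlag);
    return 0;
}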
++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { +- ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ +- ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ +-} ZSTD_sequenceFormat_e; ++ ZSTD_sf_noBlockDelimiters = 0, /* ZSTD_Sequence[] has no block delimiters, just sequences */ ++ ZSTD_sf_explicitBlockDelimiters = 1 /* ZSTD_Sequence[] contains explicit block delimiters */ ++} ZSTD_SequenceFormat_e; ++#define ZSTD_sequenceFormat_e ZSTD_SequenceFormat_e /* old name */ ++ ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); + + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * WARNING: This function is meant for debugging and informational purposes ONLY! ++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). 
Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsCapacity The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). + */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,8 +1608,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. +- * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals ++ * If a dictionary is included, then the cctx should reference the dict (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.). + * The entire source is compressed into a single frame. + * + * The compression behavior changes based on cctx params. In particular: +@@ -1398,11 +1620,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain +- * block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. ++ * valid block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. ++ * ++ * When ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, it's possible to decide generating repcodes ++ * using the advanced parameter ZSTD_c_repcodeResolution. Repcodes will improve compression ratio, though the benefit ++ * can vary greatly depending on Sequences. 
On the other hand, repcode resolution is an expensive operation. ++ * By default, it's disabled at low (<10) compression levels, and enabled above the threshold (>=10). ++ * ZSTD_c_repcodeResolution makes it possible to directly manage this processing in either direction. + * +- * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined +- * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for +- * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. ++ * If ZSTD_c_validateSequences == 0, this function blindly accepts the Sequences provided. Invalid Sequences cause undefined ++ * behavior. If ZSTD_c_validateSequences == 1, then the function will detect invalid Sequences (see doc/zstd_compression_format.md for ++ * specifics regarding offset/matchlength requirements) and then bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN. +@@ -1410,14 +1638,42 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset + * is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md + * +- * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. +- * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, +- * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. +- */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ * Note: Repcodes are, as of now, always re-calculated within this function, ZSTD_Sequence.rep is effectively unused. ++ * Dev Note: Once ability to ingest repcodes become available, the explicit block delims mode must respect those repcodes exactly, ++ * and cannot emit an RLE block that disagrees with the repcode history. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); ++ ++ ++/*! ZSTD_compressSequencesAndLiterals() : ++ * This is a variant of ZSTD_compressSequences() which, ++ * instead of receiving (src,srcSize) as input parameter, receives (literals,litSize), ++ * aka all the literals, already extracted and laid out into a single continuous buffer. ++ * This can be useful if the process generating the sequences also happens to generate the buffer of literals, ++ * thus skipping an extraction + caching stage. 
++ * It's a speed optimization, useful when the right conditions are met, ++ * but it also features the following limitations: ++ * - Only supports explicit delimiter mode ++ * - Currently does not support Sequences validation (so input Sequences are trusted) ++ * - Not compatible with frame checksum, which must be disabled ++ * - If any block is incompressible, will fail and return an error ++ * - @litSize must be == sum of all @.litLength fields in @inSeqs. Any discrepancy will generate an error. ++ * - @litBufCapacity is the size of the underlying buffer into which literals are written, starting at address @literals. ++ * @litBufCapacity must be at least 8 bytes larger than @litSize. ++ * - @decompressedSize must be correct, and correspond to the sum of all Sequences. Any discrepancy will generate an error. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t nbSequences, ++ const void* literals, size_t litSize, size_t litBufCapacity, ++ size_t decompressedSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1425,8 +1681,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds + * + * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, + * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. +- * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so +- * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. ++ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, ++ * so the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). +@@ -1434,26 +1690,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds + * @return : number of bytes written or a ZSTD error. + */ + ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, unsigned magicVariant); ++ const void* src, size_t srcSize, ++ unsigned magicVariant); + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves the content of a zstd skippable frame starting at @src, and writes it to @dst buffer. + * +- * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, +- * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested +- * in the magicVariant. ++ * The parameter @magicVariant will receive the magicVariant that was supplied when the frame was written, ++ * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. ++ * This can be NULL if the caller is not interested in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, ++ const void* src, size_t srcSize); + + /*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +-ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); ++ZSTDLIB_STATIC_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + + + +@@ -1464,48 +1722,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. 
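A small sketch of the skippable-frame helpers covered above (after this patch they are declared under ZSTD_STATIC_LINKING_ONLY): write a metadata blob into a skippable frame, verify it, and read it back. skippable_roundtrip and the embedded JSON string are invented for the example.

#define ZSTD_STATIC_LINKING_ONLY
#include <string.h>
#include <zstd.h>

/* Wrap a small metadata blob in a skippable frame, then read it back (sketch). */
int skippable_roundtrip(void)
{
    const char meta[] = "{\"codec\":\"zstd\",\"version\":1}";
    char frame[sizeof(meta) + 8];            /* skippable frame = content + 8-byte header */
    char back[sizeof(meta)];
    unsigned variant = 0;

    size_t const fSize = ZSTD_writeSkippableFrame(frame, sizeof(frame),
                                                  meta, sizeof(meta), 3 /* magic variant 0-15 */);
    if (ZSTD_isError(fSize)) return 1;

    if (!ZSTD_isSkippableFrame(frame, fSize)) return 1;

    size_t const rSize = ZSTD_readSkippableFrame(back, sizeof(back), &variant, frame, fSize);
    if (ZSTD_isError(rSize) || rSize != sizeof(meta) || variant != 3) return 1;
    return memcmp(back, meta, rSize) != 0;   /* 0 on success */
}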
++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1568,7 +1837,15 @@ typedef void (*ZSTD_freeFunction) (void* opaque, void* address); + typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; + static + __attribute__((__unused__)) ++ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic push ++#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" ++#endif + ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /*< this constant defers to stdlib's functions */ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic pop ++#endif + + ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); + ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +@@ -1649,22 +1926,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! 
ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1725,7 +2025,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * See the comments on that enum for an explanation of the feature. */ + #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +-/* Controlled with ZSTD_paramSwitch_e enum. ++/* Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals +@@ -1737,11 +2037,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2103,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. 
+ * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2120,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1871,22 +2166,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + /* ZSTD_c_validateSequences + * Default is 0 == disabled. Set to 1 to enable sequence validation. + * +- * For use with sequence compression API: ZSTD_compressSequences(). +- * Designates whether or not we validate sequences provided to ZSTD_compressSequences() ++ * For use with sequence compression API: ZSTD_compressSequences*(). ++ * Designates whether or not provided sequences are validated within ZSTD_compressSequences*() + * during function execution. + * +- * Without validation, providing a sequence that does not conform to the zstd spec will cause +- * undefined behavior, and may produce a corrupted block. 
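A cautious sketch of the ZSTD_c_stableInBuffer usage described above: the whole input is already in memory, the same ZSTD_inBuffer object is passed on every call, and the frame is finished with ZSTD_e_end. compress_stable_input is a hypothetical helper, the parameter remains experimental (ZSTD_STATIC_LINKING_ONLY required), and the caller must keep 'src' valid and unmodified until compression ends.

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Compress an in-memory buffer with ZSTD_c_stableInBuffer enabled (sketch). */
size_t compress_stable_input(const void* src, size_t srcSize, void* dst, size_t dstCapacity)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    if (cctx == NULL) return 0;
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_stableInBuffer, 1);

    ZSTD_inBuffer  input  = { src, srcSize, 0 };    /* same struct for every call */
    ZSTD_outBuffer output = { dst, dstCapacity, 0 };
    size_t remaining;
    do {
        /* With a stable input, finish the frame in ZSTD_e_end mode. */
        remaining = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_end);
        if (ZSTD_isError(remaining)) { ZSTD_freeCCtx(cctx); return 0; }
    } while (remaining != 0 && output.pos < output.size);

    ZSTD_freeCCtx(cctx);
    return (remaining == 0) ? output.pos : 0;       /* 0 signals failure in this sketch */
}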
++ * When Sequence validation is disabled (default), Sequences are compressed as-is, ++ * so they must correct, otherwise it would result in a corruption error. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * Sequence validation adds some protection, by ensuring that all values respect boundary conditions. ++ * If a Sequence is detected invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. +- * + */ + #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 + +-/* ZSTD_c_useBlockSplitter +- * Controlled with ZSTD_paramSwitch_e enum. ++/* ZSTD_c_blockSplitterLevel ++ * note: this parameter only influences the first splitter stage, ++ * which is active before producing the sequences. ++ * ZSTD_c_splitAfterSequences controls the next splitter stage, ++ * which is active after sequence production. ++ * Note that both can be combined. ++ * Allowed values are between 0 and ZSTD_BLOCKSPLITTER_LEVEL_MAX included. ++ * 0 means "auto", which will select a value depending on current ZSTD_c_strategy. ++ * 1 means no splitting. ++ * Then, values from 2 to 6 are sorted in increasing cpu load order. ++ * ++ * Note that currently the first block is never split, ++ * to ensure expansion guarantees in presence of incompressible data. ++ */ ++#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6 ++#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20 ++ ++/* ZSTD_c_splitAfterSequences ++ * This is a stronger splitter algorithm, ++ * based on actual sequences previously produced by the selected parser. ++ * It's also slower, and as a consequence, mostly used for high compression levels. ++ * While the post-splitter does overlap with the pre-splitter, ++ * both can nonetheless be combined, ++ * notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX, ++ * resulting in higher compression ratio than just one of them. ++ * + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. +@@ -1894,10 +2213,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * block splitting based on the compression parameters. + */ +-#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 ++#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13 + + /* ZSTD_c_useRowMatchFinder +- * Controlled with ZSTD_paramSwitch_e enum. ++ * Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. +@@ -1928,6 +2247,80 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_ParamSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. 
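The interaction between the two splitter stages documented above is easiest to see in a small hedged sketch (the helper name is illustrative, and a cctx created elsewhere is assumed): both the pre-splitter and the post-splitter can be forced on together, which, as noted above, can give a higher ratio than either stage alone at the cost of extra CPU.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: enable both block-splitting stages; returns 0 or a zstd error code. */
    static size_t enable_both_splitters(ZSTD_CCtx* cctx)
    {
        size_t r = ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockSplitterLevel,
                                          ZSTD_BLOCKSPLITTER_LEVEL_MAX);   /* pre-splitter, most thorough level */
        if (ZSTD_isError(r)) return r;
        return ZSTD_CCtx_setParameter(cctx, ZSTD_c_splitAfterSequences,
                                      ZSTD_ps_enable);                     /* post-splitter, sequence-based */
    }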
++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_repcodeResolution ++ * This parameter only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters (may change in the future). ++ * ++ * This parameter affects how zstd parses external sequences, ++ * provided via the ZSTD_compressSequences*() API ++ * or from an external block-level sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets within ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences*() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level (currently: level<10 disables, level>=10 enables). 
++ */ ++#define ZSTD_c_repcodeResolution ZSTD_c_experimentalParam19 ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 /* older name */ ++ ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2477,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2530,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2565,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2602,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2620,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! 
ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2638,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2653,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2669,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2684,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2694,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
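For reference, a hedged sketch of the replacement sequence spelled out in the deprecation comments above for ZSTD_initCStream_usingCDict_advanced(); the wrapper name is illustrative, and the static-only header is assumed for ZSTD_CCtx_setFParams().

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: modern equivalent of the deprecated ZSTD_initCStream_usingCDict_advanced(). */
    static size_t begin_frame_with_cdict(ZSTD_CCtx* zcs, const ZSTD_CDict* cdict,
                                         ZSTD_frameParameters fParams,
                                         unsigned long long pledgedSrcSize)
    {
        size_t r = ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
        if (!ZSTD_isError(r)) r = ZSTD_CCtx_setFParams(zcs, fParams);
        if (!ZSTD_isError(r)) r = ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
        if (!ZSTD_isError(r)) r = ZSTD_CCtx_refCDict(zcs, cdict);
        return r;   /* 0 on success, or an error code testable with ZSTD_isError() */
    }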
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2740,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2751,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! +@@ -2339,18 +2760,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). 
The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. 
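To ground the producer contract described above, a hedged sketch of a trivial external sequence producer: it emits the canonical "no matches" parse, a single final sequence with matchLength == 0 and offset == 0 that covers all of src as literals, which satisfies the validity rules listed above. The function name is illustrative; the signature follows the ZSTD_sequenceProducer_F typedef declared further below.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Illustrative producer: a literals-only parse of the block (no matches). */
    static size_t literalsOnlyProducer(void* sequenceProducerState,
                                       ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                       const void* src, size_t srcSize,
                                       const void* dict, size_t dictSize,
                                       int compressionLevel, size_t windowSize)
    {
        (void)sequenceProducerState; (void)src; (void)dict; (void)dictSize;
        (void)compressionLevel; (void)windowSize;
        if (srcSize == 0) return 0;                        /* nothing to parse */
        if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;
        outSeqs[0].offset      = 0;                        /* final sequence: offset must be 0 ... */
        outSeqs[0].matchLength = 0;                        /* ... when matchLength is 0 */
        outSeqs[0].litLength   = (unsigned)srcSize;        /* litLengths + matchLengths == srcSize */
        outSeqs[0].rep         = 0;
        return 1;                                          /* number of sequences written */
    }

    /* Registration (sticky until the next parameter reset), state unused here:
     *     ZSTD_registerSequenceProducer(cctx, NULL, literalsOnlyProducer);       */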
++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! 
ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2963,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,39 +2984,49 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. ++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + +- It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, ++ It fills a ZSTD_FrameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. 
+ As a consequence, check that values remain within valid application range. +@@ -2428,7 +3042,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +3062,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +3085,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! 
ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +3096,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3104,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3131,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3147,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ +- +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..be218b5e0ed5 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +@@ -26,6 +26,7 @@ zstd_compress-y := \ + compress/zstd_lazy.o \ + compress/zstd_ldm.o \ + compress/zstd_opt.o \ ++ compress/zstd_preSplit.o \ + + zstd_decompress-y := \ + zstd_decompress_module.o \ +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..16c3d08e8d1a +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "compiler.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ +diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h +new file mode 100644 +index 000000000000..c5faaa3d7b08 +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,150 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++#else ++ return ZSTD_countTrailingZeros32_fallback(val); ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++#else ++ return ZSTD_countLeadingZeros32_fallback(val); ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..86439da0eea7 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,7 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ +- ++#include "bits.h" /* ZSTD_highbit32 */ + + /*========================================= + * Target specific +@@ -41,12 +42,13 @@ + /*-****************************************** + * bitStream encoding API (write forward) + ********************************************/ ++typedef size_t BitContainerType; + /* bitStream can mix input from multiple sources. + * A critical property of these streams is that they encode and decode in **reverse** direction. + * So the first bit sequence you add will be the last to be read, like a LIFO stack. + */ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitPos; + char* startPtr; + char* ptr; +@@ -54,7 +56,7 @@ typedef struct { + } BIT_CStream_t; + + MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); +-MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + +@@ -63,7 +65,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + * `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. + * + * bits are first added to a local register. +-* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems. ++* Local register is BitContainerType, 64-bits on 64-bits systems, or 32-bits on 32-bits systems. + * Writing data into memory is an explicit operation, performed by the flushBits function. + * Hence keep track how many bits are potentially stored into local register to avoid register overflow. + * After a flushBits, a maximum of 7 bits might still be stored into local register. 
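As an aside on the encoding API above (now parameterized by BitContainerType), a hedged sketch of how the writer side is typically driven: fields are pushed LIFO, so the first field added is the last one read back. The helper name and field values are illustrative; ERR_isError() comes from error_private.h, which this header already includes.

    #include "bitstream.h"   /* the internal header being patched here */

    /* Sketch: pack two small fields, then terminate the stream. */
    static size_t toy_bitstream_write(void* dst, size_t dstCapacity)
    {
        BIT_CStream_t bitC;
        if (ERR_isError(BIT_initCStream(&bitC, dst, dstCapacity))) return 0;  /* dst too small */
        BIT_addBits(&bitC, 0x15, 5);     /* written first => will be read back last */
        BIT_addBits(&bitC, 0x0AB, 9);
        BIT_flushBits(&bitC);            /* commit the local register to dst */
        return BIT_closeCStream(&bitC);  /* stream size in bytes, or 0 on overflow */
    }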
+@@ -80,28 +82,28 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + * bitStream decoding API (read backward) + **********************************************/ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); +-MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); ++MEM_STATIC BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); + MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); + MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. + * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. 
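For the decoding side documented above, a matching hedged sketch that reads back the two fields written by the toy_bitstream_write() sketch earlier; note the reverse read order and that BIT_endOfDStream() confirms the stream was consumed exactly. Names are illustrative.

    #include "bitstream.h"

    /* Sketch: read back the fields from toy_bitstream_write(), in reverse order. */
    static int toy_bitstream_read(const void* src, size_t srcSize)
    {
        BIT_DStream_t bitD;
        unsigned hi, lo;
        if (ERR_isError(BIT_initDStream(&bitD, src, srcSize))) return -1;
        hi = (unsigned)BIT_readBits(&bitD, 9);   /* last written comes out first */
        lo = (unsigned)BIT_readBits(&bitD, 5);
        (void)BIT_reloadDStream(&bitD);          /* BIT_DStream_completed expected here */
        return (hi == 0x0AB && lo == 0x15 && BIT_endOfDStream(&bitD)) ? 0 : -1;
    }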
+@@ -113,7 +115,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + /*-**************************************** + * unsafe API + ******************************************/ +-MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + /* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + + MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,16 +153,22 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getLowerBits(BitContainerType bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ + MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -195,7 +176,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ + MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +@@ -242,7 +223,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); + } + + +@@ -266,35 +247,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 
8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +284,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -318,26 +299,20 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +-#if defined(__x86_64__) || defined(_M_X86) ++#if defined(__x86_64__) || defined(_M_X64) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); + #else + return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. 
+ * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. + * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -353,14 +328,14 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U3 + + /*! BIT_lookBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +-MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) ++MEM_STATIC BitContainerType BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + { + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,23 +344,38 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { +- size_t const value = BIT_lookBits(bitD, nbBits); ++ BitContainerType const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ +-MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) ++ * unsafe version; only works if nbBits >= 1 */ ++MEM_STATIC BitContainerType BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { +- size_t const value = BIT_lookBitsFast(bitD, nbBits); ++ BitContainerType const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); + BIT_skipBits(bitD, nbBits); + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +386,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . 
+- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { +- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ ++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ ++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { ++ static const BitContainerType zeroFilled = 0; ++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ ++ /* overflow detected, erroneous scenario or end of stream: no update */ + return BIT_DStream_overflow; ++ } ++ ++ assert(bitD->ptr >= bitD->start); + + if (bitD->ptr >= bitD->limitPtr) { +- return BIT_reloadDStreamFast(bitD); ++ return BIT_reloadDStream_internal(bitD); + } + if (bitD->ptr == bitD->start) { ++ /* reached end of bitStream => no update */ + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } +- /* start < ptr < limitPtr */ ++ /* start < ptr < limitPtr => cautious update */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { +@@ -442,5 +436,4 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); + } + +- + #endif /* BITSTREAM_H_MODULE */ +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..dc9bd15e174e 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,6 +12,8 @@ + #ifndef ZSTD_COMPILER_H + #define ZSTD_COMPILER_H + ++#include ++ + #include "portability_macros.h" + + /*-******************************************************* +@@ -41,12 +44,15 @@ + */ + #define WIN_CDECL + ++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ ++#define UNUSED_ATTR __attribute__((unused)) ++ + /* + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR ++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR + /* + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers +@@ -61,11 +67,21 @@ + #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 + # define HINT_INLINE static INLINE_KEYWORD + #else +-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR ++# define HINT_INLINE FORCE_INLINE_TEMPLATE + #endif + +-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. 
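/* Illustrative sketch (assumption): the "C template" idea behind FORCE_INLINE_TEMPLATE in
 * the compiler.h hunk above. A static always-inline helper takes a compile-time constant
 * parameter; each thin wrapper pins that constant so the branch on it folds away in the
 * specialized copy, and the unused attribute (UNUSED_ATTR) silences -Wunused-function for
 * wrappers a translation unit never calls. Names below are placeholders. */
#include <stddef.h>

#define DEMO_FORCE_INLINE_TEMPLATE static inline __attribute__((always_inline, unused))

DEMO_FORCE_INLINE_TEMPLATE void demo_fill_template(unsigned char* dst, size_t n, int zeroFill)
{
    size_t i;
    for (i = 0; i < n; i++)
        dst[i] = zeroFill ? 0x00 : 0xFF;    /* constant-folds once zeroFill is pinned */
}

/* Each wrapper instantiates one specialized copy of the template. */
static void demo_fill_zero(unsigned char* dst, size_t n) { demo_fill_template(dst, n, 1); }
static void demo_fill_ones(unsigned char* dst, size_t n) { demo_fill_template(dst, n, 0); }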
*/ +-#define UNUSED_ATTR __attribute__((unused)) ++/* "soft" inline : ++ * The compiler is free to select if it's a good idea to inline or not. ++ * The main objective is to silence compiler warnings ++ * when a defined function in included but not used. ++ * ++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. ++ * Updating the prefix is probably preferable, but requires a fairly large codemod, ++ * since this name is used everywhere. ++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,16 +143,13 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ + +-/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ +- +- + /* compile time determination of SIMD support */ + + /* C-language Attributes are added in C23. 
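/* Illustrative sketch (assumption): why the PREFETCH_L1/L2, PREFETCH_AREA and
 * ZSTD_UNREACHABLE hunks above wrap their bodies in do { ... } while (0). A bare brace
 * block followed by ';' terminates the enclosing if, so a later else no longer parses;
 * the do/while form behaves as a single statement in every position. */
#include <stdio.h>

#define DEMO_LOG2_BRACES(msg)  { puts(msg); puts(msg); }
#define DEMO_LOG2_DOWHILE(msg) do { puts(msg); puts(msg); } while (0)

static void demo_macro_hygiene(int cond)
{
    /* if (cond) DEMO_LOG2_BRACES("hit"); else puts("miss");   -- fails to compile:
     * the ';' after the closing brace ends the if and orphans the else. */
    if (cond) DEMO_LOG2_DOWHILE("hit"); else puts("miss");     /* parses as intended */
}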
*/ +@@ -158,9 +172,15 @@ + #define ZSTD_FALLTHROUGH fallthrough + + /*-************************************************************** +-* Alignment check ++* Alignment + *****************************************************************/ + ++/* @return 1 if @u is a 2^n value, 0 otherwise ++ * useful to check a value is valid for alignment restrictions */ ++MEM_STATIC int ZSTD_isPower2(size_t u) { ++ return (u & (u-1)) == 0; ++} ++ + /* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, +@@ -175,10 +195,95 @@ + + #endif /* ZSTD_ALIGNOF */ + ++#ifndef ZSTD_ALIGNED ++/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ ++#define ZSTD_ALIGNED(a) __attribute__((aligned(a))) ++#endif /* ZSTD_ALIGNED */ ++ ++ + /*-************************************************************** + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without triggering ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. ++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. 
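/* Illustrative sketch (assumption): the bit trick behind ZSTD_isPower2() added above.
 * u & (u-1) clears the lowest set bit, so the result is zero exactly when at most one
 * bit is set -- note that 0 also passes, so callers must exclude it where it matters. */
#include <assert.h>
#include <stddef.h>

static int demo_is_pow2(size_t u) { return (u & (u - 1)) == 0; }

static void demo_alignment_check(void)
{
    assert(demo_is_pow2(64));      /* typical cache-line / alignment values pass */
    assert(!demo_is_pow2(48));     /* non powers of two are rejected */
    assert(demo_is_pow2(0));       /* corner case: 0 satisfies the test as well */
}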
*/ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..8eb6aa9a3b20 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..c8a10281f112 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -33,7 +34,6 @@ + #define DEBUG_H_12987983217 + + +- + /* static assert is triggered at compile time, leaving no runtime artefact. + * static assert only works with compile-time constants. + * Also, this variant can only be used inside a function. */ +@@ -82,20 +82,27 @@ extern int g_debuglevel; /* the variable is only declared, + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) 
do { } while (0) /* disabled */ + #endif + +- +- + #endif /* DEBUG_H_12987983217 */ +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
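/* Illustrative sketch (assumption): the repeat-count trick used by FSE_readNCount_body
 * above. Each "repeat" flag is the 2-bit pattern 11 at the bottom of bitStream, so
 * counting the trailing zeros of ~bitStream (with a sentinel bit so the argument is
 * never zero) and halving gives the run length in one step. __builtin_ctz stands in
 * for ZSTD_countTrailingZeros32 and is GCC/Clang specific. */
#include <assert.h>
#include <stdint.h>

static int demo_count_repeat_flags(uint32_t bitStream)
{
    return __builtin_ctz(~bitStream | 0x80000000u) >> 1;
}

static void demo_repeats(void)
{
    assert(demo_count_repeat_flags(0x00000000u) == 0);  /* bottom bits 00: no repeat flag */
    assert(demo_count_repeat_flags(0x00000003u) == 1);  /* ...0011: one repeat flag */
    assert(demo_count_repeat_flags(0x0000003Fu) == 3);  /* ...111111: three repeat flags */
}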
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..6c3dbad838b6 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
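/* Illustrative sketch (assumption): the runtime-dispatch shape HUF_readStats_wksp() uses
 * above once the bmi2 int becomes a flags bitset. The same body is compiled twice, one
 * copy carrying a BMI2 target attribute, and the flag picks between them at run time.
 * Names, the flag value, and the target attribute are placeholders and assume GCC/Clang
 * on x86-64; the caller is assumed to have probed CPU support already. */
#define DEMO_DYNAMIC_BMI2 1
#define DEMO_FLAG_BMI2 (1 << 0)   /* stands in for HUF_flags_bmi2 */

__attribute__((target("bmi2"))) static unsigned demo_body_bmi2(unsigned x) { return x + 1; }
static unsigned demo_body_default(unsigned x) { return x + 1; }

static unsigned demo_readStats_dispatch(unsigned x, int flags)
{
#if DEMO_DYNAMIC_BMI2
    if (flags & DEMO_FLAG_BMI2)
        return demo_body_bmi2(x);      /* only taken when the caller set the BMI2 flag */
#endif
    (void)flags;
    return demo_body_default(x);
}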
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,23 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(cannotProduce_uncompressedBlock): return "This mode cannot generate an uncompressed block"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..08ee87b68cca 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,8 +14,6 @@ + #ifndef ERROR_H_MODULE + #define ERROR_H_MODULE + +- +- + /* **************************************** + * Dependencies + ******************************************/ +@@ -23,7 +22,6 @@ + #include "debug.h" + #include "zstd_deps.h" /* size_t */ + +- + /* **************************************** + * Compiler-specific + ******************************************/ +@@ -49,8 +47,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +87,12 @@ void _force_has_format_string(const char *format, ...) { + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +103,49 @@ void _force_has_format_string(const char *format, ...) { + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) 
\ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); +- ++#define FORWARD_IF_ERROR(err, ...) \ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + #endif /* ERROR_H_MODULE */ +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..b36ce7a2a8c3 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -11,8 +12,6 @@ + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ +- +- + #ifndef FSE_H + #define FSE_H + +@@ -22,7 +21,6 @@ + ******************************************/ + #include "zstd_deps.h" /* size_t, ptrdiff_t */ + +- + /*-***************************************** + * FSE_PUBLIC_API : control library symbols visibility + ******************************************/ +@@ -50,34 +48,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. 
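/* Illustrative sketch (assumption): the size_t error convention that ERR_isError(),
 * CHECK_F and FORWARD_IF_ERROR above rely on. Error codes are returned as the two's
 * complement of a small enum value, so they occupy the very top of the size_t range and
 * one comparison separates them from ordinary sizes. The enum value below is a
 * placeholder, not the real maxCode. */
#include <assert.h>
#include <stddef.h>

enum { DEMO_maxCode = 120 };   /* hypothetical last error number */

static size_t   demo_ERROR(unsigned errnum) { return (size_t)0 - errnum; }
static unsigned demo_isError(size_t ret)    { return ret > (size_t)0 - DEMO_maxCode; }

static void demo_error_convention(void)
{
    assert(!demo_isError(4096));            /* normal sizes stay far below the error band */
    assert(demo_isError(demo_ERROR(20)));   /* negated codes land in the top DEMO_maxCode values */
}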
+-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +58,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +117,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +192,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! 
+ Tutorial : +@@ -286,13 +224,11 @@ If there is an error, the function will return an error code, which can be teste + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY +- +-/* *** Dependency *** */ + #include "bitstream.h" + +- + /* ***************************************** + * Static allocation + *******************************************/ +@@ -317,16 +253,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +270,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. 
*/ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +457,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +@@ -705,7 +623,4 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) + + #define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3) + +- + #endif /* FSE_STATIC_LINKING_ONLY */ +- +- +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..15081d8dc607 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
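/* Illustrative sketch (assumption): the 0x0101010101010101 constant in the
 * FSE_buildDTable_internal() hunk above. Adding it once per symbol keeps every byte of
 * sv equal to the current symbol index, so a single 64-bit store spreads eight copies
 * of that symbol into the spread table at a time. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void demo_spread_bytes(void)
{
    const uint64_t add = 0x0101010101010101ULL;
    uint64_t sv = 0;
    unsigned char spread[sizeof sv];
    unsigned s;

    for (s = 0; s < 3; s++) sv += add;   /* after three increments every byte of sv is 0x03 */
    memcpy(spread, &sv, sizeof sv);      /* endianness is irrelevant: all bytes are equal */
    assert(spread[0] == 0x03 && spread[7] == 0x03);
}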
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +252,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +272,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +313,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..49736dcd8f49 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -12,105 +13,26 @@ + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ + +- + #ifndef HUF_H_298734234 + #define HUF_H_298734234 + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. 
+- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). +- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); +- ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + + +-/* *** Advanced function *** */ +- +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +73,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +128,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +142,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +153,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
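/* Illustrative sketch (assumption): how the HUF_flags_e bitset declared above replaces
 * the old separate bmi2 / preferRepeat / suspectUncompressible int parameters. The helper
 * only composes a mask; it assumes the updated huf.h from the hunk above is on the
 * include path and that the caller probed BMI2 support itself. */
static int demo_make_huf_flags(int cpu_has_bmi2, int favor_sampling)
{
    int flags = 0;
    if (cpu_has_bmi2)
        flags |= HUF_flags_bmi2;                  /* honoured only in DYNAMIC_BMI2 builds */
    if (favor_sampling)
        flags |= HUF_flags_suspectUncompressible; /* sample before histogramming everything */
    return flags;   /* passed as the trailing 'int flags' of the HUF_* entry points above */
}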
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +185,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +193,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +236,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +252,27 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); +- +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
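+ * With the flags-based API this is expressed through the HUF_flags_e bitset instead,
+ * e.g. (illustrative sketch) :
+ *   int const flags = ZSTD_cpuSupportsBmi2() ? HUF_flags_bmi2 : 0;
+ * and the resulting `flags` value is what the *_usingDTable() / *_wksp() variants below expect.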
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ +- ++#endif /* HUF_H_298734234 */ +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index c22a2e69bf46..d9bd752fe17b 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..05286af72683 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. 
It MUST not contain any C code. + * +@@ -45,30 +46,35 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif + ++/* Compile time determination of BMI2 support */ ++ ++ + /* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. + */ + #ifndef DYNAMIC_BMI2 +- #if ((defined(__clang__) && __has_attribute(__target__)) \ ++# if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ +- && (defined(__x86_64__) || defined(_M_X64)) \ ++ && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)) \ + && !defined(__BMI2__) +- # define DYNAMIC_BMI2 1 +- #else +- # define DYNAMIC_BMI2 0 +- #endif ++# define DYNAMIC_BMI2 1 ++# else ++# define DYNAMIC_BMI2 0 ++# endif + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNU C compatible compilers, + * because other platforms may not support GAS assembly syntax. + * +- * Only enable assembly for Linux / MacOS, other platforms may ++ * Only enable assembly for Linux / MacOS / Win32, other platforms may + * work, but they haven't been tested. This could likely be + * extended to BSD systems. + * +@@ -90,4 +96,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..44b95b25344a 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! 
ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 2c34e8a33a1c..f931f7d0e294 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. ++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..52a79435caf6 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,12 +29,10 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 + +- + /* ---- static assert (debug) --- */ + #define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) + #define ZSTD_isError ERR_isError /* for inlining */ +@@ -83,16 +82,17 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 +-typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; ++typedef enum { set_basic, set_rle, set_compressed, set_repeat } SymbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 + + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +227,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +236,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -273,62 +268,6 @@ typedef enum { + /*-******************************************* + * Private declarations + *********************************************/ +-typedef struct seqDef_s { +- U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ +- U16 litLength; +- U16 mlBase; /* mlBase == matchLength - MINMATCH */ +-} seqDef; +- +-/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */ +-typedef enum { +- ZSTD_llt_none = 0, /* no longLengthType */ +- ZSTD_llt_literalLength = 1, /* represents a long literal */ +- ZSTD_llt_matchLength = 2 /* represents a long match */ +-} ZSTD_longLengthType_e; +- +-typedef struct { +- seqDef* sequencesStart; +- seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; +- size_t maxNbSeq; +- size_t maxNbLit; +- +- /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength +- * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment +- * the existing value of the litLength or matchLength by 0x10000. +- */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ +-} seqStore_t; +- +-typedef struct { +- U32 litLength; +- U32 matchLength; +-} ZSTD_sequenceLength; +- +-/* +- * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences +- * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. 
+- */ +-MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) +-{ +- ZSTD_sequenceLength seqLen; +- seqLen.litLength = seq->litLength; +- seqLen.matchLength = seq->mlBase + MINMATCH; +- if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { +- if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; +- } +- if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; +- } +- } +- return seqLen; +-} + + /* + * Contains the compressed frame size and an upper-bound for the decompressed frame size. +@@ -337,74 +276,11 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} +- +- + /* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; +@@ -420,13 +296,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +@@ -439,5 +315,4 @@ MEM_STATIC int ZSTD_cpuSupportsBmi2(void) + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); + } + +- + #endif /* ZSTD_CCOMMON_H_MODULE */ +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..44a3c10becf2 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* 
round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. 
+- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). 
++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) 
return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart 
= (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new 
huffman table */ +@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index 16bb995bc6c4..fc0a0f4e71a6 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,13 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" ++#include "../common/error_private.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +29,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -44,7 +47,7 @@ + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. +- * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3, ++ * The actual (selected) size of the hash table is then stored in ZSTD_MatchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ + #ifndef ZSTD_HASHLOG3_MAX +@@ -55,14 +58,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -75,12 +81,12 @@ struct ZSTD_CDict_s { + ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ + U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + ZSTD_cwksp workspace; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + ZSTD_compressedBlockState_t cBlockState; + ZSTD_customMem customMem; + U32 dictID; + int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ +- ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use ++ ZSTD_ParamSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. + */ +@@ -130,11 +136,12 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; + +- /* statically sized space. 
entropyWorkspace never moves (but prev/next block swap places) */ +- if (!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; ++ /* statically sized space. tmpWorkspace never moves (but prev/next block swap places) */ ++ if (!ZSTD_cwksp_check_available(&cctx->workspace, TMP_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); +- cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE); ++ cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); ++ cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; + } +@@ -168,15 +175,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -205,7 +210,7 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) + } + + /* private API call, for dictBuilder only */ +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + + /* Returns true if the strategy supports using a row based matchfinder */ + static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { +@@ -215,32 +220,23 @@ static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + /* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. 
+ */ +-static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { ++static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); + } + + /* Returns row matchfinder usage given an initial mode and cParams */ +-static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +-#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) +- int const kHasSIMD128 = 1; +-#else +- int const kHasSIMD128 = 0; +-#endif + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; +- if (kHasSIMD128) { +- if (cParams->windowLog > 14) mode = ZSTD_ps_enable; +- } else { +- if (cParams->windowLog > 17) mode = ZSTD_ps_enable; +- } ++ if (cParams->windowLog > 14) mode = ZSTD_ps_enable; + return mode; + } + + /* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +-static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; +@@ -248,7 +244,7 @@ static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, + + /* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ + static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. +@@ -257,16 +253,44 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ +-static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveEnableLdm(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_ParamSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_ParamSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -282,8 +306,12 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); + assert(cctxParams.ldmParams.hashRateLog < 32); + } +- cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); ++ cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +357,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. 
+ */
+-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel)
++static void
++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
++ const ZSTD_parameters* params,
++ int compressionLevel)
+ {
+ assert(!ZSTD_checkCParams(params->cParams));
+ ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
+@@ -343,10 +374,13 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
+ */
+ cctxParams->compressionLevel = compressionLevel;
+ cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
+- cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
++ cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, &params->cParams);
+ cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
+ DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
+- cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
++ cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm);
+ }
+
+ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+@@ -359,7 +393,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
+
+ /*
+ * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
+- * @param param Validated zstd parameters.
++ * @param params Validated zstd parameters.
+ */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +489,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -534,11 +568,16 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + ++ case ZSTD_c_blockSplitterLevel: ++ bounds.lowerBound = 0; ++ bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX; ++ return bounds; ++ + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; +@@ -549,6 +588,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_repcodeResolution: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +626,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -584,6 +644,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: ++ case ZSTD_c_blockSplitterLevel: + return 1; + + case ZSTD_c_format: +@@ -610,9 +671,13 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_repcodeResolution: + default: + return 0; + } +@@ -625,7 +690,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -665,9 +730,14 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: 
++ case ZSTD_c_blockSplitterLevel: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_repcodeResolution: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { +- const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ const ZSTD_ParamSwitch_e lcm = (ZSTD_ParamSwitch_e)value; ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : +- CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); ++ CCtxParams->ldmParams.enableLdm = (ZSTD_ParamSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ 
CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -843,28 +916,55 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_blockDelimiters: + BOUNDCHECK(ZSTD_c_blockDelimiters, value); +- CCtxParams->blockDelimiters = (ZSTD_sequenceFormat_e)value; ++ CCtxParams->blockDelimiters = (ZSTD_SequenceFormat_e)value; + return CCtxParams->blockDelimiters; + + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; + +- case ZSTD_c_useBlockSplitter: +- BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +- CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value; +- return CCtxParams->useBlockSplitter; ++ case ZSTD_c_splitAfterSequences: ++ BOUNDCHECK(ZSTD_c_splitAfterSequences, value); ++ CCtxParams->postBlockSplitter = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->postBlockSplitter; ++ ++ case ZSTD_c_blockSplitterLevel: ++ BOUNDCHECK(ZSTD_c_blockSplitterLevel, value); ++ CCtxParams->preBlockSplitter_level = value; ++ return (size_t)CCtxParams->preBlockSplitter_level; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); +- CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value; ++ CCtxParams->useRowMatchFinder = (ZSTD_ParamSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ assert(value>=0); ++ CCtxParams->maxBlockSize = (size_t)value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_repcodeResolution: ++ BOUNDCHECK(ZSTD_c_repcodeResolution, value); ++ 
CCtxParams->searchForExternalRepcodes = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -881,7 +981,7 @@ size_t ZSTD_CCtxParams_getParameter( + switch(param) + { + case ZSTD_c_format : +- *value = CCtxParams->format; ++ *value = (int)CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; +@@ -896,16 +996,16 @@ size_t ZSTD_CCtxParams_getParameter( + *value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : +- *value = CCtxParams->cParams.searchLog; ++ *value = (int)CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : +- *value = CCtxParams->cParams.minMatch; ++ *value = (int)CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : +- *value = CCtxParams->cParams.targetLength; ++ *value = (int)CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : +- *value = (unsigned)CCtxParams->cParams.strategy; ++ *value = (int)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; +@@ -920,10 +1020,10 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : +- *value = CCtxParams->attachDictPref; ++ *value = (int)CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : +- *value = CCtxParams->literalCompressionMode; ++ *value = (int)CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : + assert(CCtxParams->nbWorkers == 0); +@@ -939,19 +1039,19 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->enableDedicatedDictSearch; + break; + case ZSTD_c_enableLongDistanceMatching : +- *value = CCtxParams->ldmParams.enableLdm; ++ *value = (int)CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : +- *value = CCtxParams->ldmParams.hashLog; ++ *value = (int)CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : +- *value = CCtxParams->ldmParams.minMatchLength; ++ *value = (int)CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : +- *value = CCtxParams->ldmParams.bucketSizeLog; ++ *value = (int)CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : +- *value = CCtxParams->ldmParams.hashRateLog; ++ *value = (int)CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; +@@ -971,8 +1071,11 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_validateSequences : + *value = (int)CCtxParams->validateSequences; + break; +- case ZSTD_c_useBlockSplitter : +- *value = (int)CCtxParams->useBlockSplitter; ++ case ZSTD_c_splitAfterSequences : ++ *value = (int)CCtxParams->postBlockSplitter; ++ break; ++ case ZSTD_c_blockSplitterLevel : ++ *value = CCtxParams->preBlockSplitter_level; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; +@@ -980,6 +1083,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_repcodeResolution: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: 
RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1121,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, (int)cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, (int)cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, (int)cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, (int)cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, (int)cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, (int)cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, (int)cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1177,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1192,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/
++ assert(cctx->cdict == dl->cdict);
+ return 0;
+ }
+ assert(dl->dictSize > 0);
+@@ -1060,26 +1213,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+ }
+
+ size_t ZSTD_CCtx_loadDictionary_advanced(
+- ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
++ ZSTD_CCtx* cctx,
++ const void* dict, size_t dictSize,
++ ZSTD_dictLoadMethod_e dictLoadMethod,
++ ZSTD_dictContentType_e dictContentType)
+ {
+- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't load a dictionary when ctx is not in init stage.");
+ DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+- ZSTD_clearAllDicts(cctx); /* in case one already exists */
+- if (dict == NULL || dictSize == 0) /* no dictionary mode */
++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
++ "Can't load a dictionary when cctx is not in init stage.");
++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */
++ if (dict == NULL || dictSize == 0) /* no dictionary */
+ return 0;
+ if (dictLoadMethod == ZSTD_dlm_byRef) {
+ cctx->localDict.dict = dict;
+ } else {
++ /* copy dictionary content inside CCtx to own its lifetime */
+ void* dictBuffer;
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+- "no malloc for static CCtx");
++ "static CCtx can't allocate for an internal copy of dictionary");
+ dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
+- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
++ "allocation failed for dictionary content");
+ ZSTD_memcpy(dictBuffer, dict, dictSize);
+- cctx->localDict.dictBuffer = dictBuffer;
+- cctx->localDict.dict = dictBuffer;
++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */
++ cctx->localDict.dict = dictBuffer; /* read-only reference */
+ }
+ cctx->localDict.dictSize = dictSize;
+ cctx->localDict.dictContentType = dictContentType;
+@@ -1149,7 +1306,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't reset parameters only when not in init stage.");
++ "Reset parameters is only possible during init stage.");
+ ZSTD_clearAllDicts(cctx);
+ return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+ }
+@@ -1168,7 +1325,7 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+ BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog);
+ BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch);
+ BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength);
+- BOUNDCHECK(ZSTD_c_strategy, cParams.strategy);
++ BOUNDCHECK(ZSTD_c_strategy, (int)cParams.strategy);
+ return 0;
+ }
+
+@@ -1178,11 +1335,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+ static ZSTD_compressionParameters
+ ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+ {
+-# define CLAMP_TYPE(cParam, val, type) { \
+- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
+- if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
+- else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+- }
++# define CLAMP_TYPE(cParam, val, type) \
++ do { \
++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
++ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
++ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
++ } while (0)
+# define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+ CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+ CLAMP(ZSTD_c_chainLog, cParams.chainLog);
+@@ 
-1240,19 +1398,62 @@ static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. +- * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`. ++ * `mode` is the mode for parameter adjustment. See docs for `ZSTD_CParamMode_e`. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ + static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_CParamMode_e mode, ++ ZSTD_ParamSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1482,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1501,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. 
++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,11 +1547,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); ++static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + static void ZSTD_overrideCParams( + ZSTD_compressionParameters* cParams, +@@ -1330,24 +1567,25 @@ static void ZSTD_overrideCParams( + } + + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { +- srcSizeHint = CCtxParams->srcSizeHint; ++ assert(CCtxParams->srcSizeHint>=0); ++ srcSizeHint = (U64)CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t + ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, +- const ZSTD_paramSwitch_e useRowMatchFinder, +- const U32 enableDedicatedDictSearch, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, ++ const int enableDedicatedDictSearch, + const U32 forCCtx) + { + /* chain table size should be 0 for fast or row-hash strategies */ +@@ -1363,14 +1601,14 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + hSize * sizeof(U32) + + h3Size * sizeof(U32); + size_t const optPotentialSpace = +- 
ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32))
+- + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32))
+- + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32))
+- + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32))
+- + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
+- + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
++ ZSTD_cwksp_aligned64_alloc_size((MaxML+1) * sizeof(U32))
++ + ZSTD_cwksp_aligned64_alloc_size((MaxLL+1) * sizeof(U32))
++ + ZSTD_cwksp_aligned64_alloc_size((MaxOff+1) * sizeof(U32))
++ + ZSTD_cwksp_aligned64_alloc_size((1<<Litbits) * sizeof(U32))
++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t))
++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
+ size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
+- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
++ ? ZSTD_cwksp_aligned64_alloc_size(hSize)
+ : 0;
+ size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
+ ? optPotentialSpace
+@@ -1386,30 +1624,38 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+ return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
+ }
+
++/* Helper function for calculating memory requirements.
++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4;
++ return blockSize / divider;
++}
++
+ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ const ZSTD_compressionParameters* cParams,
+ const ldmParams_t* ldmParams,
+ const int isStatic,
+- const ZSTD_paramSwitch_e useRowMatchFinder,
++ const ZSTD_ParamSwitch_e useRowMatchFinder,
+ const size_t buffInSize,
+ const size_t buffOutSize,
+- const U64 pledgedSrcSize)
++ const U64 pledgedSrcSize,
++ int useSequenceProducer,
++ size_t maxBlockSize)
+ {
+ size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+- U32 const divider = (cParams->minMatch==3) ? 3 : 4;
+- size_t const maxNbSeq = blockSize / divider;
++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
+ size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+- + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
++ + ZSTD_cwksp_aligned64_alloc_size(maxNbSeq * sizeof(SeqDef))
+ + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+- size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE);
++ size_t const tmpWorkSpace = ZSTD_cwksp_alloc_size(TMP_WORKSPACE_SIZE);
+ size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
+ size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1);
+
+ size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams);
+ size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize);
+ size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ?
+- ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
++ ZSTD_cwksp_aligned64_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
+
+
+ size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize)
+@@ -1417,15 +1663,21 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+
+ size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
+
++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
++ size_t const externalSeqSpace = useSequenceProducer
++ ? 
ZSTD_cwksp_aligned64_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
++ : 0;
++
+ size_t const neededSpace =
+ cctxSpace +
+- entropySpace +
++ tmpWorkSpace +
+ blockStateSpace +
+ ldmSpace +
+ ldmSeqSpace +
+ matchStateSize +
+ tokenSpace +
+- bufferSpace;
++ bufferSpace +
++ externalSeqSpace;
+
+ DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
+ return neededSpace;
+@@ -1435,7 +1687,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+ {
+ ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
+ &cParams);
+
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+@@ -1443,7 +1695,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+ * be needed. However, we still allocate two 0-sized buffers, which can
+ * take space under ASAN. */
+ return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+- &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
++ &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+ }
+
+ size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+@@ -1493,18 +1745,18 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+ { ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
+ size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
+ ? ((size_t)1 << cParams.windowLog) + blockSize
+ : 0;
+ size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered)
+ ? ZSTD_compressBound(blockSize) + 1
+ : 0;
+- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
+
+ return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
+- ZSTD_CONTENTSIZE_UNKNOWN);
++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+ }
+ }
+
+@@ -1600,7 +1852,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
+ * Invalidate all the matches in the match finder tables.
+ * Requires nextSrc and base to be set (can be NULL). 
+ */
+-static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms)
++static void ZSTD_invalidateMatchState(ZSTD_MatchState_t* ms)
+ {
+ ZSTD_window_clear(&ms->window);
+
+@@ -1637,12 +1889,25 @@ typedef enum {
+ ZSTD_resetTarget_CCtx
+ } ZSTD_resetTarget_e;
+
++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */
++static U64 ZSTD_bitmix(U64 val, U64 len) {
++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
++ val *= 0x9FB21C651E98DF25ULL;
++ val ^= (val >> 35) + len ;
++ val *= 0x9FB21C651E98DF25ULL;
++ return val ^ (val >> 28);
++}
++
++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
++static void ZSTD_advanceHashSalt(ZSTD_MatchState_t* ms) {
++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
++}
+
+ static size_t
+-ZSTD_reset_matchState(ZSTD_matchState_t* ms,
++ZSTD_reset_matchState(ZSTD_MatchState_t* ms,
+ ZSTD_cwksp* ws,
+ const ZSTD_compressionParameters* cParams,
+- const ZSTD_paramSwitch_e useRowMatchFinder,
++ const ZSTD_ParamSwitch_e useRowMatchFinder,
+ const ZSTD_compResetPolicy_e crp,
+ const ZSTD_indexResetPolicy_e forceResetIndex,
+ const ZSTD_resetTarget_e forWho)
+@@ -1664,6 +1929,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+
+ ms->hashLog3 = hashLog3;
++ ms->lazySkipping = 0;
+
+ ZSTD_invalidateMatchState(ms);
+
+@@ -1685,22 +1951,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ ZSTD_cwksp_clean_tables(ws);
+ }
+
+- /* opt parser space */
+- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+- DEBUGLOG(4, "reserving optimal parser space");
+- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+- ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
+- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+- }
+-
+ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+- { /* Row match finder needs an additional table of hashes ("tags") */
+- size_t const tagTableSize = hSize*sizeof(U16);
+- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ /* Row match finder needs an additional table of hashes ("tags") */
++ size_t const tagTableSize = hSize;
++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use
++ * 0 when we reset a Cdict */
++ if(forWho == ZSTD_resetTarget_CCtx) {
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
++ ZSTD_advanceHashSalt(ms);
++ } else {
++ /* When we are not salting we want to always memset the memory */
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned64(ws, tagTableSize);
++ ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ ms->hashSalt = 0;
+ }
+ { /* Switch to 32-entry rows if searchLog is 5 (or more) */
+ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+@@ -1709,6 +1972,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+ }
+
++ /* opt parser space */
++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
++ DEBUGLOG(4, "reserving optimal parser space");
++ ms->opt.litFreq = 
(unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (1<<Litbits) * sizeof(unsigned));
++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxLL+1) * sizeof(unsigned));
++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxML+1) * sizeof(unsigned));
++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxOff+1) * sizeof(unsigned));
++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t));
++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
++ }
++
+ ms->cParams = *cParams;
+
+ RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+@@ -1754,7 +2028,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ {
+ ZSTD_cwksp* const ws = &zc->workspace;
+ DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
+- (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
++ (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter);
+ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+
+ zc->isFirstBlock = 1;
+@@ -1766,8 +2040,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ params = &zc->appliedParams;
+
+ assert(params->useRowMatchFinder != ZSTD_ps_auto);
+- assert(params->useBlockSplitter != ZSTD_ps_auto);
++ assert(params->postBlockSplitter != ZSTD_ps_auto);
+ assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
++ assert(params->maxBlockSize != 0);
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+ /* Adjust long distance matching parameters */
+ ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
+@@ -1776,9 +2051,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ }
+
+ { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4;
+- size_t const maxNbSeq = blockSize / divider;
++ size_t const blockSize = MIN(params->maxBlockSize, windowSize);
++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params));
+ size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
+ ? 
ZSTD_compressBound(blockSize) + 1
+ : 0;
+@@ -1795,8 +2069,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ size_t const neededSpace =
+ ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
+- buffInSize, buffOutSize, pledgedSrcSize);
+- int resizeWorkspace;
++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+
+ FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
+
+@@ -1805,7 +2078,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ { /* Check if workspace is large enough, alloc a new one if needed */
+ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
+ int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
+- resizeWorkspace = workspaceTooSmall || workspaceWasteful;
++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful;
+ DEBUGLOG(4, "Need %zu B workspace", neededSpace);
+ DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+
+@@ -1823,21 +2096,23 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+
+ DEBUGLOG(5, "reserving object space");
+ /* Statically sized space.
+- * entropyWorkspace never moves,
++ * tmpWorkspace never moves,
+ * though prev/next block swap places */
+ assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t)));
+ zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+ RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock");
+ zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+ RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
+- zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE);
+- RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
++ zc->tmpWorkspace = ZSTD_cwksp_reserve_object(ws, TMP_WORKSPACE_SIZE);
++ RETURN_ERROR_IF(zc->tmpWorkspace == NULL, memory_allocation, "couldn't allocate tmpWorkspace");
++ zc->tmpWkspSize = TMP_WORKSPACE_SIZE;
+ } }
+
+ ZSTD_cwksp_clear(ws);
+
+ /* init params */
+ zc->blockState.matchState.cParams = params->cParams;
++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
+ zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+ zc->consumedSrcSize = 0;
+ zc->producedCSize = 0;
+@@ -1845,7 +2120,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ zc->appliedParams.fParams.contentSizeFlag = 0;
+ DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+ (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
+- zc->blockSize = blockSize;
++ zc->blockSizeMax = blockSize;
+
+ xxh64_reset(&zc->xxhState, 0);
+ zc->stage = ZSTDcs_init;
+@@ -1854,13 +2129,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+
+ ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
++ FORWARD_IF_ERROR(ZSTD_reset_matchState(
++ &zc->blockState.matchState,
++ ws,
++ &params->cParams,
++ params->useRowMatchFinder,
++ crp,
++ needsIndexReset,
++ ZSTD_resetTarget_CCtx), "");
++
++ zc->seqStore.sequencesStart = (SeqDef*)ZSTD_cwksp_reserve_aligned64(ws, maxNbSeq * sizeof(SeqDef));
++
++ /* ldm hash table */
++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
++ /* TODO: avoid memset? 
*/
++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned64(ws, ldmHSize * sizeof(ldmEntry_t));
++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned64(ws, maxNbLdmSeq * sizeof(rawSeq));
++ zc->maxNbLdmSequences = maxNbLdmSeq;
++
++ ZSTD_window_init(&zc->ldmState.window);
++ zc->ldmState.loadedDictEnd = 0;
++ }
++
++ /* reserve space for block-level external sequences */
++ if (ZSTD_hasExtSeqProd(params)) {
++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
++ zc->extSeqBufCapacity = maxNbExternalSeq;
++ zc->extSeqBuf =
++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned64(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
++ }
++
++ /* buffers */
++
+ /* ZSTD_wildcopy() is used to copy into the literals buffer,
+ * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+ */
+ zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
+ zc->seqStore.maxNbLit = blockSize;
+
+- /* buffers */
+ zc->bufferedPolicy = zbuff;
+ zc->inBuffSize = buffInSize;
+ zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
+@@ -1883,32 +2191,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+-
+- FORWARD_IF_ERROR(ZSTD_reset_matchState(
+- &zc->blockState.matchState,
+- ws,
+- &params->cParams,
+- params->useRowMatchFinder,
+- crp,
+- needsIndexReset,
+- ZSTD_resetTarget_CCtx), "");
+-
+- /* ldm hash table */
+- if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+- /* TODO: avoid memset? */
+- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+- zc->maxNbLdmSequences = maxNbLdmSeq;
+-
+- ZSTD_window_init(&zc->ldmState.window);
+- zc->ldmState.loadedDictEnd = 0;
+- }
+
+ DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));
+
+ zc->initialized = 1;
+
+@@ -1980,7 +2265,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+ }
+
+ params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
+- cdict->dictContentSize, ZSTD_cpm_attachDict);
++ cdict->dictContentSize, ZSTD_cpm_attachDict,
++ params.useRowMatchFinder);
+ params.cParams.windowLog = windowLog;
+ params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */
+ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+@@ -2019,6 +2305,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+ return 0;
+ }
+
++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
++ ZSTD_compressionParameters const* cParams) {
++ if (ZSTD_CDictIndicesAreTagged(cParams)){
++ /* Remove tags from the CDict table if they are present. 
++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,26 +2356,29 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + + /* Zero the hashTable3, since the cdict never fills it */ +- { int const h3log = cctx->blockState.matchState.hashLog3; ++ assert(cctx->blockState.matchState.hashLog3 <= 31); ++ { U32 const h3log = cctx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + assert(cdict->matchState.hashLog3 == 0); + ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); +@@ -2082,8 +2387,8 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + ZSTD_cwksp_mark_tables_clean(&cctx->workspace); + + /* copy dictionary offsets */ +- { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; +- ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; ++ { ZSTD_MatchState_t const* srcMatchState = &cdict->matchState; ++ ZSTD_MatchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2141,12 +2446,13 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + /* Copy only compression parameters related to tables. 
*/ + params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); +- assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto); ++ assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; +- params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; ++ params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2166,7 +2472,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; +- int const h3log = srcCCtx->blockState.matchState.hashLog3; ++ U32 const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, +@@ -2184,8 +2490,8 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + + /* copy dictionary offsets */ + { +- const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; +- ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; ++ const ZSTD_MatchState_t* srcMatchState = &srcCCtx->blockState.matchState; ++ ZSTD_MatchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2234,7 +2540,7 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ +- assert(size < (1U<<31)); /* can be casted to int */ ++ assert(size < (1U<<31)); /* can be cast to int */ + + + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { +@@ -2267,7 +2573,7 @@ static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const + + /*! 
ZSTD_reduceIndex() : + * rescale all indexes to avoid future overflow (indexes are U32) */ +-static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) ++static void ZSTD_reduceIndex (ZSTD_MatchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) + { + { U32 const hSize = (U32)1 << params->cParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); +@@ -2294,26 +2600,32 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr) + { +- const seqDef* const sequences = seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; + BYTE* const ofCodeTable = seqStorePtr->ofCode; + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2333,9 +2645,9 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) + * Returns 1 if true, 0 otherwise. */ + static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) + { +- DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter); +- assert(cctxParams->useBlockSplitter != ZSTD_ps_auto); +- return (cctxParams->useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter); ++ assert(cctxParams->postBlockSplitter != ZSTD_ps_auto); ++ return (cctxParams->postBlockSplitter == ZSTD_ps_enable); + } + + /* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types +@@ -2347,6 +2659,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2670,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const SeqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2690,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2392,7 +2707,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, ++ CTable_LitLength, LLFSELog, (SymbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, +@@ -2413,7 +2728,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ +- ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; ++ ZSTD_DefaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, +@@ -2424,7 +2739,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, ++ CTable_OffsetBits, OffFSELog, (SymbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, +@@ -2454,7 +2769,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, ++ CTable_MatchLength, MLFSELog, (SymbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, +@@ -2480,22 +2795,23 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; +- const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2819,28 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); 
+ + /* Compress literals */ +- { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ { size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ +- unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); +- size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ int const suspectUncompressible = (numSequences == 0) || (litSize / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2866,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2878,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2597,104 +2912,146 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + return (size_t)(op - ostart); + } + +-MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++static size_t ++ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ size_t blockSize, ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( +- seqStorePtr, prevEntropy, nextEntropy, cctxParams, + dst, dstCapacity, ++ literals, litSize, ++ seqStorePtr, prevEntropy, nextEntropy, cctxParams, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (blockSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ +- { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); ++ { size_t const maxCSize = blockSize - ZSTD_minGain(blockSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + ++static size_t ++ZSTD_entropyCompressSeqStore( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) ++{ ++ return ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ dst, dstCapacity, ++ seqStorePtr->litStart, (size_t)(seqStorePtr->lit - seqStorePtr->litStart), ++ srcSize, ++ seqStorePtr, ++ prevEntropy, nextEntropy, ++ cctxParams, ++ entropyWorkspace, entropyWkspSize, ++ bmi2); ++} ++ + /* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) + { +- static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { ++ static const ZSTD_BlockCompressor_f blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ 
ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, + NULL } + }; +- ZSTD_blockCompressor selectedCompressor; ++ ZSTD_BlockCompressor_f selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); +- DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); ++ DEBUGLOG(5, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { +- static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ static const ZSTD_BlockCompressor_f rowBasedBlockCompressors[4][3] = { ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; +- DEBUGLOG(4, "Selecting a row-based matchfinder"); ++ DEBUGLOG(5, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { +@@ -2704,30 +3061,126 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + return selectedCompressor; + } + +-static void 
ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, ++static void ZSTD_storeLastLiterals(SeqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) + { + ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr) ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr) + { + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthType = ZSTD_llt_none; + } + +-typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ ++/* ++ * Function to validate sequences produced by a block compressor. ++ */ ++static void ZSTD_validateSeqStore(const SeqStore_t* seqStore, const ZSTD_compressionParameters* cParams) ++{ ++#if DEBUGLEVEL >= 1 ++ const SeqDef* seq = seqStore->sequencesStart; ++ const SeqDef* const seqEnd = seqStore->sequences; ++ size_t const matchLenLowerBound = cParams->minMatch == 3 ? 
3 : 4; ++ for (; seq < seqEnd; ++seq) { ++ const ZSTD_SequenceLength seqLength = ZSTD_getSequenceLength(seqStore, seq); ++ assert(seqLength.matchLength >= matchLenLowerBound); ++ (void)seqLength; ++ (void)matchLenLowerBound; ++ } ++#else ++ (void)seqStore; ++ (void)cParams; ++#endif ++} ++ ++static size_t ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_BuildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + { +- ZSTD_matchState_t* const ms = &zc->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3216,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2772,7 +3234,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { +- rawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ RawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; +@@ -2788,42 +3258,116 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_SequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_transferSequences_wBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_BlockCompressor_f const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } ++ ZSTD_validateSeqStore(&zc->seqStore, &zc->appliedParams.cParams); + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const SeqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; ++ const SeqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = (size_t)(seqStore->sequences - inSeqs); ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ Repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; +- +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. 
++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,46 +3376,75 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. */ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) + { + const size_t dstCapacity = ZSTD_compressBound(srcSize); +- void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); ++ void* dst; /* Make C90 happy. 
*/ + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + ++ dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + + seqCollector.collectSequences = 1; +@@ -2880,8 +3453,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); ++ } ++ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3487,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2930,7 +3505,7 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +-static int ZSTD_maybeRLE(seqStore_t const* seqStore) ++static int ZSTD_maybeRLE(SeqStore_t const* seqStore) + { + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); +@@ -2938,7 +3513,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,12 +3522,14 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? 
+ lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); +- DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); ++ DEBUGLOG(5, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); + } + + /* ZSTD_buildBlockEntropyStats_literals() : +@@ -2959,13 +3537,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3554,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3571,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; +- } ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; ++ } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3651,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3664,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3103,9 +3691,9 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); +- fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; +- fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; +- fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; ++ fseMetadata->llType = (SymbolEncodingType_e) stats.LLtype; ++ fseMetadata->ofType = (SymbolEncodingType_e) stats.Offtype; ++ fseMetadata->mlType = (SymbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; + } +@@ -3114,23 +3702,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3736,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3763,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(SymbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,116 +3801,121 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const 
ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(SeqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->tmpWorkspace, zc->tmpWkspSize), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->tmpWorkspace, zc->tmpWkspSize, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const SeqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const SeqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + + /* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. 
+ */ +-static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, +- const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++static void ZSTD_deriveSeqStoreChunk(SeqStore_t* resultSeqStore, ++ const SeqStore_t* originalSeqStore, ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ +@@ -3328,13 +3928,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3941,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). 
++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3976,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(Repcodes_t* const dRepcodes, Repcodes_t* const cRepcodes, ++ const SeqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { +- seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ SeqDef* const seq = seqStore->sequencesStart + idx; ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +4012,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. 
+ */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, +- repcodes_t* const dRep, repcodes_t* const cRep, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const SeqStore_t* const seqStore, ++ Repcodes_t* const dRep, Repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3417,7 +4026,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ +- repcodes_t const dRepOriginal = *dRep; ++ Repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); +@@ -3428,7 +4037,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + +@@ -3442,8 +4051,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3451,18 +4061,18 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) +@@ -3481,45 +4091,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. 
++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, +- ZSTD_CCtx* zc, const seqStore_t* origSeqStore) ++ ZSTD_CCtx* zc, const SeqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ SeqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ SeqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ SeqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4141,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end 
+ } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). ++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4168,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ SeqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ SeqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3577,36 +4197,37 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
+ */ +- repcodes_t dRep; +- repcodes_t cRep; +- ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); ++ Repcodes_t dRep; ++ Repcodes_t cRep; ++ ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memset(nextSeqStore, 0, sizeof(SeqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSizeMax + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4242,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,12 +4251,12 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSizeMax + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ +- ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(Repcodes_t)); + return cSize; + } + +@@ -3643,21 +4265,20 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +- assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock"); ++ assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); +@@ -3673,9 +4294,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4308,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3702,7 +4327,7 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + &zc->appliedParams, + dst, dstCapacity, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && +@@ -3767,10 +4392,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. 
+ */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4404,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3807,7 +4433,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + return cSize; + } + +-static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, ++static void ZSTD_overflowCorrectIfNeeded(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, +@@ -3831,39 +4457,82 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + } + } + ++#include "zstd_preSplit.h" ++ ++static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings) ++{ ++ /* split level based on compression strategy, from `fast` to `btultra2` */ ++ static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 }; ++ /* note: conservatively only split full blocks (128 KB) currently. ++ * While it's possible to go lower, let's keep it simple for a first implementation. ++ * Besides, benefits of splitting are reduced when blocks are already small. ++ */ ++ if (srcSize < 128 KB || blockSizeMax < 128 KB) ++ return MIN(srcSize, blockSizeMax); ++ /* do not split incompressible data though: ++ * require verified savings to allow pre-splitting. ++ * Note: as a consequence, the first full block is not split. ++ */ ++ if (savings < 3) { ++ DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings); ++ return 128 KB; ++ } ++ /* apply @splitLevel, or use default value (which depends on @strat). ++ * note that splitting heuristic is still conditioned by @savings >= 3, ++ * so the first block will not reach this code path */ ++ if (splitLevel == 1) return 128 KB; ++ if (splitLevel == 0) { ++ assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2); ++ splitLevel = splitLevels[strat]; ++ } else { ++ assert(2 <= splitLevel && splitLevel <= 6); ++ splitLevel -= 2; ++ } ++ return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize); ++} ++ + /*! ZSTD_compress_frameChunk() : + * Compress a chunk of data into one or multiple blocks. + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. 
+ * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastFrameChunk) + { +- size_t blockSize = cctx->blockSize; ++ size_t blockSizeMax = cctx->blockSizeMax; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; ++ S64 savings = (S64)cctx->consumedSrcSize - (S64)cctx->producedCSize; + + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + +- DEBUGLOG(4, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); ++ DEBUGLOG(5, "ZSTD_compress_frameChunk (srcSize=%u, blockSizeMax=%u)", (unsigned)srcSize, (unsigned)blockSizeMax); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + xxh64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; +- U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); +- +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; ++ size_t const blockSize = ZSTD_optimalBlockSize(cctx, ++ ip, remaining, ++ blockSizeMax, ++ cctx->appliedParams.preBlockSplitter_level, ++ cctx->appliedParams.cParams.strategy, ++ savings); ++ U32 const lastBlock = lastFrameChunk & (blockSize == remaining); ++ assert(blockSize <= remaining); ++ ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); +- if (remaining < blockSize) blockSize = remaining; + + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); +@@ -3899,8 +4568,23 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } +- ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ ++ ++ /* @savings is employed to ensure that splitting doesn't worsen expansion of incompressible data. ++ * Without splitting, the maximum expansion is 3 bytes per full block. ++ * An adversarial input could attempt to fudge the split detector, ++ * and make it split incompressible data, resulting in more block headers. ++ * Note that, since ZSTD_COMPRESSBOUND() assumes a worst case scenario of 1KB per block, ++ * and the splitter never creates blocks that small (current lower limit is 8 KB), ++ * there is already no risk to expand beyond ZSTD_COMPRESSBOUND() limit. ++ * But if the goal is to not expand by more than 3-bytes per 128 KB full block, ++ * then yes, it becomes possible to make the block splitter oversplit incompressible data. ++ * Using @savings, we enforce an even more conservative condition, ++ * requiring the presence of enough savings (at least 3 bytes) to authorize splitting, ++ * otherwise only full blocks are used. 
++ * But being conservative is fine, ++ * since splitting barely compressible blocks is not fruitful anyway */ ++ savings += (S64)blockSize - (S64)cSize; + + ip += blockSize; + assert(remaining >= blockSize); +@@ -3919,8 +4603,10 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + + + static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, +- const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) +-{ BYTE* const op = (BYTE*)dst; ++ const ZSTD_CCtx_params* params, ++ U64 pledgedSrcSize, U32 dictID) ++{ ++ BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; +@@ -4001,19 +4687,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4022,7 +4704,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + const void* src, size_t srcSize, + U32 frame, U32 lastFrameChunk) + { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; + size_t fhSize = 0; + + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", +@@ -4057,7 +4739,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + src, (BYTE const*)src + srcSize); + } + +- DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize); ++ DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSizeMax); + { size_t const cSize = frame ? 
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); +@@ -4078,58 +4760,90 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! 
ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +-static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, +- ldmState_t* ls, +- ZSTD_cwksp* ws, +- ZSTD_CCtx_params const* params, +- const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++static size_t ++ZSTD_loadDictionaryContent(ZSTD_MatchState_t* ms, ++ ldmState_t* ls, ++ ZSTD_cwksp* ws, ++ ZSTD_CCtx_params const* params, ++ const void* src, size_t srcSize, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4852,59 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. */ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: Trigger loadLdmDict"); + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: ZSTD_ldm_fillHashTable completes"); + } + ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ { U32 maxDictSize = 1U << MIN(MAX(params->cParams.hashLog + 3, params->cParams.chainLog + 1), 31); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } ++ } ++ ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4912,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4921,24 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); ++ DEBUGLOG(4, "Fill %u bytes into the Binary Tree", (unsigned)srcSize); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + default: +@@ -4233,20 +4981,19 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, +- dictEnd-dictPtr, &hasZeroWeights); ++ (size_t)(dictEnd-dictPtr), &hasZeroWeights); + + /* We only set the loaded table as valid if it contains all non-zero + * weights. 
Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + + { unsigned offcodeLog; +- size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); ++ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* fill all offset symbols to avoid garbage at end of table */ +@@ -4261,7 +5008,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; +- size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4275,7 +5022,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; +- size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4309,7 +5056,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + +- return dictPtr - (const BYTE*)dict; ++ return (size_t)(dictPtr - (const BYTE*)dict); + } + + /* Dictionary format : +@@ -4322,11 +5069,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + * dictSize supposed >= 8 + */ + static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +5093,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4354,13 +5102,14 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + * @return : dictID, or an error code */ + static size_t + ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +- 
ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + const ZSTD_CCtx_params* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +5122,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5136,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5176,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->tmpWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5221,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5233,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5252,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5268,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4528,7 +5284,7 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ +- return op-ostart; ++ return (size_t)(op-ostart); + } + + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +@@ -4537,9 +5293,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5319,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5355,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, 
dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5473,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4719,14 +5483,16 @@ static size_t ZSTD_initCDict_internal( + return 0; + } + +-static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, +- ZSTD_compressionParameters cParams, +- ZSTD_paramSwitch_e useRowMatchFinder, +- U32 enableDedicatedDictSearch, +- ZSTD_customMem customMem) ++static ZSTD_CDict* ++ZSTD_createCDict_advanced_internal(size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_compressionParameters cParams, ++ ZSTD_ParamSwitch_e useRowMatchFinder, ++ int enableDedicatedDictSearch, ++ ZSTD_customMem customMem) + { + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; ++ DEBUGLOG(3, "ZSTD_createCDict_advanced_internal (dictSize=%u)", (unsigned)dictSize); + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + +@@ -4763,6 +5529,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + { + ZSTD_CCtx_params cctxParams; + ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + ZSTD_CCtxParams_init(&cctxParams, 0); + cctxParams.cParams = cParams; + cctxParams.customMem = customMem; +@@ -4783,7 +5550,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + ZSTD_compressionParameters cParams; + ZSTD_CDict* cdict; + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2, mode %u", (unsigned)dictContentType); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + if (cctxParams.enableDedicatedDictSearch) { +@@ -4802,7 +5569,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2: DedicatedDictSearch=%u", cctxParams.enableDedicatedDictSearch); + cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + +@@ -4813,7 +5580,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + if (!cdict) + return NULL; + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4867,7 +5634,7 @@ size_t ZSTD_freeCDict(ZSTD_CDict* cdict) + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level +- * into its relevants cParams. ++ * into its relevant cParams. + * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. 
+@@ -4879,7 +5646,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) + { +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +@@ -4890,6 +5657,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_CDict* cdict; + ZSTD_CCtx_params params; + ++ DEBUGLOG(4, "ZSTD_initStaticCDict (dictSize==%u)", (unsigned)dictSize); + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { +@@ -4900,14 +5668,13 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + +- DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", +- (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + ZSTD_CCtxParams_init(¶ms, 0); + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4987,12 +5754,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +@@ -5002,7 +5774,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! 
ZSTD_compress_usingCDict_advanced(): +@@ -5068,7 +5840,7 @@ size_t ZSTD_CStreamOutSize(void) + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; + } + +-static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) ++static ZSTD_CParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) + { + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; +@@ -5199,30 +5971,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSizeMax - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSizeMax; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5231,8 +6014,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5243,12 +6028,13 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + + case zcss_load: + if ( (flushMode == ZSTD_e_end) +- && ( (size_t)(oend-op) >= ZSTD_compressBound(iend-ip) /* Enough output space */ ++ && ( (size_t)(oend-op) >= ZSTD_compressBound((size_t)(iend-ip)) /* Enough output space */ + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, +- op, oend-op, ip, iend-ip); ++ size_t const cSize = ZSTD_compressEnd_public(zcs, ++ op, (size_t)(oend-op), ++ ip, (size_t)(iend-ip)); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); + ip = iend; +@@ -5262,10 +6048,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, +- ip, iend-ip); ++ ip, (size_t)(iend-ip)); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5276,16 +6061,29 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSizeMax) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); + { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); + void* cDst; + size_t cSize; +- size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t oSize = (size_t)(oend-op); ++ size_t const iSize = inputBuffered ? 
zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSizeMax); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5293,34 +6091,31 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ +- zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; ++ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSizeMax; + if (zcs->inBuffTarget > zcs->inBuffSize) +- zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; ++ zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSizeMax; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5369,8 +6164,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + } + } + +- input->pos = ip - istart; +- output->pos = op - ostart; ++ input->pos = (size_t)(ip - istart); ++ output->pos = (size_t)(op - ostart); + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); + } +@@ -5390,8 +6185,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. 
+ */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5410,22 +6207,27 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + ++/* ++ * If @endOp == ZSTD_e_end, @inSize becomes pledgedSrcSize. ++ * Otherwise, it's ignored. ++ * @return: 0 on success, or a ZSTD_error code otherwise. ++ */ + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ +@@ -5438,21 +6240,24 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } +- DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ DEBUGLOG(4, "ZSTD_CCtx_init_compressStream2 : transparent init stage"); ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); +- ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); ++ ZSTD_CParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } + +- params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); ++ params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5468,7 +6273,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + /* for small input: avoid automatic flush on reaching end of block, since + * it would require to add a 3-bytes null block to end frame + */ +- cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize); ++ cctx->inBuffTarget = cctx->blockSizeMax + (cctx->blockSizeMax == pledgedSrcSize); + } else { + cctx->inBuffTarget = 0; + } +@@ -5479,6 +6284,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5493,8 +6300,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait 
for the first block of flush() order, for better parameters adaptation */ ++ return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5512,13 +6338,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5541,6 +6374,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5551,64 +6385,67 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : +- * @offCode : is presumed to follow format required by ZSTD_storeSeq() ++ * @offBase : must use the format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offBase, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offBase > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++/* This function scans through an array of ZSTD_Sequence, ++ * storing the sequences it reads, until it reaches a block delimiter. ++ * Note that the block delimiter includes the last literals of the block. ++ * @blockSize must be == sum(sequence_lengths). ++ * @returns @blockSize on success, and a ZSTD_error otherwise. + */ + static size_t +-ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +- ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; +- repcodes_t updatedRepcodes; ++ Repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_transferSequences_wBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5616,27 +6453,60 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + } else { + dictSize = 0; + } +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = 
ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, ++ seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ++ ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found."); ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + if (inSeqs[idx].litLength) { + DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); +@@ -5644,37 +6514,43 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; +- return 0; ++ return blockSize; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. ++/* ++ * This function attempts to scan through @blockSize bytes in @src ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. + * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. ++ * Occasionally, we may want to reduce the actual number of bytes consumed from @src ++ * to avoid splitting a match, notably if it would produce a match smaller than MINMATCH. 
+ * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ * @returns the number of bytes consumed from @src, necessarily <= @blockSize. ++ * Otherwise, it may return a ZSTD error if something went wrong. + */ + static size_t +-ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ZSTD_transferSequences_noDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; + U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; + size_t dictSize; +- BYTE const* ip = (BYTE const*)(src); +- BYTE const* iend = ip + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ +- repcodes_t updatedRepcodes; ++ const BYTE* const istart = (const BYTE*)(src); ++ const BYTE* ip = istart; ++ const BYTE* iend = istart + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ ++ Repcodes_t updatedRepcodes; + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5682,15 +6558,15 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_transferSequences_noDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { + const ZSTD_Sequence currSeq = inSeqs[idx]; + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5704,7 +6580,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. 
So, we have to split the sequence */ +@@ -5744,58 +6619,113 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); + seqPos->idx = idx; + seqPos->posInSequence = endPosInSequence; +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + iend -= bytesAdjustment; + if (ip != iend) { + /* Store any last literals */ +- U32 lastLLSize = (U32)(iend - ip); ++ U32 const lastLLSize = (U32)(iend - ip); + assert(ip <= iend); + DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize); + seqPos->posInSrc += lastLLSize; + } + +- return bytesAdjustment; ++ return (size_t)(iend-istart); + } + +-typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); +-static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) ++/* @seqPos represents a position within @inSeqs, ++ * it is read and updated by this function, ++ * once the goal to produce a block of size @blockSize is reached. ++ * @return: nb of bytes consumed from @src, necessarily <= @blockSize. 
++ */ ++typedef size_t (*ZSTD_SequenceCopier_f)(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++static ZSTD_SequenceCopier_f ZSTD_selectSequenceCopier(ZSTD_SequenceFormat_e mode) + { +- ZSTD_sequenceCopier sequenceCopier = NULL; +- assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, (int)mode)); + if (mode == ZSTD_sf_explicitBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreExplicitBlockDelim; +- } else if (mode == ZSTD_sf_noBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreNoBlockDelim; ++ return ZSTD_transferSequences_wBlockDelim; ++ } ++ assert(mode == ZSTD_sf_noBlockDelimiters); ++ return ZSTD_transferSequences_noDelim; ++} ++ ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; + } +- assert(sequenceCopier != NULL); +- return sequenceCopier; ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; + } + +-/* Compress, block-by-block, all of the sequences given. ++static size_t determine_blockSize(ZSTD_SequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ ZSTD_SequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) { ++ /* Note: more a "target" block size */ ++ return MIN(remaining, blockSize); ++ } ++ assert(mode == ZSTD_sf_explicitBlockDelimiters); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ ++/* Compress all provided sequences, block-by-block. + * + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error. 
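For orientation between hunks: the explicit-delimiter convention enforced above (each block's sequences end with an entry whose offset and matchLength are 0, and whose litLength carries the block's trailing literals) is what callers of ZSTD_compressSequences() are expected to supply. A hedged sketch with made-up values follows; cctx, dst, dstCapacity and src are assumed to exist, and this is illustration only, not part of the patch.

    /* src = "abc", then 15 bytes repeating it at offset 3, then 2 trailing literals => srcSize = 20 */
    const ZSTD_Sequence seqs[2] = {
        { 3, 3, 15, 0 },   /* offset=3, litLength=3, matchLength=15, rep unused */
        { 0, 2,  0, 0 },   /* block delimiter: offset==0, matchLength==0, litLength = last literals */
    };
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters);
    {   size_t const cSize = ZSTD_compressSequences(cctx, dst, dstCapacity,
                                                    seqs, 2, src, 20);
        if (ZSTD_isError(cSize)) { /* e.g. sequences do not exactly cover srcSize */ }
    }

The sum of litLength + matchLength over a block, including the delimiter's literals, must equal the block size; that is exactly what blockSize_explicitDelimiter() above recomputes and validates before compression proceeds.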
+@@ -5807,15 +6737,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; +- ZSTD_sequencePosition seqPos = {0, 0, 0}; ++ ZSTD_SequencePosition seqPos = {0, 0, 0}; + +- BYTE const* ip = (BYTE const*)src; ++ const BYTE* ip = (BYTE const*)src; + BYTE* op = (BYTE*)dst; +- ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); ++ ZSTD_SequenceCopier_f const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + + DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); + /* Special case: empty frame */ +@@ -5829,22 +6756,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; +- size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSizeMax, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); +- FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); +- blockSize -= additionalByteAdjustment; ++ blockSize = sequenceCopier(cctx, ++ &seqPos, inSeqs, inSeqsSize, ++ ip, blockSize, ++ cctx->appliedParams.searchForExternalRepcodes); ++ FORWARD_IF_ERROR(blockSize, "Bad sequence copy"); + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. 
We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small (%zu): data remains uncompressed: cSize=%zu", blockSize, cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5853,35 +6787,36 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + blockSize, +- cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { +- /* We don't want to emit our first block as a RLE even if it qualifies because +- * doing so will cause the decoder (cli only) to throw a "should consume all input error." +- * This is only an issue for zstd <= v1.4.3 +- */ ++ ZSTD_isRLE(ip, blockSize)) { ++ /* Note: don't emit the first block as RLE even if it qualifies because ++ * doing so will cause the decoder (cli <= v1.4.3 only) to throw an (invalid) error ++ * "should consume all input error." 
++ */ + compressedSeqsSize = 1; + } + + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5893,11 +6828,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5908,41 +6842,50 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { + BYTE* op = (BYTE*)dst; + size_t cSize = 0; +- size_t compressedBlocksSize = 0; +- size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (nbSeqs=%zu,dstCapacity=%zu)", inSeqsSize, dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); ++ + /* Begin writing output, starting with frame header */ +- frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID); +- op += frameHeaderSize; +- dstCapacity -= frameHeaderSize; +- cSize += frameHeaderSize; ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, srcSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } + if (cctx->appliedParams.fParams.checksumFlag && srcSize) { + xxh64_update(&cctx->xxhState, src, srcSize); + } +- /* cSize includes block header size and compressed sequences size */ +- compressedBlocksSize = ZSTD_compressSequences_internal(cctx, ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequences_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + src, srcSize); +- FORWARD_IF_ERROR(compressedBlocksSize, "Compressing blocks failed!"); +- cSize += compressedBlocksSize; +- dstCapacity -= compressedBlocksSize; ++ 
FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } + ++ /* Complete with frame checksum, if needed */ + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) xxh64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); +@@ -5951,26 +6894,557 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); ++ return cSize; ++} ++ ++ ++#if defined(__AVX2__) ++ ++#include /* AVX2 intrinsics */ ++ ++/* ++ * Convert 2 sequences per iteration, using AVX2 intrinsics: ++ * - offset -> offBase = offset + 2 ++ * - litLength -> (U16) litLength ++ * - matchLength -> (U16)(matchLength - 3) ++ * - rep is ignored ++ * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). ++ * ++ * At the end, instead of extracting two __m128i, ++ * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, ++ * then store the lower 16 bytes in one go. ++ * ++ * @returns 0 on succes, with no long length detected ++ * @returns > 0 if there is one long length (> 65535), ++ * indicating the position, and type. ++ */ ++static size_t convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ /* ++ * addition: ++ * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) ++ */ ++ const __m256i addition = _mm256_setr_epi32( ++ ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ ++ ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ ++ ); ++ ++ /* limit: check if there is a long length */ ++ const __m256i limit = _mm256_set1_epi32(65535); ++ ++ /* ++ * shuffle mask for byte-level rearrangement in each 128-bit half: ++ * ++ * Input layout (after addition) per 128-bit half: ++ * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] ++ * We only need: ++ * offBase (4 bytes) = offset+2 ++ * litLength (2 bytes) = low 2 bytes of litLength ++ * mlBase (2 bytes) = low 2 bytes of (matchLength) ++ * => Bytes [0..3, 4..5, 8..9], zero the rest. ++ */ ++ const __m256i mask = _mm256_setr_epi8( ++ /* For the lower 128 bits => sequence i */ ++ 0, 1, 2, 3, /* offset+2 */ ++ 4, 5, /* litLength (16 bits) */ ++ 8, 9, /* matchLength (16 bits) */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ ++ /* For the upper 128 bits => sequence i+1 */ ++ 16,17,18,19, /* offset+2 */ ++ 20,21, /* litLength */ ++ 24,25, /* matchLength */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 ++ ); ++ ++ /* ++ * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). ++ * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. ++ * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1. 
++ */ ++#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ ++ ++ size_t longLen = 0, i = 0; ++ ++ /* AVX permutation depends on the specific definition of target structures */ ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); ++ ++ /* Process 2 sequences per loop iteration */ ++ for (; i + 1 < nbSequences; i += 2) { ++ /* Load 2 ZSTD_Sequence (32 bytes) */ ++ __m256i vin = _mm256_loadu_si256((const __m256i*)(const void*)&inSeqs[i]); ++ ++ /* Add {2, 0, -3, 0} in each 128-bit half */ ++ __m256i vadd = _mm256_add_epi32(vin, addition); ++ ++ /* Check for long length */ ++ __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ ++ int ll_res = _mm256_movemask_epi8(ll_cmp); ++ ++ /* Shuffle bytes so each half gives us the 8 bytes we need */ ++ __m256i vshf = _mm256_shuffle_epi8(vadd, mask); ++ /* ++ * Now: ++ * Lane0 = seq0's 8 bytes ++ * Lane1 = 0 ++ * Lane2 = seq1's 8 bytes ++ * Lane3 = 0 ++ */ ++ ++ /* Permute 64-bit lanes => move Lane2 down into Lane1. */ ++ __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); ++ /* ++ * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. ++ * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. ++ */ ++ ++ /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ ++ _mm_storeu_si128((__m128i *)(void*)&dstSeqs[i], _mm256_castsi256_si128(vperm)); ++ /* ++ * This writes out 16 bytes total: ++ * - offset 0..7 => seq0 (offBase, litLength, mlBase) ++ * - offset 8..15 => seq1 (offBase, litLength, mlBase) ++ */ ++ ++ /* check (unlikely) long lengths > 65535 ++ * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] ++ * => combined mask = 0x0FF00FF0 ++ */ ++ if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { ++ /* long length detected: let's figure out which one*/ ++ if (inSeqs[i].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (inSeqs[i].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ if (inSeqs[i+1].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1 + 1; ++ } ++ if (inSeqs[i+1].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + 1 + nbSequences + 1; ++ } ++ } ++ } ++ ++ /* Handle leftover if @nbSequences is odd */ ++ if (i < nbSequences) { ++ /* process last sequence */ ++ assert(i == nbSequences - 1); ++ dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); ++ dstSeqs[i].litLength = (U16)inSeqs[i].litLength; ++ dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); ++ /* check (unlikely) long lengths > 65535 */ ++ if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (UNLIKELY(inSeqs[i].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ } ++ ++ return longLen; ++} ++ ++/* the vector implementation could also be ported to SSSE3, ++ * but since this implementation is targeting modern systems (>= Sapphire Rapid), ++ * it's not useful to develop and maintain code for older pre-AVX2 platforms */ ++ ++#else /* no AVX2 */ ++ ++static size_t 
convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ size_t longLen = 0; ++ size_t n; ++ for (n=0; n<nbSequences; n++) { ++ dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset); ++ dstSeqs[n].litLength = (U16)inSeqs[n].litLength; ++ dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); ++ /* check for long length > 65535 */ ++ if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = n + 1; ++ } ++ if (UNLIKELY(inSeqs[n].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = n + nbSequences + 1; ++ } ++ } ++ return longLen; ++} ++ ++#endif ++ ++/* ++ * Precondition: Sequences must end on an explicit Block Delimiter ++ * @return: 0 on success, or an error code. ++ * Note: Sequence validation functionality has been disabled (removed). ++ * This is helpful to generate a lean main pipeline, improving performance. ++ * It may be re-inserted later. ++ */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int repcodeResolution) ++{ ++ Repcodes_t updatedRepcodes; ++ size_t seqNb = 0; ++ ++ DEBUGLOG(5, "ZSTD_convertBlockSequences (nbSequences = %zu)", nbSequences); ++ ++ RETURN_ERROR_IF(nbSequences >= cctx->seqStore.maxNbSeq, externalSequences_invalid, ++ "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); ++ ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ++ /* check end condition */ ++ assert(nbSequences >= 1); ++ assert(inSeqs[nbSequences-1].matchLength == 0); ++ assert(inSeqs[nbSequences-1].offset == 0); ++ ++ /* Convert Sequences from public format to internal format */ ++ if (!repcodeResolution) { ++ size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); ++ cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; ++ if (longl) { ++ DEBUGLOG(5, "long length"); ++ assert(cctx->seqStore.longLengthType == ZSTD_llt_none); ++ if (longl <= nbSequences-1) { ++ DEBUGLOG(5, "long match length detected at pos %zu", longl-1); ++ cctx->seqStore.longLengthType = ZSTD_llt_matchLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-1); ++ } else { ++ DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); ++ assert(longl <= 2* (nbSequences-1)); ++ cctx->seqStore.longLengthType = ZSTD_llt_literalLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); ++ } ++ } ++ } else { ++ for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { ++ U32 const litLength = inSeqs[seqNb].litLength; ++ U32 const matchLength = inSeqs[seqNb].matchLength; ++ U32 const ll0 = (litLength == 0); ++ U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ if (!repcodeResolution && nbSequences > 1) { ++ U32* const rep = updatedRepcodes.rep; ++ ++ if (nbSequences >= 4) { ++ U32 lastSeqIdx = (U32)nbSequences - 2; /* index of last full sequence */ ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (nbSequences == 3) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[0].offset; ++ rep[0] = inSeqs[1].offset; ++ } else { ++ assert(nbSequences == 2); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[0].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, 
sizeof(Repcodes_t)); ++ ++ return 0; ++} ++ ++#if defined(ZSTD_ARCH_X86_AVX2) ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t i; ++ __m256i const zeroVec = _mm256_setzero_si256(); ++ __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ ++ ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ ++ size_t mSum = 0, lSum = 0; ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ++ /* Process 2 structs (32 bytes) at a time */ ++ for (i = 0; i + 2 <= nbSeqs; i += 2) { ++ /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ ++ __m256i data = _mm256_loadu_si256((const __m256i*)(const void*)&seqs[i]); ++ /* check end of block signal */ ++ __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); ++ int cmp_res = _mm256_movemask_epi8(cmp); ++ /* indices for match lengths correspond to bits [8..11], [24..27] ++ * => combined mask = 0x0F000F00 */ ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ if (cmp_res & 0x0F000F00) break; ++ /* Accumulate in sumVec */ ++ sumVec = _mm256_add_epi32(sumVec, data); ++ } ++ ++ /* Horizontal reduction */ ++ _mm256_store_si256((__m256i*)tmp, sumVec); ++ lSum = tmp[1] + tmp[5]; ++ mSum = tmp[2] + tmp[6]; ++ ++ /* Handle the leftover */ ++ for (; i < nbSeqs; i++) { ++ lSum += seqs[i].litLength; ++ mSum += seqs[i].matchLength; ++ if (seqs[i].matchLength == 0) break; /* end of block */ ++ } ++ ++ if (i==nbSeqs) { ++ /* reaching end of sequences: end of block signal was not present */ ++ BlockSummary bs; ++ bs.nbSequences = ERROR(externalSequences_invalid); ++ return bs; ++ } ++ { BlockSummary bs; ++ bs.nbSequences = i+1; ++ bs.blockSize = lSum + mSum; ++ bs.litSize = lSum; ++ return bs; ++ } ++} ++ ++#else ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t totalMatchSize = 0; ++ size_t litSize = 0; ++ size_t n; ++ assert(seqs); ++ for (n=0; nappliedParams.searchForExternalRepcodes == ZSTD_ps_enable); ++ assert(cctx->appliedParams.searchForExternalRepcodes != ZSTD_ps_auto); ++ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals_internal: nbSeqs=%zu, litSize=%zu", nbSequences, litSize); ++ RETURN_ERROR_IF(nbSequences == 0, externalSequences_invalid, "Requires at least 1 end-of-block"); ++ ++ /* Special case: empty frame */ ++ if ((nbSequences == 1) && (inSeqs[0].litLength == 0)) { ++ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "No room for empty frame block header"); ++ MEM_writeLE24(op, cBlockHeader24); ++ op += ZSTD_blockHeaderSize; ++ dstCapacity -= ZSTD_blockHeaderSize; ++ cSize += ZSTD_blockHeaderSize; ++ } ++ ++ while (nbSequences) { ++ size_t compressedSeqsSize, cBlockSize, conversionStatus; ++ BlockSummary const block = ZSTD_get1BlockSummary(inSeqs, nbSequences); ++ U32 const lastBlock = (block.nbSequences == nbSequences); ++ FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); ++ assert(block.nbSequences <= nbSequences); ++ RETURN_ERROR_IF(block.litSize > litSize, externalSequences_invalid, "discrepancy: Sequences require more literals than present in buffer"); ++ ZSTD_resetSeqStore(&cctx->seqStore); ++ ++ conversionStatus = ZSTD_convertBlockSequences(cctx, ++ inSeqs, block.nbSequences, ++ repcodeResolution); ++ FORWARD_IF_ERROR(conversionStatus, "Bad sequence conversion"); ++ inSeqs += block.nbSequences; ++ nbSequences -= block.nbSequences; ++ remaining -= block.blockSize; ++ ++ /* Note: when blockSize is very 
small, other variant send it uncompressed. ++ * Here, we still send the sequences, because we don't have the original source to send it uncompressed. ++ * One could imagine in theory reproducing the source from the sequences, ++ * but that's complex and costly memory intensive, and goes against the objectives of this variant. */ ++ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); ++ ++ compressedSeqsSize = ZSTD_entropyCompressSeqStore_internal( ++ op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, ++ literals, block.litSize, ++ &cctx->seqStore, ++ &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, ++ &cctx->appliedParams, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, ++ cctx->bmi2); ++ FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); ++ /* note: the spec forbids for any compressed block to be larger than maximum block size */ ++ if (compressedSeqsSize > cctx->blockSizeMax) compressedSeqsSize = 0; ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); ++ litSize -= block.litSize; ++ literals = (const char*)literals + block.litSize; ++ ++ /* Note: difficult to check source for RLE block when only Literals are provided, ++ * but it could be considered from analyzing the sequence directly */ ++ ++ if (compressedSeqsSize == 0) { ++ /* Sending uncompressed blocks is out of reach, because the source is not provided. ++ * In theory, one could use the sequences to regenerate the source, like a decompressor, ++ * but it's complex, and memory hungry, killing the purpose of this variant. ++ * Current outcome: generate an error code. 
++ */ ++ RETURN_ERROR(cannotProduce_uncompressedBlock, "ZSTD_compressSequencesAndLiterals cannot generate an uncompressed block"); ++ } else { ++ U32 cBlockHeader; ++ assert(compressedSeqsSize > 1); /* no RLE */ ++ /* Error checking and repcodes update */ ++ ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); ++ if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) ++ cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; ++ ++ /* Write block header into beginning of block*/ ++ cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); ++ MEM_writeLE24(op, cBlockHeader); ++ cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); ++ } ++ ++ cSize += cBlockSize; ++ op += cBlockSize; ++ dstCapacity -= cBlockSize; ++ cctx->isFirstBlock = 0; ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); ++ ++ if (lastBlock) { ++ assert(nbSequences == 0); ++ break; ++ } ++ } ++ ++ RETURN_ERROR_IF(litSize != 0, externalSequences_invalid, "literals must be entirely and exactly consumed"); ++ RETURN_ERROR_IF(remaining != 0, externalSequences_invalid, "Sequences must represent a total of exactly srcSize=%zu", srcSize); ++ DEBUGLOG(4, "cSize final total: %zu", cSize); ++ return cSize; ++} ++ ++size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* literals, size_t litSize, size_t litCapacity, ++ size_t decompressedSize) ++{ ++ BYTE* op = (BYTE*)dst; ++ size_t cSize = 0; ++ ++ /* Transparent initialization stage, same as compressStream2() */ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals (dstCapacity=%zu)", dstCapacity); ++ assert(cctx != NULL); ++ if (litCapacity < litSize) { ++ RETURN_ERROR(workSpace_tooSmall, "literals buffer is not large enough: must be at least 8 bytes larger than litSize (risk of read out-of-bound)"); ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, decompressedSize), "CCtx initialization failed"); ++ ++ if (cctx->appliedParams.blockDelimiters == ZSTD_sf_noBlockDelimiters) { ++ RETURN_ERROR(frameParameter_unsupported, "This mode is only compatible with explicit delimiters"); ++ } ++ if (cctx->appliedParams.validateSequences) { ++ RETURN_ERROR(parameter_unsupported, "This mode is not compatible with Sequence validation"); ++ } ++ if (cctx->appliedParams.fParams.checksumFlag) { ++ RETURN_ERROR(frameParameter_unsupported, "this mode is not compatible with frame checksum"); ++ } ++ ++ /* Begin writing output, starting with frame header */ ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, decompressedSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequencesAndLiterals_internal(cctx, ++ op, dstCapacity, ++ inSeqs, inSeqsSize, ++ literals, litSize, decompressedSize); ++ FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } ++ ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const 
ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + +- + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6046,7 +7520,7 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + } + } + +-static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + switch (mode) { + case ZSTD_cpm_unknown: +@@ -6070,8 +7544,8 @@ static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMo + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. +- * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. */ +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_CParamMode_e`. */ ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); +@@ -6092,7 +7566,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6109,7 +7583,9 @@ ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long l + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). 
+ * Fields of `ZSTD_frameParameters` are set to default values */ +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) { ++static ZSTD_parameters ++ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) ++{ + ZSTD_parameters params; + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); + DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); +@@ -6123,7 +7599,34 @@ static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned lo + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +-ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { ++ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) ++{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..b10978385876 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,7 +21,8 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" +- ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ ++#include "zstd_preSplit.h" /* ZSTD_SLIPBLOCK_WORKSPACESIZE */ + + /*-************************************* + * Constants +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. ++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. 
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -75,6 +77,70 @@ typedef struct { + ZSTD_fseCTables_t fse; + } ZSTD_entropyCTables_t; + ++/* ********************************************* ++* Sequences * ++***********************************************/ ++typedef struct SeqDef_s { ++ U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ ++ U16 litLength; ++ U16 mlBase; /* mlBase == matchLength - MINMATCH */ ++} SeqDef; ++ ++/* Controls whether seqStore has a single "long" litLength or matchLength. See SeqStore_t. */ ++typedef enum { ++ ZSTD_llt_none = 0, /* no longLengthType */ ++ ZSTD_llt_literalLength = 1, /* represents a long literal */ ++ ZSTD_llt_matchLength = 2 /* represents a long match */ ++} ZSTD_longLengthType_e; ++ ++typedef struct { ++ SeqDef* sequencesStart; ++ SeqDef* sequences; /* ptr to end of sequences */ ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; ++ size_t maxNbSeq; ++ size_t maxNbLit; ++ ++ /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength ++ * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment ++ * the existing value of the litLength or matchLength by 0x10000. ++ */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++} SeqStore_t; ++ ++typedef struct { ++ U32 litLength; ++ U32 matchLength; ++} ZSTD_SequenceLength; ++ ++/* ++ * Returns the ZSTD_SequenceLength for the given sequences. It handles the decoding of long sequences ++ * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. ++ */ ++MEM_STATIC ZSTD_SequenceLength ZSTD_getSequenceLength(SeqStore_t const* seqStore, SeqDef const* seq) ++{ ++ ZSTD_SequenceLength seqLen; ++ seqLen.litLength = seq->litLength; ++ seqLen.matchLength = seq->mlBase + MINMATCH; ++ if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { ++ if (seqStore->longLengthType == ZSTD_llt_literalLength) { ++ seqLen.litLength += 0x10000; ++ } ++ if (seqStore->longLengthType == ZSTD_llt_matchLength) { ++ seqLen.matchLength += 0x10000; ++ } ++ } ++ return seqLen; ++} ++ ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ ++ ++ + /* ********************************************* + * Entropy buffer statistics structs and funcs * + ***********************************************/ +@@ -84,7 +150,7 @@ typedef struct { + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ + typedef struct { +- symbolEncodingType_e hType; ++ SymbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; + } ZSTD_hufCTablesMetadata_t; +@@ -95,9 +161,9 @@ typedef struct { + * fseTablesSize refers to the size of fse tables in bytes. 
+ * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ + typedef struct { +- symbolEncodingType_e llType; +- symbolEncodingType_e ofType; +- symbolEncodingType_e mlType; ++ SymbolEncodingType_e llType; ++ SymbolEncodingType_e ofType; ++ SymbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +@@ -111,12 +177,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -140,28 +207,29 @@ typedef struct { + stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */ + size_t size; /* The number of sequences. <= capacity. */ + size_t capacity; /* The capacity starting from `seq` pointer */ +-} rawSeqStore_t; ++} RawSeqStore_t; + +-UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; ++UNUSED_ATTR static const RawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -173,7 +241,7 @@ typedef struct { + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + } optState_t; + + typedef struct { +@@ -195,11 +263,11 @@ typedef struct { + 
+ #define ZSTD_WINDOW_START_INDEX 2 + +-typedef struct ZSTD_matchState_t ZSTD_matchState_t; ++typedef struct ZSTD_MatchState_t ZSTD_MatchState_t; + + #define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + +-struct ZSTD_matchState_t { ++struct ZSTD_MatchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary, within context's referential. + * When loadedDictEnd != 0, a dictionary is in use, and still valid. +@@ -212,28 +280,42 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; + U32* chainTable; + +- U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ ++ int forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + + int dedicatedDictSearch; /* Indicates whether this matchState is using the + * dedicated dictionary search structure. + */ + optState_t opt; /* optimal parser state */ +- const ZSTD_matchState_t* dictMatchState; ++ const ZSTD_MatchState_t* dictMatchState; + ZSTD_compressionParameters cParams; +- const rawSeqStore_t* ldmSeqStore; ++ const RawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + } ZSTD_blockState_t; + + typedef struct { +@@ -260,7 +342,7 @@ typedef struct { + } ldmState_t; + + typedef struct { +- ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */ ++ ZSTD_ParamSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. 
ZSTD_ps_auto by default */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ +@@ -291,7 +373,7 @@ struct ZSTD_CCtx_params_s { + * There is no guarantee that hint is close to actual source size */ + + ZSTD_dictAttachPref_e attachDictPref; +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + + /* Multithreading: used to pass parameters to mtctx */ + int nbWorkers; +@@ -310,24 +392,54 @@ struct ZSTD_CCtx_params_s { + ZSTD_bufferMode_e outBufferMode; + + /* Sequence compression API */ +- ZSTD_sequenceFormat_e blockDelimiters; ++ ZSTD_SequenceFormat_e blockDelimiters; + int validateSequences; + +- /* Block splitting */ +- ZSTD_paramSwitch_e useBlockSplitter; ++ /* Block splitting ++ * @postBlockSplitter executes split analysis after sequences are produced, ++ * it's more accurate but consumes more resources. ++ * @preBlockSplitter_level splits before knowing sequences, ++ * it's more approximative but also cheaper. ++ * Valid @preBlockSplitter_level values range from 0 to 6 (included). ++ * 0 means auto, 1 means do not split, ++ * then levels are sorted in increasing cpu budget, from 2 (fastest) to 6 (slowest). ++ * Highest @preBlockSplitter_level combines well with @postBlockSplitter. ++ */ ++ ZSTD_ParamSwitch_e postBlockSplitter; ++ int preBlockSplitter_level; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; + + /* Param for deciding whether to use row-based matchfinder */ +- ZSTD_paramSwitch_e useRowMatchFinder; ++ ZSTD_ParamSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_ParamSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. 
*/ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_ParamSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) + #define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE) ++#define TMP_WORKSPACE_SIZE (MAX(ENTROPY_WORKSPACE_SIZE, ZSTD_SLIPBLOCK_WORKSPACESIZE)) + + /* + * Indicates whether this compression proceeds directly from user-provided +@@ -345,11 +457,11 @@ typedef enum { + */ + #define ZSTD_MAX_NB_BLOCK_SPLITS 196 + typedef struct { +- seqStore_t fullSeqStoreChunk; +- seqStore_t firstHalfSeqStore; +- seqStore_t secondHalfSeqStore; +- seqStore_t currSeqStore; +- seqStore_t nextSeqStore; ++ SeqStore_t fullSeqStoreChunk; ++ SeqStore_t firstHalfSeqStore; ++ SeqStore_t secondHalfSeqStore; ++ SeqStore_t currSeqStore; ++ SeqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +@@ -366,7 +478,7 @@ struct ZSTD_CCtx_s { + size_t dictContentSize; + + ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */ +- size_t blockSize; ++ size_t blockSizeMax; + unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */ + unsigned long long consumedSrcSize; + unsigned long long producedCSize; +@@ -378,13 +490,14 @@ struct ZSTD_CCtx_s { + int isFirstBlock; + int initialized; + +- seqStore_t seqStore; /* sequences storage ptrs */ ++ SeqStore_t seqStore; /* sequences storage ptrs */ + ldmState_t ldmState; /* long distance matching state */ + rawSeq* ldmSequences; /* Storage for the ldm output sequences */ + size_t maxNbLdmSequences; +- rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ ++ RawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ + ZSTD_blockState_t blockState; +- U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ ++ void* tmpWorkspace; /* used as substitute of stack space - must be aligned for S64 type */ ++ size_t tmpWkspSize; + + /* Whether we are streaming or not */ + ZSTD_buffered_policy_e bufferedPolicy; +@@ -404,6 +517,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +531,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,17 +560,17 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. 
+ */ +-} ZSTD_cParamMode_e; ++} ZSTD_CParamMode_e; + +-typedef size_t (*ZSTD_blockCompressor) ( +- ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++typedef size_t (*ZSTD_BlockCompressor_f) ( ++ ZSTD_MatchState_t* bs, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); + + + MEM_STATIC U32 ZSTD_LLcode(U32 litLength) +@@ -497,12 +616,33 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + return 1; + } + ++/* ZSTD_selectAddr: ++ * @return index >= lowLimit ? candidate : backup, ++ * tries to force branchless codegen. */ ++MEM_STATIC const BYTE* ++ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* candidate, const BYTE* backup) ++{ ++#if defined(__x86_64__) ++ __asm__ ( ++ "cmp %1, %2\n" ++ "cmova %3, %0\n" ++ : "+r"(candidate) ++ : "r"(index), "r"(lowLimit), "r"(backup) ++ ); ++ return candidate; ++#else ++ return index >= lowLimit ? candidate : backup; ++#endif ++} ++ + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +650,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +670,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +706,68 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ ++ ++/*! ZSTD_storeSeqOnly() : ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. ++ * Literals themselves are not copied, but @litPtr is updated. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). ++ * @matchLength : must be >= MINMATCH ++*/ ++HINT_INLINE UNUSED_ATTR void ++ZSTD_storeSeqOnly(SeqStore_t* seqStorePtr, ++ size_t litLength, ++ U32 offBase, ++ size_t matchLength) ++{ ++ assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); ++ ++ /* literal Length */ ++ assert(litLength <= ZSTD_BLOCKSIZE_MAX); ++ if (UNLIKELY(litLength>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_literalLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].litLength = (U16)litLength; ++ ++ /* match offset */ ++ seqStorePtr->sequences[0].offBase = offBase; ++ ++ /* match Length */ ++ assert(matchLength <= ZSTD_BLOCKSIZE_MAX); ++ assert(matchLength >= MINMATCH); ++ { size_t const mlBase = matchLength - MINMATCH; ++ if (UNLIKELY(mlBase>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_matchLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].mlBase = (U16)mlBase; ++ } ++ ++ seqStorePtr->sequences++; ++} + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. 
++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. + */ + HINT_INLINE UNUSED_ATTR void +-ZSTD_storeSeq(seqStore_t* seqStorePtr, ++ZSTD_storeSeq(SeqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +776,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +787,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -619,44 +799,22 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + } + seqStorePtr->lit += litLength; + +- /* literal Length */ +- if (litLength>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_literalLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].litLength = (U16)litLength; +- +- /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); +- +- /* match Length */ +- assert(matchLength >= MINMATCH); +- { size_t const mlBase = matchLength - MINMATCH; +- if (mlBase>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_matchLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].mlBase = (U16)mlBase; +- } +- +- seqStorePtr->sequences++; ++ ZSTD_storeSeqOnly(seqStorePtr, litLength, offBase, matchLength); + } + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 
const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? rep[1] : rep[2]; +@@ -670,14 +828,14 @@ ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) + + typedef struct repcodes_s { + U32 rep[3]; +-} repcodes_t; ++} Repcodes_t; + +-MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++MEM_STATIC Repcodes_t ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- repcodes_t newReps; ++ Repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +843,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -771,8 +876,8 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + size_t const matchLength = ZSTD_count(ip, match, vEnd); + if (match + matchLength != mEnd) return matchLength; + DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength); +- DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match); +- DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip); ++ DEBUGLOG(7, "distance from match beginning to end dictionary = %i", (int)(mEnd - match)); ++ DEBUGLOG(7, "distance from current pos to end buffer = %i", (int)(iEnd - ip)); + DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart); + DEBUGLOG(7, "final match length = %zu", matchLength + 
ZSTD_count(ip+matchLength, iStart, iEnd)); + return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd); +@@ -783,32 +888,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) 
>> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +936,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. + */ +@@ -881,11 +1015,12 @@ MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 + /*-************************************* + * Round buffer management + ***************************************/ +-#if (ZSTD_WINDOWLOG_MAX_64 > 31) +-# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX" +-#endif +-/* Max current allowed */ +-#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) ++/* Max @current value allowed: ++ * In 32-bit mode: we want to avoid crossing the 2 GB limit, ++ * reducing risks of side effects in case of signed operations on indexes. ++ * In 64-bit mode: we want to ensure that adding the maximum job size (512 MB) ++ * doesn't overflow U32 index capacity (4 GB) */ ++#define ZSTD_CURRENT_MAX (MEM_64bits() ? 3500U MB : 2000U MB) + /* Maximum chunk size before overflow correction needs to be called again */ + #define ZSTD_CHUNKSIZE_MAX \ + ( ((U32)-1) /* Maximum ending current index */ \ +@@ -925,7 +1060,7 @@ MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) + * Inspects the provided matchState and figures out what dictMode should be + * passed to the compressor. + */ +-MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) ++MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_MatchState_t *ms) + { + return ZSTD_window_hasExtDict(ms->window) ? + ZSTD_extDict : +@@ -1011,7 +1146,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1112,7 +1249,7 @@ ZSTD_window_enforceMaxDist(ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? 
*loadedDictEndPtr : 0; +@@ -1157,7 +1294,7 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + assert(loadedDictEndPtr != NULL); + assert(dictMatchStatePtr != NULL); +@@ -1167,10 +1304,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,9 +1341,11 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, +- void const* src, size_t srcSize, +- int forceNonContiguous) ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, ++ const void* src, size_t srcSize, ++ int forceNonContiguous) + { + BYTE const* const ip = (BYTE const*)src; + U32 contiguous = 1; +@@ -1228,8 +1372,9 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ( (ip+srcSize > window->dictBase + window->lowLimit) + & (ip < window->dictBase + window->dictLimit)) { +- ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; +- U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ size_t const highInputIdx = (size_t)((ip + srcSize) - window->dictBase); ++ U32 const lowLimitMax = (highInputIdx > (size_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ assert(highInputIdx < UINT_MAX); + window->lowLimit = lowLimitMax; + DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); + } +@@ -1239,7 +1384,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + /* + * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix. + */ +-MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; +@@ -1256,7 +1401,7 @@ MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, u + /* + * Returns the lowest allowed match index in the prefix. 
+ */ +-MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.dictLimit; +@@ -1269,6 +1414,13 @@ MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, + return matchLowest; + } + ++/* index_safety_check: ++ * intentional underflow : ensure repIndex isn't overlapping dict + prefix ++ * @return 1 if values are not overlapping, ++ * 0 otherwise */ ++MEM_STATIC int ZSTD_index_overlap_check(const U32 prefixLowestIndex, const U32 repIndex) { ++ return ((U32)((prefixLowestIndex-1) - repIndex) >= 3); ++} + + + /* debug functions */ +@@ -1302,7 +1454,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} + ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. 
*/ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + /* =============================================================== + * Shared internal declarations +@@ -1319,6 +1506,25 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_SequencePosition; ++ ++/* for benchmark */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int const repcodeResolution); ++ ++typedef struct { ++ size_t nbSequences; ++ size_t blockSize; ++ size_t litSize; ++} BlockSummary; ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs); ++ + /* ============================================================== + * Private declarations + * These prototypes shall only be called from within lib/compress +@@ -1330,7 +1536,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + * Note: srcSizeHint == 0 means 0! + */ + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + /*! ZSTD_initCStream_internal() : + * Private use only. Init streaming operation. +@@ -1342,7 +1548,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize); + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr); ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr); + + /*! ZSTD_getCParamsFromCDict() : + * as the name implies */ +@@ -1381,11 +1587,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1601,28 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..ec39b4299b6f 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; +- symbolEncodingType_e hType = set_compressed; ++ SymbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..256980c9d85a 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -153,20 +154,20 @@ size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + return cost >> 8; + } + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) + { + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +@@ -241,7 +242,7 @@ typedef struct { + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -293,7 +294,7 @@ ZSTD_encodeSequences_body( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; +@@ -387,7 +388,7 @@ ZSTD_encodeSequences_default( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -405,7 +406,7 @@ ZSTD_encodeSequences_bmi2( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -421,7 +422,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) + { + DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); + #if DYNAMIC_BMI2 +diff --git 
a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..14fdccb6547f 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,26 +12,27 @@ + #ifndef ZSTD_COMPRESS_SEQUENCES_H + #define ZSTD_COMPRESS_SEQUENCES_H + ++#include "zstd_compress_internal.h" /* SeqDef */ + #include "../common/fse.h" /* FSE_repeat, FSE_CTable */ +-#include "../common/zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */ ++#include "../common/zstd_internal.h" /* SymbolEncodingType_e, ZSTD_strategy */ + + typedef enum { + ZSTD_defaultDisallowed = 0, + ZSTD_defaultAllowed = 1 +-} ZSTD_defaultPolicy_e; ++} ZSTD_DefaultPolicy_e; + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy); + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -42,7 +44,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); + + size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..dc12d64e935c 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. 
+ * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -50,11 +52,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart + lhSize; + U32 const singleStream = lhSize == 3; +- symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; ++ SymbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(SeqStore_t const* seqStore, ++ const SeqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 
/*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -258,7 +263,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + * Or 0 if it failed to compress. */ + static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- const seqDef* sequences, size_t nbSeq, ++ const SeqDef* sequences, size_t nbSeq, + const BYTE* literals, size_t litSize, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; +@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, +- op, oend-op, ++ op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ +- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; ++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } +- return op-ostart; ++ return (size_t)(op-ostart); + } + + static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -322,7 +328,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t lit + return 0; + } + +-static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, ++static size_t ZSTD_estimateSubBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U8* additionalBits, +@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + +-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, ++typedef struct { ++ size_t estLitSize; ++ size_t estBlockSize; ++} EstimatedBlockSize; ++static EstimatedBlockSize 
ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, +@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { +- size_t cSizeEstimate = 0; +- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); +- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, ++ int writeLitEntropy, int writeSeqEntropy) ++{ ++ EstimatedBlockSize ebs; ++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); ++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); +- return cSizeEstimate + ZSTD_blockHeaderSize; ++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; ++ return ebs; + } + + static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +@@ -415,14 +427,57 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe + return 0; + } + ++static size_t countLiterals(SeqStore_t const* seqStore, const SeqDef* sp, size_t seqCount) ++{ ++ size_t n, total = 0; ++ assert(sp != NULL); ++ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); ++ return total; ++} ++ ++#define BYTESCALE 256 ++ ++static size_t sizeBlockSequences(const SeqDef* sp, size_t nbSeqs, ++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, ++ int firstSubBlock) ++{ ++ size_t n, budget = 0, inSize=0; ++ /* entropy headers */ ++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ ++ assert(firstSubBlock==0 || firstSubBlock==1); ++ budget += headerSize; ++ ++ /* first sequence => at least one sequence*/ ++ budget += sp[0].litLength * avgLitCost + avgSeqCost; ++ if (budget > targetBudget) return 1; ++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); ++ ++ /* loop over sequences */ ++ for (n=1; n targetBudget) ++ /* though continue to expand until the sub-block is deemed compressible */ ++ && (budget < inSize * BYTESCALE) ) ++ break; ++ } ++ ++ return n; ++} ++ + /* ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. +- * Entropy will be written to the first block. +- * The following blocks will use repeat mode to compress. +- * All sub-blocks are compressed blocks (no raw or rle blocks). +- * @return : compressed size of the super block (which is multiple ZSTD blocks) +- * Or 0 if it failed to compress. */ +-static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, ++ * Entropy will be written into the first block. ++ * The following blocks use repeat_mode to compress. ++ * Sub-blocks are all compressed, except the last one when beneficial. ++ * @return : compressed size of the super block (which features multiple ZSTD blocks) ++ * or 0 if it failed to compress. 
*/ ++static size_t ZSTD_compressSubBlock_multi(const SeqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +@@ -432,12 +487,14 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) + { +- const seqDef* const sstart = seqStorePtr->sequencesStart; +- const seqDef* const send = seqStorePtr->sequences; +- const seqDef* sp = sstart; ++ const SeqDef* const sstart = seqStorePtr->sequencesStart; ++ const SeqDef* const send = seqStorePtr->sequences; ++ const SeqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; ++ size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; +@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; +- size_t targetCBlockSize = cctxParams->targetCBlockSize; +- size_t litSize, seqCount; +- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; ++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ ++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); ++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; +- int lastSequence = 0; +- +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", +- (unsigned)(lend-lp), (unsigned)(send-sstart)); +- +- litSize = 0; +- seqCount = 0; +- do { +- size_t cBlockSizeEstimate = 0; +- if (sstart == send) { +- lastSequence = 1; +- } else { +- const seqDef* const sequence = sp + seqCount; +- lastSequence = sequence == send - 1; +- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; +- seqCount++; +- } +- if (lastSequence) { +- assert(lp <= lend); +- assert(litSize <= (size_t)(lend - lp)); +- litSize = (size_t)(lend - lp); ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", ++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); ++ ++ /* let's start by a general estimation for the full block */ ++ if (nbSeqs > 0) { ++ EstimatedBlockSize const ebs = ++ ZSTD_estimateSubBlockSize(lp, nbLiterals, ++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, ++ &nextCBlock->entropy, entropyMetadata, ++ workspace, wkspSize, ++ writeLitEntropy, writeSeqEntropy); ++ /* quick estimation */ ++ size_t const avgLitCost = nbLiterals ? 
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
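
For a feel of the fixed-point bookkeeping in the rewritten splitter above: estimated costs are carried in 1/256ths of a byte (BYTESCALE), the super block is cut into roughly estBlockSize/targetCBlockSize sub-blocks, and each sub-block gets an average byte budget. A standalone sketch using the same formulas; the input numbers are invented purely for illustration.

#include <stdio.h>

#define BYTESCALE 256   /* fixed-point scale used by the sub-block splitter */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    /* hypothetical estimates for one super block */
    size_t const estLitSize       = 20000;    /* estimated compressed literals size   */
    size_t const estBlockSize     = 60000;    /* estimated compressed size, lits incl. */
    size_t const nbLiterals       = 90000;
    size_t const nbSeqs           = 30000;
    size_t const targetCBlockSize = 16 * 1024;

    size_t const avgLitCost = nbLiterals ? (estLitSize * BYTESCALE) / nbLiterals : BYTESCALE;
    size_t const avgSeqCost = ((estBlockSize - estLitSize) * BYTESCALE) / nbSeqs;
    size_t const nbSubBlocks = MAX((estBlockSize + targetCBlockSize/2) / targetCBlockSize, (size_t)1);
    size_t const avgBlockBudget = (estBlockSize * BYTESCALE) / nbSubBlocks;

    printf("avg literal cost : %.2f bytes\n", (double)avgLitCost / BYTESCALE);
    printf("avg sequence cost: %.2f bytes\n", (double)avgSeqCost / BYTESCALE);
    printf("sub-blocks       : %zu, budget %.0f bytes each\n",
           nbSubBlocks, (double)avgBlockBudget / BYTESCALE);
    return 0;
}

sizeBlockSequences() then walks sequences until the accumulated scaled cost exceeds that budget, which is what replaces the old per-sequence calls to ZSTD_estimateSubBlockSize() noted as wasteful in the removed comment.
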
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; +- repcodes_t rep; ++ const SeqDef* seq; ++ Repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +@@ -559,7 +675,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, +@@ -569,5 +685,5 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */); + } +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..dce42f653bae 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,8 +15,10 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" +- ++#include "../common/portability_macros.h" ++#include "../common/compiler.h" /* ZS2_isPower2 */ + + /*-************************************* + * Constants +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
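
The layout described above is essentially a double-ended bump allocator: objects and tables grow from the front of one big buffer, while init-once/aligned/buffer reservations grow down from the back, and a reservation fails when the two ends would cross. A simplified standalone sketch of that idea only; the real ZSTD_cwksp also tracks allocation phases, table validity and static/dynamic ownership.

#include <stddef.h>
#include <stdio.h>

typedef struct {
    char* begin;       /* start of the workspace                  */
    char* end;         /* one past the end of the workspace       */
    char* frontTop;    /* objects/tables allocated up to here     */
    char* backBottom;  /* buffers/aligned allocated down to here  */
} toy_wksp;

static void toy_init(toy_wksp* ws, void* mem, size_t size)
{
    ws->begin = (char*)mem;
    ws->end = ws->begin + size;
    ws->frontTop = ws->begin;
    ws->backBottom = ws->end;
}

/* tables grow upward from the front */
static void* toy_reserve_front(toy_wksp* ws, size_t bytes)
{
    if ((size_t)(ws->backBottom - ws->frontTop) < bytes) return NULL;
    { void* const p = ws->frontTop; ws->frontTop += bytes; return p; }
}

/* buffers grow downward from the back */
static void* toy_reserve_back(toy_wksp* ws, size_t bytes)
{
    if ((size_t)(ws->backBottom - ws->frontTop) < bytes) return NULL;
    ws->backBottom -= bytes;
    return ws->backBottom;
}

int main(void)
{
    static char mem[1024];
    toy_wksp ws;
    toy_init(&ws, mem, sizeof(mem));
    printf("table  : %p\n", toy_reserve_front(&ws, 256));
    printf("buffer : %p\n", toy_reserve_back(&ws, 512));
    printf("too big: %p\n", toy_reserve_back(&ws, 512)); /* only 256 bytes left -> NULL */
    return 0;
}

Resetting for the next compression then only has to move the front pointer back to the end of the objects and reset the back pointer, which is what lets init-once buffers and tables be reused across compressions without re-initializing them each time.
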
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,14 +184,16 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* + * Align must be a power of 2. + */ +-MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { ++MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t align) { + size_t const mask = align - 1; +- assert((align & mask) == 0); ++ assert(ZSTD_isPower2(align)); + return (size + mask) & ~mask; + } + +@@ -189,7 +207,7 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { + * to figure out how much space you need for the matchState tables. Everything + * else is though. + * +- * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). ++ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned64_alloc_size(). + */ + MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + if (size == 0) +@@ -197,12 +215,16 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + return size; + } + ++MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size, size_t alignment) { ++ return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, alignment)); ++} ++ + /* + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +-MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { +- return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); ++MEM_STATIC size_t ZSTD_cwksp_aligned64_alloc_size(size_t size) { ++ return ZSTD_cwksp_aligned_alloc_size(size, ZSTD_CWKSP_ALIGNMENT_BYTES); + } + + /* +@@ -210,14 +232,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. 
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -229,11 +247,23 @@ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; +- assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(ZSTD_isPower2(alignBytes)); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) ++{ ++ char* endPtr = (char*)ws->workspaceEnd; ++ assert(ZSTD_isPower2(ZSTD_CWKSP_ALIGNMENT_BYTES)); ++ endPtr = endPtr - ((size_t)endPtr % ZSTD_CWKSP_ALIGNMENT_BYTES); ++ return (void*)endPtr; ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -246,7 +276,7 @@ ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) + { + void* const alloc = (BYTE*)ws->allocStart - bytes; + void* const bottom = ws->tableEnd; +- DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", ++ DEBUGLOG(5, "cwksp: reserving [0x%p]:%zd bytes; %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); +@@ -274,27 +304,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +321,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +335,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -345,29 +366,61 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). + */ +-MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) + { +- void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), +- ZSTD_cwksp_alloc_aligned); +- assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. */ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned64(ZSTD_cwksp* ws, size_t bytes) ++{ ++ void* const ptr = ZSTD_cwksp_reserve_internal(ws, ++ ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), ++ ZSTD_cwksp_alloc_aligned); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return ptr; + } + + /* + * Aligned on 64 bytes. 
These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -387,7 +440,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + + + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); +- assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return alloc; + } + +@@ -421,6 +474,20 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) + + return alloc; + } ++/* ++ * with alignment control ++ * Note : should happen only once, at workspace first initialization ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_object_aligned(ZSTD_cwksp* ws, size_t byteSize, size_t alignment) ++{ ++ size_t const mask = alignment - 1; ++ size_t const surplus = (alignment > sizeof(void*)) ? alignment - sizeof(void*) : 0; ++ void* const start = ZSTD_cwksp_reserve_object(ws, byteSize + surplus); ++ if (start == NULL) return NULL; ++ if (surplus == 0) return start; ++ assert(ZSTD_isPower2(alignment)); ++ return (void*)(((size_t)start + surplus) & ~mask); ++} + + MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) + { +@@ -451,7 +518,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -460,7 +527,8 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + * Invalidates table allocations. + * All other allocations remain valid. 
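
The new ZSTD_cwksp_reserve_object_aligned() above relies on a common over-allocate-then-round trick: reserve byteSize plus (alignment - sizeof(void*)) extra bytes, then snap the returned pointer to the requested alignment, counting on the object area already being pointer-aligned. A small numeric illustration of that rounding; the start addresses are invented for the example and the printed waste assumes 8-byte pointers.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    size_t const alignment = 64;
    size_t const mask = alignment - 1;
    size_t const surplus = alignment - sizeof(void*);   /* extra bytes reserved       */
    uintptr_t const starts[] = { 0x1000, 0x1008, 0x1030 }; /* pointer-aligned starts  */
    size_t i;
    for (i = 0; i < sizeof(starts)/sizeof(starts[0]); i++) {
        /* same expression as the hunk: add the surplus, then round down */
        uintptr_t const aligned = (starts[i] + surplus) & ~(uintptr_t)mask;
        printf("start=0x%lx -> aligned=0x%lx (skipped %lu bytes)\n",
               (unsigned long)starts[i], (unsigned long)aligned,
               (unsigned long)(aligned - starts[i]));
    }
    return 0;
}
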
+ */ +-MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { ++MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) ++{ + DEBUGLOG(4, "cwksp: clearing tables!"); + + +@@ -478,14 +546,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +575,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +607,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +619,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
+- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +@@ -591,5 +654,4 @@ MEM_STATIC void ZSTD_cwksp_bump_oversized_duration( + } + } + +- + #endif /* ZSTD_CWKSP_H */ +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..995e83f3a183 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,13 +85,26 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) + { + ZSTD_compressionParameters const* cParams = &ms->cParams; +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -88,9 +143,14 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ ++ const BYTE* matchs0_safe; /* matchs0 or safe address */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ ++ /* Array of ~random data, should have low probability of matching data ++ * we load from here instead of from tables, if matchl0/matchl1 are ++ * invalid indices. Used to avoid unpredictable branches. 
*/ ++ const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + +@@ -100,8 +160,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,30 +191,35 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + +- if (idxl0 > prefixLowestIndex) { ++ /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. ++ * However expression below complies into conditional move. Since ++ * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex ++ * if there is a match, all branches become predictable. */ ++ { const BYTE* const matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); ++ + /* check prefix long match */ +- if (MEM_read64(matchl0) == MEM_read64(ip)) { ++ if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; +- } +- } ++ } } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + +- if (idxs0 > prefixLowestIndex) { +- /* check prefix short match */ +- if (MEM_read32(matchs0) == MEM_read32(ip)) { +- goto _search_next_long; +- } ++ /* Same optimization as matchl0 above */ ++ matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); ++ ++ /* check prefix short match */ ++ if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { ++ goto _search_next_long; + } + + if (ip1 >= nextStep) { +@@ -175,30 +240,36 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + + _search_next_long: + +- /* check prefix long +1 match */ +- if (idxl1 > prefixLowestIndex) { +- if (MEM_read64(matchl1) == MEM_read64(ip1)) { ++ /* short match found: let's check for a longer one */ ++ mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; ++ offset = (U32)(ip - matchs0); ++ ++ /* check long match at +1 position */ ++ if ((idxl1 > prefixLowestIndex) && (MEM_read64(matchl1) == MEM_read64(ip1))) { ++ size_t const l1len = ZSTD_count(ip1+8, matchl1+8, iend) + 8; ++ if (l1len > mLength) { ++ /* use the long match instead */ + ip = ip1; +- mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; ++ mLength = l1len; + offset = (U32)(ip-matchl1); +- while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ +- goto _match_found; ++ matchs0 = matchl1; + } + } + +- /* if no long +1 match, explore the short match we found */ +- mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; +- offset = (U32)(ip - matchs0); +- while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ ++ while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* complete backward */ + + /* fall-through */ + +@@ -217,7 +288,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +314,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -254,8 +325,9 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) + { +@@ -275,9 +347,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; +@@ -286,8 +357,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +366,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +387,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -323,26 +405,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ + + /* check repcode */ +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* 
repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +- if (matchIndexL > prefixLowestIndex) { ++ if ((matchIndexL >= prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + /* check prefix long match */ +- if (MEM_read64(matchLong) == MEM_read64(ip)) { +- mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; +- offset = (U32)(ip-matchLong); +- while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ +- goto _match_found; +- } +- } else { ++ mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; ++ offset = (U32)(ip-matchLong); ++ while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ ++ goto _match_found; ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -354,13 +434,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } } + + if (matchIndexS > prefixLowestIndex) { +- /* check prefix short match */ ++ /* short match candidate */ + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,25 +455,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + + /* check prefix long +1 match */ +- if (matchIndexL3 > prefixLowestIndex) { +- if (MEM_read64(matchL3) == MEM_read64(ip+1)) { +- mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; +- ip++; +- offset = (U32)(ip-matchL3); +- while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ +- goto _match_found; +- } +- } else { ++ if ((matchIndexL3 >= prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1))) { ++ mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; ++ ip++; ++ offset = (U32)(ip-matchL3); ++ while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ ++ goto _match_found; ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + 
dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +498,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -443,12 +522,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +540,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -470,7 +549,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + + #define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ +@@ -488,7 +567,7 @@ ZSTD_GEN_DFAST_FN(dictMatchState, 7) + + + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -508,7 +587,7 @@ size_t ZSTD_compressBlock_doubleFast( + + + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -527,8 +606,10 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) + { +@@ -579,13 +660,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + size_t mLength; + hashSmall[hSmall] = 
hashLong[hLong] = curr; /* update hash table */ + +- if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ ++ if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +677,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +702,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -647,13 +728,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + & (offset_2 <= current2 - dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -677,7 +758,7 @@ ZSTD_GEN_DFAST_FN(extDict, 6) + ZSTD_GEN_DFAST_FN(extDict, 7) + + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -694,3 +775,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..011556ce56f7 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,22 +12,32 @@ + #ifndef ZSTD_DOUBLE_FAST_H + #define ZSTD_DOUBLE_FAST_H + +- + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + #endif /* ZSTD_DOUBLE_FAST_H */ +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..60e07e839e5f 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann 
Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,60 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ ++ ++typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit); ++ ++static int ++ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* Array of ~random data, should have low probability of matching data. ++ * Load from here if the index is invalid. ++ * Used to avoid unpredictable branches. */ ++ static const BYTE dummy[] = {0x12,0x34,0x56,0x78}; ++ ++ /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. ++ * However expression below compiles into conditional move. 
++ */ ++ const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy); ++ /* Note: this used to be written as : return test1 && test2; ++ * Unfortunately, once inlined, these tests become branches, ++ * in which case it becomes critical that they are executed in the right order (test1 then test2). ++ * So we have to write these tests in a specific manner to ensure their ordering. ++ */ ++ if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; ++ /* force ordering of these tests, which matters once the function is inlined, as they become branches */ ++ __asm__(""); ++ return matchIdx >= idxLowLimit; ++} ++ ++static int ++ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* using a branch instead of a cmov, ++ * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true, ++ * aka almost all candidates are within range */ ++ U32 mval; ++ if (matchIdx >= idxLowLimit) { ++ mval = MEM_read32(matchAddress); ++ } else { ++ mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ ++ } ++ ++ return (MEM_read32(currentPtr) == mval); ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,17 +186,17 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + * + * This is also the work we do at the beginning to enter the loop initially. + */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +- U32 const mls, U32 const hasStep) ++ U32 const mls, int useCmov) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; +- /* support stepSize of 0 */ +- size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2; ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */ + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); +@@ -117,12 +214,11 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +- U32 idx; /* match idx for ip0 */ +- U32 mval; /* src value at match idx */ ++ U32 matchIdx; /* match idx for ip0 */ + + U32 offcode; + const BYTE* match0; +@@ -135,14 +231,15 @@ ZSTD_compressBlock_fast_noDict_generic( + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); ++ const ZSTD_match4Found matchFound = useCmov ? 
ZSTD_match4Found_cmov : ZSTD_match4Found_branch; + + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); + ip0 += (ip0 == prefixStart); + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -163,7 +260,7 @@ ZSTD_compressBlock_fast_noDict_generic( + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + +- idx = hashTable[hash0]; ++ matchIdx = hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ +@@ -180,26 +277,28 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* Write next hash table entry: it's already calculated. ++ * This write is known to be safe because ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry (it's already calculated). ++ * This write is known to be safe because the ip1 == ip0 + 1, ++ * so searching will resume after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); + +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ + goto _offset; + } + + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -214,21 +313,19 @@ ZSTD_compressBlock_fast_noDict_generic( + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } +- +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry, since it's already calculated */ ++ if (step <= 4) { ++ /* Avoid writing an index if it's >= position where search will resume. ++ * The minimum possible match has length 4, so search can resume at ip0 + 4. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } + goto _offset; + } + + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -250,13 +347,28 @@ ZSTD_compressBlock_fast_noDict_generic( + } while (ip3 < ilimit); + + _cleanup: +- /* Note that there are probably still a couple positions we could search. ++ /* Note that there are probably still a couple positions one could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. 
We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -264,10 +376,10 @@ ZSTD_compressBlock_fast_noDict_generic( + _offset: /* Requires: ip0, idx */ + + /* Compute the offset code. */ +- match0 = base + idx; ++ match0 = base + matchIdx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +399,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +413,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) 
*/ + } } } +@@ -314,12 +421,12 @@ ZSTD_compressBlock_fast_noDict_generic( + goto _start; + } + +-#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ +- static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \ ++ static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ +- return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ ++ return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \ + } + + ZSTD_GEN_FAST_FN(noDict, 4, 1) +@@ -333,13 +440,15 @@ ZSTD_GEN_FAST_FN(noDict, 6, 0) + ZSTD_GEN_FAST_FN(noDict, 7, 0) + + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- U32 const mls = ms->cParams.minMatch; ++ U32 const mml = ms->cParams.minMatch; ++ /* use cmov when "candidate in range" branch is likely unpredictable */ ++ int const useCmov = ms->cParams.windowLog < 19; + assert(ms->dictMatchState == NULL); +- if (ms->cParams.targetLength > 1) { +- switch(mls) ++ if (useCmov) { ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -352,7 +461,8 @@ size_t ZSTD_compressBlock_fast( + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); + } + } else { +- switch(mls) ++ /* use a branch instead */ ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -364,13 +474,13 @@ size_t ZSTD_compressBlock_fast( + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); + } +- + } + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -380,16 +490,16 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; + const U32* const dictHashTable = dms->hashTable; + const U32 dictStartIndex = dms->window.dictLimit; +@@ -397,13 +507,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - 
prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. */ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +523,154 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? ++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) { ++ /* found a regular match of size >= 4 */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* 
repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; +- if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -525,7 +683,7 @@ ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) + ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -545,19 +703,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +729,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const 
BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +760,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? 
dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. */ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0)) ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -656,10 +964,11 @@ ZSTD_GEN_FAST_FN(extDict, 6, 0) + ZSTD_GEN_FAST_FN(extDict, 7, 0) + + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..04fde0a72a4e 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,21 +12,20 @@ + #ifndef ZSTD_FAST_H + #define ZSTD_FAST_H + +- + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- + #endif /* ZSTD_FAST_H */ +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..88e2501fe3ef 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_MatchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,9 +160,10 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( +- const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( ++ const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + size_t bestLength, +@@ -159,7 +171,7 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +- const ZSTD_matchState_t * const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static 
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,24 +391,25 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* + * Dedicated dict search + ***********************************/ + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); +@@ -514,7 +528,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B + */ + FORCE_INLINE_TEMPLATE + size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, +- const ZSTD_matchState_t* const dms, ++ const ZSTD_MatchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { +@@ -561,7 +575,7 @@ size_t 
ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( +- ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( ++ ZSTD_MatchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,21 +648,25 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; + } + +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +@@ -670,7 +690,7 @@ size_t ZSTD_HcFindBestMatch( + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch + ? 
dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. 
+- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. 
+- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -965,13 +947,41 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) { + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask = (1u << rowLog) - 1; + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. ++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". 
So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 
0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,29 +1124,30 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every context reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by introducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. +- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. 
+ */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,11 +1165,14 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1301,7 +1341,7 @@ size_t ZSTD_RowFindBestMatch( + * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a +- * bunch of constants from the ZSTD_matchState_t. These computations could be ++ * bunch of constants from the ZSTD_MatchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. +@@ -1325,7 +1365,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ +@@ -1335,7 +1375,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1345,7 +1385,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1446,7 +1486,7 @@ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searc + * If a match is found its offset is stored in @p offsetPtr. 
+ */ + FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, +@@ -1472,9 +1512,10 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth, +@@ -1491,12 +1532,13 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; + const int isDxS = isDMS || isDDS; +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; + const BYTE* const dictBase = isDxS ? dms->window.base : NULL; + const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1548,7 +1591,7 @@ ZSTD_compressBlock_lazy_generic( + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,34 +1631,34 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,34 +1667,34 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1682,12 +1741,12 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatch = repIndex < prefixLowestIndex ? 
+ dictBase - dictIndexDelta + repIndex : + base + repIndex; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,168 +1760,183 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return 
ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t 
ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || 
!defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1912,7 +1987,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const U32 repIndex = (U32)(curr+1 - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ + if (MEM_read32(ip+1) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,30 +2023,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,50 +2055,57 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2023,14 +2114,14 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const U32 repIndex = repCurrent - offset_2; + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -2045,58 +2136,65 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return 
ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..987a036d8bde 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LAZY_H + #define ZSTD_LAZY_H + +- + #include "zstd_compress_internal.h" + + /* +@@ -22,98 +22,173 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip); ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip); + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). 
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++size_t ZSTD_compressBlock_greedy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void 
const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( ++ ZSTD_MatchState_t* ms, 
SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- + ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..54eefad9cae6 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,7 @@ + #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ + #include "zstd_ldm_geartab.h" + +-#define LDM_BUCKET_SIZE_LOG 3 ++#define LDM_BUCKET_SIZE_LOG 4 + #define LDM_MIN_MATCH_LENGTH 64 + #define LDM_HASH_RLOG 7 + +@@ -133,21 +134,35 @@ static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, + } + + void ZSTD_ldm_adjustParameters(ldmParams_t* params, +- ZSTD_compressionParameters const* cParams) ++ const ZSTD_compressionParameters* cParams) + { + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); +- if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; +- if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (params->hashRateLog == 0) { ++ if (params->hashLog > 0) { ++ /* if params->hashLog is set, derive hashRateLog from it */ ++ assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ if (params->windowLog > params->hashLog) { ++ params->hashRateLog = params->windowLog - params->hashLog; ++ } ++ } else { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ /* mapping from [fast, rate7] to [btultra2, rate4] */ ++ params->hashRateLog = 7 - (cParams->strategy/3); ++ } ++ } + if (params->hashLog == 0) { +- params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); +- assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX); + } +- if (params->hashRateLog == 0) { +- params->hashRateLog = params->windowLog < params->hashLog +- ? 0 +- : params->windowLog - params->hashLog; ++ if (params->minMatchLength == 0) { ++ params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (cParams->strategy >= ZSTD_btultra) ++ params->minMatchLength /= 2; ++ } ++ if (params->bucketSizeLog==0) { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX); + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); + } +@@ -170,22 +185,22 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) + /* ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. 
*/ + static ldmEntry_t* ZSTD_ldm_getBucket( +- ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) ++ const ldmState_t* ldmState, size_t hash, U32 const bucketSizeLog) + { +- return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); ++ return ldmState->hashTable + (hash << bucketSizeLog); + } + + /* ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ + static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, +- ldmParams_t const ldmParams) ++ U32 const bucketSizeLog) + { + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + +- *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry; +- *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1)); ++ *(ZSTD_ldm_getBucket(ldmState, hash, bucketSizeLog) + offset) = entry; ++ *pOffset = (BYTE)((offset + 1) & ((1u << bucketSizeLog) - 1)); + + } + +@@ -234,7 +249,7 @@ static size_t ZSTD_ldm_countBackwardsMatch_2segments( + * + * The tables for the other strategies are filled within their + * block compressors. */ +-static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, ++static size_t ZSTD_ldm_fillFastTables(ZSTD_MatchState_t* ms, + void const* end) + { + const BYTE* const iend = (const BYTE*)end; +@@ -242,11 +257,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -269,7 +288,8 @@ void ZSTD_ldm_fillHashTable( + const BYTE* iend, ldmParams_t const* params) + { + U32 const minMatchLength = params->minMatchLength; +- U32 const hBits = params->hashLog - params->bucketSizeLog; ++ U32 const bucketSizeLog = params->bucketSizeLog; ++ U32 const hBits = params->hashLog - bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; +@@ -284,7 +304,7 @@ void ZSTD_ldm_fillHashTable( + unsigned n; + + numSplits = 0; +- hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); ++ hashed = ZSTD_ldm_gear_feed(&hashState, ip, (size_t)(iend - ip), splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { +@@ -295,7 +315,7 @@ void ZSTD_ldm_fillHashTable( + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); +- ZSTD_ldm_insertEntry(ldmState, hash, entry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, entry, params->bucketSizeLog); + } + } + +@@ -309,7 +329,7 @@ void ZSTD_ldm_fillHashTable( + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). 
*/ +-static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) ++static void ZSTD_ldm_limitTableUpdate(ZSTD_MatchState_t* ms, const BYTE* anchor) + { + U32 const curr = (U32)(anchor - ms->window.base); + if (curr > ms->nextToUpdate + 1024) { +@@ -318,8 +338,10 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( +- ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( ++ ldmState_t* ldmState, RawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { + /* LDM parameters */ +@@ -373,7 +395,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); +- candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params); ++ candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, params->bucketSizeLog); + PREFETCH_L1(candidates[n].bucket); + } + +@@ -396,7 +418,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -443,7 +465,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -464,7 +486,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + + anchor = split + forwardMatchLength; + +@@ -503,7 +525,7 @@ static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + } + + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldmState, rawSeqStore_t* sequences, ++ ldmState_t* ldmState, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) + { + U32 const maxDist = 1U << params->windowLog; +@@ -549,7 +571,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -580,7 +602,7 @@ size_t ZSTD_ldm_generateSequences( + } + + void +-ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) ++ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) + { + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; +@@ -616,7 +638,7 @@ ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const min + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. 
+ */ +-static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, ++static rawSeq maybeSplitSequence(RawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) + { + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; +@@ -640,7 +662,7 @@ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, + return sequence; + } + +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; +@@ -657,14 +679,14 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { + } + } + +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; +- ZSTD_blockCompressor const blockCompressor = ++ ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; +@@ -689,7 +711,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +723,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +733,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..41400a7191b2 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LDM_H + #define ZSTD_LDM_H + +- + #include "zstd_compress_internal.h" /* ldmParams_t, U32 */ + #include <linux/zstd.h> /* ZSTD_CCtx, size_t */ + +@@ -40,7 +40,7 @@ void ZSTD_ldm_fillHashTable( + * sequences. + */ + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldms, rawSeqStore_t* sequences, ++ ldmState_t* ldms, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + + /* +@@ -61,9 +61,9 @@ size_t ZSTD_ldm_generateSequences( + * two.
We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. + */ +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize); + + /* +@@ -73,7 +73,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, ++void ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + + /* ZSTD_ldm_skipRawSeqStoreBytes(): +@@ -81,7 +81,7 @@ void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, + * Not to be used in conjunction with ZSTD_ldm_skipSequences(). + * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes); ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes); + + /* ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables or 0 if LDM is +@@ -107,5 +107,4 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); + void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +- + #endif /* ZSTD_FAST_H */ +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..b62fd1b0d83e 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,11 +13,14 @@ + #include "hist.h" + #include "zstd_opt.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +30,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s<lastEltIndex+1; s++) { +- table[s] = 1 + (table[s] >> shift); +- sum += table[s]; ++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics 
*/ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
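
Illustration only, not part of the patch: the hunks above keep the fixed-point cost model that WEIGHT()/ZSTD_fracWeight() implement, where a weight is roughly 256 * log2(stat) and a symbol's price is the difference of two such weights. The standalone sketch below mirrors only the shape of ZSTD_fracWeight() from the hunk above; the helper names and the example numbers are invented.

#include <assert.h>
#include <stdio.h>

#define BITCOST_ACCURACY   8
#define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)

/* floor(log2(v)) for v > 0; portable stand-in for ZSTD_highbit32() */
static unsigned highbit32(unsigned v)
{
    unsigned n = 0;
    assert(v > 0);
    while (v >>= 1) n++;
    return n;
}

/* same shape as ZSTD_fracWeight(): ~256 * log2(stat), in fixed point */
static unsigned fracWeight(unsigned rawStat)
{
    unsigned const stat = rawStat + 1;
    unsigned const hb = highbit32(stat);
    unsigned const BWeight = hb * BITCOST_MULTIPLIER;
    unsigned const FWeight = (stat << BITCOST_ACCURACY) >> hb;  /* in [256,512) */
    return BWeight + FWeight;
}

int main(void)
{
    /* a symbol seen 125 times out of 1000 costs about log2(1000/125) = 3 bits */
    unsigned const price = fracWeight(1000) - fracWeight(125);
    printf("price = %u (1/256-bit units), ~%.2f bits\n", price, price / 256.0);
    return 0;
}
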
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_MatchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,8 +438,10 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( +- const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( ++ const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, + U32 const mls, const int extDict) +@@ -527,15 +559,16 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -548,20 +581,23 @@ void ZSTD_updateTree_internal( + ms->nextToUpdate = target; + } + +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_MatchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -590,7 +626,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + +- const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; ++ const ZSTD_MatchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? 
dms->window.base : NULL; +@@ -629,13 +665,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + assert(curr >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */ +- & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */ +- & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -784,7 +820,7 @@ U32 
ZSTD_insertBtAndGetAllMatches ( + + typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, +- ZSTD_matchState_t*, ++ ZSTD_MatchState_t*, + U32*, + const BYTE*, + const BYTE*, +@@ -792,9 +828,11 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, +@@ -817,7 +855,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( + #define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ +@@ -849,7 +887,7 @@ GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + } + + static ZSTD_getAllMatchesFn +-ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode) ++ZSTD_selectBtGetAllMatches(ZSTD_MatchState_t const* ms, ZSTD_dictMode_e const dictMode) + { + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), +@@ -868,7 +906,7 @@ ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const di + + /* Struct containing info needed to make decision about ldm inclusion */ + typedef struct { +- rawSeqStore_t seqStore; /* External match candidates store for this block */ ++ RawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ +@@ -878,7 +916,7 @@ typedef struct { + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. + */ +-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) ++static void ZSTD_optLdm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) + { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { +@@ -935,7 +973,7 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock + return; + } + +- /* Matches may be < MINMATCH by this process. In that case, we will reject them ++ /* Matches may be < minMatch by this process. In that case, we will reject them + when we are deciding whether or not to add the ldm */ + optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining; + optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining; +@@ -957,25 +995,26 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock + * into 'matches'. Maintains the correct ordering of 'matches'. 
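
Illustration only, not part of the patch: many hunks in this file are the rename from offCode/STORE_* to offBase/*_TO_OFFBASE (for example candidateOffBase above). The sketch below shows the "sumtype" convention those names refer to, with the macros re-declared locally for the example; the real definitions live elsewhere in the zstd library and may differ in detail (asserts, exact spelling).

#include <assert.h>

#define ZSTD_REP_NUM 3
/* one U32 carries either a repcode (1..3) or a real offset, shifted by 3 */
#define REPCODE_TO_OFFBASE(r)  (r)                    /* expects 1 <= r <= 3 */
#define OFFSET_TO_OFFBASE(o)   ((o) + ZSTD_REP_NUM)   /* expects o > 0 */
#define OFFBASE_IS_REPCODE(ob) ((ob) >= 1 && (ob) <= ZSTD_REP_NUM)
#define OFFBASE_TO_OFFSET(ob)  ((ob) - ZSTD_REP_NUM)

int main(void)
{
    unsigned const fromRep    = REPCODE_TO_OFFBASE(2);    /* repcode 2   -> 2    */
    unsigned const fromOffset = OFFSET_TO_OFFBASE(1024);  /* real offset -> 1027 */
    assert(OFFBASE_IS_REPCODE(fromRep));
    assert(!OFFBASE_IS_REPCODE(fromOffset));
    assert(OFFBASE_TO_OFFSET(fromOffset) == 1024);
    return 0;
}
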
+ */ + static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, +- const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) ++ const ZSTD_optLdm_t* optLdm, U32 currPosInBlock, ++ U32 minMatch) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ + if (currPosInBlock < optLdm->startPosInBlock + || currPosInBlock >= optLdm->endPosInBlock +- || candidateMatchLength < MINMATCH) { ++ || candidateMatchLength < minMatch) { + return; + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -986,7 +1025,8 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + static void + ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, +- U32 currPosInBlock, U32 remainingBytes) ++ U32 currPosInBlock, U32 remainingBytes, ++ U32 minMatch) + { + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + return; +@@ -1003,7 +1043,7 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + } + ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); + } +- ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); ++ ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock, minMatch); + } + + +@@ -1011,11 +1051,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,9 +1068,15 @@ listStats(const U32* table, int lastEltID) + + #endif + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ++ZSTD_compressBlock_opt_generic(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, +@@ -1059,9 +1100,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ 
ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1125,140 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip), ++ minMatch); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. */ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = 
pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%i==rPos:%u", (int)(inr-istart), cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, ++ ZSTD_optimal_t const prevMatch = opt[cur]; ++ DEBUGLOG(7, "cPos:%i==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(Repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%i==rPos:%u : literal would cost more (%.2f>%.2f)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
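
Illustration only, not part of the patch: the LIT_PRICE/LL_INCPRICE bookkeeping introduced above extends a literal run one position at a time, adding the literal's own cost plus the marginal litLength cost. The toy program below, with made-up cost functions, shows why the running total equals pricing the whole run at once (the litLength terms telescope).

#include <assert.h>

/* toy costs, purely illustrative */
static int lit_price(unsigned char c) { return 6 + (c & 3); }
static int ll_price(unsigned litlen)  { return (int)(2 * litlen + (litlen ? 4 : 0)); }
#define LL_INCPRICE(l) (ll_price(l) - ll_price((l) - 1))

int main(void)
{
    unsigned char const run[] = { 'a', 'b', 'c', 'd' };
    unsigned const n = (unsigned)sizeof(run);
    int incremental = ll_price(0);               /* start from an empty run */
    int fromScratch = ll_price(n);
    unsigned i;
    for (i = 0; i < n; i++) {
        incremental += lit_price(run[i]) + LL_INCPRICE(i + 1);  /* one literal at a time */
        fromScratch += lit_price(run[i]);
    }
    assert(incremental == fromScratch);          /* litLength terms telescope */
    return 0;
}
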
+ */ +- ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); ++ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(Repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); +- ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); ++ ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(Repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,38 +1268,37 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(inr-istart), (U32)(iend-inr)); ++ (U32)(inr-istart), (U32)(iend-inr), ++ minMatch); + + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%i==rPos:%u, found %u matches, of longest ML=%u", ++ (int)(inr-istart), cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1309,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,55 +1335,89 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ Repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(Repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(Repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; +- DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", +- anchor - istart, (unsigned)llen, (unsigned)mlen); ++ DEBUGLOG(6, "considering seq starting at %i, llen=%u, mlen=%u", ++ (int)(anchor - istart), (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ +@@ -1308,11 +1426,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,42 +1441,51 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + /* Return the last literals size */ + return 
(size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1498,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1378,7 +1508,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + } + + size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); +@@ -1386,16 +1516,16 @@ size_t ZSTD_compressBlock_btultra( + } + + size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. 
++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,42 +1534,47 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..fbdc540ec9d1 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,40 +12,62 @@ + #ifndef ZSTD_OPT_H + #define ZSTD_OPT_H + +- + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + #endif /* ZSTD_OPT_H */ +diff --git 
a/lib/zstd/compress/zstd_preSplit.c b/lib/zstd/compress/zstd_preSplit.c +new file mode 100644 +index 000000000000..7d9403c9a3bc +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.c +@@ -0,0 +1,239 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#include "../common/compiler.h" /* ZSTD_ALIGNOF */ ++#include "../common/mem.h" /* S64 */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/zstd_internal.h" /* ZSTD_STATIC_ASSERT */ ++#include "hist.h" /* HIST_add */ ++#include "zstd_preSplit.h" ++ ++ ++#define BLOCKSIZE_MIN 3500 ++#define THRESHOLD_PENALTY_RATE 16 ++#define THRESHOLD_BASE (THRESHOLD_PENALTY_RATE - 2) ++#define THRESHOLD_PENALTY 3 ++ ++#define HASHLENGTH 2 ++#define HASHLOG_MAX 10 ++#define HASHTABLESIZE (1 << HASHLOG_MAX) ++#define HASHMASK (HASHTABLESIZE - 1) ++#define KNUTH 0x9e3779b9 ++ ++/* for hashLog > 8, hash 2 bytes. ++ * for hashLog == 8, just take the byte, no hashing. ++ * The speed of this method relies on compile-time constant propagation */ ++FORCE_INLINE_TEMPLATE unsigned hash2(const void *p, unsigned hashLog) ++{ ++ assert(hashLog >= 8); ++ if (hashLog == 8) return (U32)((const BYTE*)p)[0]; ++ assert(hashLog <= HASHLOG_MAX); ++ return (U32)(MEM_read16(p)) * KNUTH >> (32 - hashLog); ++} ++ ++ ++typedef struct { ++ unsigned events[HASHTABLESIZE]; ++ size_t nbEvents; ++} Fingerprint; ++typedef struct { ++ Fingerprint pastEvents; ++ Fingerprint newEvents; ++} FPStats; ++ ++static void initStats(FPStats* fpstats) ++{ ++ ZSTD_memset(fpstats, 0, sizeof(FPStats)); ++} ++ ++FORCE_INLINE_TEMPLATE void ++addEvents_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ const char* p = (const char*)src; ++ size_t limit = srcSize - HASHLENGTH + 1; ++ size_t n; ++ assert(srcSize >= HASHLENGTH); ++ for (n = 0; n < limit; n+=samplingRate) { ++ fp->events[hash2(p+n, hashLog)]++; ++ } ++ fp->nbEvents += limit/samplingRate; ++} ++ ++FORCE_INLINE_TEMPLATE void ++recordFingerprint_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ ZSTD_memset(fp, 0, sizeof(unsigned) * ((size_t)1 << hashLog)); ++ fp->nbEvents = 0; ++ addEvents_generic(fp, src, srcSize, samplingRate, hashLog); ++} ++ ++typedef void (*RecordEvents_f)(Fingerprint* fp, const void* src, size_t srcSize); ++ ++#define FP_RECORD(_rate) ZSTD_recordFingerprint_##_rate ++ ++#define ZSTD_GEN_RECORD_FINGERPRINT(_rate, _hSize) \ ++ static void FP_RECORD(_rate)(Fingerprint* fp, const void* src, size_t srcSize) \ ++ { \ ++ recordFingerprint_generic(fp, src, srcSize, _rate, _hSize); \ ++ } ++ ++ZSTD_GEN_RECORD_FINGERPRINT(1, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(5, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(11, 9) ++ZSTD_GEN_RECORD_FINGERPRINT(43, 8) ++ ++ ++static U64 abs64(S64 s64) { return (U64)((s64 < 0) ? 
-s64 : s64); } ++ ++static U64 fpDistance(const Fingerprint* fp1, const Fingerprint* fp2, unsigned hashLog) ++{ ++ U64 distance = 0; ++ size_t n; ++ assert(hashLog <= HASHLOG_MAX); ++ for (n = 0; n < ((size_t)1 << hashLog); n++) { ++ distance += ++ abs64((S64)fp1->events[n] * (S64)fp2->nbEvents - (S64)fp2->events[n] * (S64)fp1->nbEvents); ++ } ++ return distance; ++} ++ ++/* Compare newEvents with pastEvents ++ * return 1 when considered "too different" ++ */ ++static int compareFingerprints(const Fingerprint* ref, ++ const Fingerprint* newfp, ++ int penalty, ++ unsigned hashLog) ++{ ++ assert(ref->nbEvents > 0); ++ assert(newfp->nbEvents > 0); ++ { U64 p50 = (U64)ref->nbEvents * (U64)newfp->nbEvents; ++ U64 deviation = fpDistance(ref, newfp, hashLog); ++ U64 threshold = p50 * (U64)(THRESHOLD_BASE + penalty) / THRESHOLD_PENALTY_RATE; ++ return deviation >= threshold; ++ } ++} ++ ++static void mergeEvents(Fingerprint* acc, const Fingerprint* newfp) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ acc->events[n] += newfp->events[n]; ++ } ++ acc->nbEvents += newfp->nbEvents; ++} ++ ++static void flushEvents(FPStats* fpstats) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ fpstats->pastEvents.events[n] = fpstats->newEvents.events[n]; ++ } ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents; ++ ZSTD_memset(&fpstats->newEvents, 0, sizeof(fpstats->newEvents)); ++} ++ ++static void removeEvents(Fingerprint* acc, const Fingerprint* slice) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ assert(acc->events[n] >= slice->events[n]); ++ acc->events[n] -= slice->events[n]; ++ } ++ acc->nbEvents -= slice->nbEvents; ++} ++ ++#define CHUNKSIZE (8 << 10) ++static size_t ZSTD_splitBlock_byChunks(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ static const RecordEvents_f records_fs[] = { ++ FP_RECORD(43), FP_RECORD(11), FP_RECORD(5), FP_RECORD(1) ++ }; ++ static const unsigned hashParams[] = { 8, 9, 10, 10 }; ++ const RecordEvents_f record_f = (assert(0<=level && level<=3), records_fs[level]); ++ FPStats* const fpstats = (FPStats*)workspace; ++ const char* p = (const char*)blockStart; ++ int penalty = THRESHOLD_PENALTY; ++ size_t pos = 0; ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ record_f(&fpstats->pastEvents, p, CHUNKSIZE); ++ for (pos = CHUNKSIZE; pos <= blockSize - CHUNKSIZE; pos += CHUNKSIZE) { ++ record_f(&fpstats->newEvents, p + pos, CHUNKSIZE); ++ if (compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, penalty, hashParams[level])) { ++ return pos; ++ } else { ++ mergeEvents(&fpstats->pastEvents, &fpstats->newEvents); ++ if (penalty > 0) penalty--; ++ } ++ } ++ assert(pos == blockSize); ++ return blockSize; ++ (void)flushEvents; (void)removeEvents; ++} ++ ++/* ZSTD_splitBlock_fromBorders(): very fast strategy : ++ * compare fingerprint from beginning and end of the block, ++ * derive from their difference if it's preferable to split in the middle, ++ * repeat the process a second time, for finer grained decision. ++ * 3 times did not brought improvements, so I stopped at 2. ++ * Benefits are good enough for a cheap heuristic. ++ * More accurate splitting saves more, but speed impact is also more perceptible. 
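
Illustration only, not part of the patch: fpDistance() above compares two event histograms after cross-scaling each bucket by the other histogram's total, so segments of different lengths stay comparable, and compareFingerprints() flags a split when that deviation reaches a fraction of nbEvents1 * nbEvents2. A standalone sketch of the same idea, using plain byte histograms and leaving the penalty term out; all names here are invented.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint32_t events[256]; uint64_t nbEvents; } Hist;

static void histAdd(Hist* h, const void* src, size_t size)
{
    const unsigned char* p = (const unsigned char*)src;
    size_t n;
    for (n = 0; n < size; n++) h->events[p[n]]++;
    h->nbEvents += size;
}

/* sum over buckets of |e1[n]*N2 - e2[n]*N1|, as in fpDistance() */
static uint64_t histDistance(const Hist* a, const Hist* b)
{
    uint64_t d = 0;
    int n;
    for (n = 0; n < 256; n++) {
        int64_t const diff = (int64_t)a->events[n] * (int64_t)b->nbEvents
                           - (int64_t)b->events[n] * (int64_t)a->nbEvents;
        d += (uint64_t)(diff < 0 ? -diff : diff);
    }
    return d;
}

int main(void)
{
    unsigned char text[512], zeros[512];
    Hist h1, h2;
    size_t i;
    for (i = 0; i < sizeof(text); i++) text[i] = (unsigned char)('a' + (i % 26));
    memset(zeros, 0, sizeof(zeros));
    memset(&h1, 0, sizeof(h1));
    memset(&h2, 0, sizeof(h2));
    histAdd(&h1, text, sizeof(text));
    histAdd(&h2, zeros, sizeof(zeros));
    /* threshold mirrors compareFingerprints() with penalty == 0: 14/16 of N1*N2 */
    printf("distance=%llu split=%d\n",
           (unsigned long long)histDistance(&h1, &h2),
           histDistance(&h1, &h2) >= h1.nbEvents * h2.nbEvents * 14 / 16);
    return 0;
}
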
++ * For better accuracy, use more elaborate variant *_byChunks. ++ */ ++static size_t ZSTD_splitBlock_fromBorders(const void* blockStart, size_t blockSize, ++ void* workspace, size_t wkspSize) ++{ ++#define SEGMENT_SIZE 512 ++ FPStats* const fpstats = (FPStats*)workspace; ++ Fingerprint* middleEvents = (Fingerprint*)(void*)((char*)workspace + 512 * sizeof(unsigned)); ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ HIST_add(fpstats->pastEvents.events, blockStart, SEGMENT_SIZE); ++ HIST_add(fpstats->newEvents.events, (const char*)blockStart + blockSize - SEGMENT_SIZE, SEGMENT_SIZE); ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents = SEGMENT_SIZE; ++ if (!compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, 0, 8)) ++ return blockSize; ++ ++ HIST_add(middleEvents->events, (const char*)blockStart + blockSize/2 - SEGMENT_SIZE/2, SEGMENT_SIZE); ++ middleEvents->nbEvents = SEGMENT_SIZE; ++ { U64 const distFromBegin = fpDistance(&fpstats->pastEvents, middleEvents, 8); ++ U64 const distFromEnd = fpDistance(&fpstats->newEvents, middleEvents, 8); ++ U64 const minDistance = SEGMENT_SIZE * SEGMENT_SIZE / 3; ++ if (abs64((S64)distFromBegin - (S64)distFromEnd) < minDistance) ++ return 64 KB; ++ return (distFromBegin > distFromEnd) ? 32 KB : 96 KB; ++ } ++} ++ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ DEBUGLOG(6, "ZSTD_splitBlock (level=%i)", level); ++ assert(0<=level && level<=4); ++ if (level == 0) ++ return ZSTD_splitBlock_fromBorders(blockStart, blockSize, workspace, wkspSize); ++ /* level >= 1*/ ++ return ZSTD_splitBlock_byChunks(blockStart, blockSize, level-1, workspace, wkspSize); ++} +diff --git a/lib/zstd/compress/zstd_preSplit.h b/lib/zstd/compress/zstd_preSplit.h +new file mode 100644 +index 000000000000..f98f797fe191 +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.h +@@ -0,0 +1,34 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#ifndef ZSTD_PRESPLIT_H ++#define ZSTD_PRESPLIT_H ++ ++#include /* size_t */ ++ ++#define ZSTD_SLIPBLOCK_WORKSPACESIZE 8208 ++ ++/* ZSTD_splitBlock(): ++ * @level must be a value between 0 and 4. ++ * higher levels spend more energy to detect block boundaries. ++ * @workspace must be aligned for size_t. ++ * @wkspSize must be at least >= ZSTD_SLIPBLOCK_WORKSPACESIZE ++ * note: ++ * For the time being, this function only accepts full 128 KB blocks. ++ * Therefore, @blockSize must be == 128 KB. ++ * While this could be extended to smaller sizes in the future, ++ * it is not yet clear if this would be useful. TBD. 
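
Illustration only, not part of the patch: a hypothetical call site for the contract documented just above. ZSTD_splitBlock() is an internal helper, so the caller name and the workspace handling here are invented; only the documented constraints are assumed, namely a full 128 KB block, a level in 0..4, and a suitably aligned workspace of at least ZSTD_SLIPBLOCK_WORKSPACESIZE (8208) bytes.

#include <stddef.h>

size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize,
                       int level,
                       void* workspace, size_t wkspSize);

/* hypothetical helper: returns where a 128 KB block should be cut */
size_t pickSplitPoint(const void* block128K)
{
    /* a size_t array satisfies the documented alignment requirement */
    static size_t wksp[(8208 + sizeof(size_t) - 1) / sizeof(size_t)];  /* ZSTD_SLIPBLOCK_WORKSPACESIZE */
    return ZSTD_splitBlock(block128K, 128 * 1024, 0 /* cheapest heuristic */,
                           wksp, sizeof(wksp));
}
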
++ */ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize); ++ ++#endif /* ZSTD_PRESPLIT_H */ +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..ac8b87f48f84 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. 
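For readers unfamiliar with this pattern: the hot loop is compiled with a per-function target attribute, and the call is gated on a runtime flag so the attributed code never executes on CPUs that lack the extension. A hedged sketch in plain C (the attribute is gcc/clang specific; the flag names below are invented, the real ones are HUF_flags_bmi2 and HUF_flags_disableFast):

#include <stdio.h>

#if defined(__GNUC__) && defined(__x86_64__)
#  define FAST_BMI2_ATTRS __attribute__((target("bmi2")))
#else
#  define FAST_BMI2_ATTRS            /* attribute only exists for x86-64 gcc/clang */
#endif

enum { FLAG_BMI2 = 1, FLAG_DISABLE_FAST = 2 };

static FAST_BMI2_ATTRS unsigned decode_fast(unsigned x)     { return x * 2u; }  /* placeholder body */
static unsigned                 decode_fallback(unsigned x) { return x + x;  }  /* portable fallback */

static unsigned decode(unsigned x, int flags)
{
    /* The fast body is only entered when the caller's flags say the CPU has
     * BMI2 and the fast path is not disabled; otherwise take the fallback. */
    if ((flags & FLAG_BMI2) && !(flags & FLAG_DISABLE_FAST))
        return decode_fast(x);
    return decode_fallback(x);
}

int main(void) { printf("%u\n", decode(21, 0)); return 0; }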
++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. 
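The init above is easiest to see with a concrete container: OR a 1 into the lowest bit, and every left shift that consumes bits from the MSB side leaves an equal number of trailing zeros at the bottom, so one count-trailing-zeros later recovers the total consumption and lets the loop rewind the input pointer by whole bytes. A toy demonstration (little-endian host assumed, __builtin_ctzll standing in for ZSTD_countTrailingZeros64, data is made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t read_le64(const uint8_t* p) { uint64_t v; memcpy(&v, p, 8); return v; } /* host assumed little-endian */

int main(void)
{
    uint8_t buf[16] = { 0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,
                        0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff,0x01 };
    const uint8_t* ip = buf + 8;              /* container currently loaded from buf[8..15] */
    uint64_t bits = read_le64(ip) | 1;        /* sentinel in the lowest bit */
    int consumed = 0;

    /* pretend the decoder consumed 11 + 9 + 7 = 27 bits from the MSB side */
    bits <<= 11; consumed += 11;
    bits <<=  9; consumed +=  9;
    bits <<=  7; consumed +=  7;

    {   int const ctz     = __builtin_ctzll(bits);  /* == consumed, thanks to the sentinel */
        int const nbBits  = ctz & 7;                /* bits consumed past the last byte boundary */
        int const nbBytes = ctz >> 3;               /* whole bytes consumed */
        printf("ctz=%d (expected %d)\n", ctz, consumed);
        ip  -= nbBytes;                             /* rewind, then reload a full 8 bytes */
        bits = read_le64(ip) | 1;
        bits <<= nbBits;                            /* restore the sub-byte alignment */
        printf("reloaded from offset %d, %d bits pre-shifted\n", (int)(ip - buf), nbBits);
    }
    return 0;
}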
+- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. +- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). 
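The layout being parsed here is worth spelling out: the four Huffman streams are stored back to back, prefixed by a 6-byte jump table holding three little-endian 16-bit lengths, the fourth length being whatever remains. A self-contained sketch of that parsing and its bounds checks (helper names and buffer contents are invented):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static unsigned read_le16(const uint8_t* p) { return (unsigned)p[0] | ((unsigned)p[1] << 8); }

/* Fills start[4]/len[4]; returns 0 on success, -1 on corruption. */
static int split4(const uint8_t* src, size_t srcSize,
                  const uint8_t* start[4], size_t len[4])
{
    if (srcSize < 10) return -1;              /* jump table + at least 1 byte per stream */
    len[0] = read_le16(src);
    len[1] = read_le16(src + 2);
    len[2] = read_le16(src + 4);
    if (6 + len[0] + len[1] + len[2] > srcSize) return -1;   /* stream 4 would underflow */
    len[3] = srcSize - 6 - len[0] - len[1] - len[2];
    start[0] = src + 6;
    start[1] = start[0] + len[0];
    start[2] = start[1] + len[1];
    start[3] = start[2] + len[2];
    return 0;
}

int main(void)
{
    uint8_t blob[10] = { 1,0, 1,0, 1,0, 'a','b','c','d' };   /* three 1-byte streams + 1 byte left over */
    const uint8_t* start[4]; size_t len[4];
    if (split4(blob, sizeof(blob), start, len) == 0)
        printf("lengths: %zu %zu %zu %zu\n", len[0], len[1], len[2], len[3]);
    return 0;
}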
+- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) 
return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
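The point of olimit is to hoist all bounds checks out of the unrolled loop: compute once how many iterations are guaranteed safe given the remaining output and the input headroom, then run check-free until that budget is spent. A minimal sketch of that budget computation, with the X1 constants (5 symbols produced, at most 7 bytes consumed per stream per iteration); everything else is illustrative:

#include <stddef.h>
#include <stdio.h>

#define SYMBOLS_PER_ITER   5   /* bytes written per stream per iteration */
#define MAX_BYTES_PER_ITER 7   /* worst-case bytes read per stream per iteration */

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

/* outRemaining: bytes left in the last (smallest) output segment.
 * inHeadroom:   distance from the first stream's read pointer down to the
 *               lowest address the decoders are allowed to touch (ilowest). */
static size_t safe_iterations(size_t outRemaining, size_t inHeadroom)
{
    size_t const oiters = outRemaining / SYMBOLS_PER_ITER;
    size_t const iiters = inHeadroom   / MAX_BYTES_PER_ITER;
    return min_sz(oiters, iiters);
}

int main(void)
{
    /* e.g. 1000 output bytes left but only 100 bytes of input headroom:
     * the input side is the binding constraint here. */
    printf("%zu safe iterations\n", safe_iterations(1000, 100));
    return 0;
}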
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS ++_out: ++ ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. 
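Each HUF_4X1_DECODE_SYMBOL step is a single lookup: the top 11 bits of the container index a table whose 16-bit entries pack the decoded byte in bits 8..15 and the bit cost in the low bits. The following toy builds such a table for a degenerate 1-bit code (0 decodes to 'A', 1 to 'B'; real tables come from HUF_readDTableX1_wksp) and decodes a few symbols with exactly the index/shift arithmetic used above:

#include <stdint.h>
#include <stdio.h>

#define TABLE_LOG 11

int main(void)
{
    static uint16_t dtable[1u << TABLE_LOG];
    uint64_t bits;
    int i;

    /* a 1-bit symbol owns half of the 2048 entries: the top bit of the
     * 11-bit index equals the first bit of the stream */
    for (i = 0; i < (1 << TABLE_LOG); i++) {
        uint8_t const symbol = (i >> (TABLE_LOG - 1)) ? 'B' : 'A';
        dtable[i] = (uint16_t)((symbol << 8) | 1 /* nbBits */);
    }

    /* container holds the bit sequence 0,1,1,0,... starting at the MSB */
    bits = 0x6ull << 60;
    for (i = 0; i < 4; i++) {
        unsigned const index = (unsigned)(bits >> (64 - TABLE_LOG)); /* bits >> 53 in the real loop */
        unsigned const entry = dtable[index];
        putchar((int)((entry >> 8) & 0xFF));
        bits <<= (entry & 0x3F);   /* consume nbBits */
    }
    putchar('\n');   /* prints ABBA */
    return 0;
}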
+ */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. */ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- 
return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* 
DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
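The X2 variant always stores two bytes per lookup but advances the output pointer only by entry.length, so the second byte of a single-symbol entry is simply overwritten by the next write. A minimal illustration of that emit step with hand-built entries (real entries come from HUF_readDTableX2_wksp, and the nbBits fields are unused in this toy):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint16_t sequence; uint8_t nbBits; uint8_t length; } EltX2;

int main(void)
{
    /* "ab" as one 2-symbol entry, then 'c' as a 1-symbol entry */
    EltX2 e_ab, e_c;
    uint8_t out[8] = {0};
    uint8_t* op = out;

    memcpy(&e_ab.sequence, "ab", 2); e_ab.nbBits = 9; e_ab.length = 2;
    memcpy(&e_c.sequence,  "c?", 2); e_c.nbBits  = 5; e_c.length  = 1;

    memcpy(op, &e_ab.sequence, 2); op += e_ab.length;  /* writes "ab", advances 2 */
    memcpy(op, &e_c.sequence,  2); op += e_c.length;   /* writes "c?", advances 1 */
    memcpy(op, &e_ab.sequence, 2); op += e_ab.length;  /* overwrites the '?' with "ab" */

    printf("%.*s\n", (int)(op - out), out);            /* prints "abcab" */
    return 0;
}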
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..da8b4cf116e3 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) ++** or an error code, which can be tested using ZSTD_isError() */ ++size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, 
"ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -438,8 +468,10 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); +- zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + zfhPtr->frameType = ZSTD_skippableFrame; ++ zfhPtr->dictID = MEM_readLE32(src) - ZSTD_MAGIC_SKIPPABLE_START; ++ zfhPtr->headerSize = ZSTD_SKIPPABLEHEADERSIZE; ++ zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + return 0; + } + RETURN_ERROR(prefix_unknown, ""); +@@ -508,7 +540,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) ++size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize) + { + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); + } +@@ -520,7 +552,7 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t src + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. 
invalid magic number, srcSize too small) */ + unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) + { +- { ZSTD_frameHeader zfh; ++ { ZSTD_FrameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { +@@ -540,61 +572,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. + */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible 
with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +635,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (U64_MAX - totalDstSize < fcs) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +709,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -693,10 +726,10 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,28 +763,31 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : + * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame or a skippeable frame ++ * `src` must point to the start of a ZSTD frame or a skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the maximum decompressed size of the compressed source + */ +@@ -760,7 +796,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +809,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_FrameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -815,7 +893,7 @@ static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, + return regenSize; + } + +-static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming) ++static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, int streaming) + { + (void)dctx; + (void)uncompressedSize; +@@ -856,6 +934,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +970,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ +@@ -901,12 +984,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1015,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %i, consuming %i bytes of input", (int)(op-ostart), (int)(ip - (const BYTE*)*srcPtr)); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1043,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ 
FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1150,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1270,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1340,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1353,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1394,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1494,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, 
repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1557,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1566,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_FrameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1673,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1683,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1694,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1766,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1815,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1854,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case 
ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1873,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1890,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. 
++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1908,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1793,7 +1924,7 @@ size_t ZSTD_estimateDStreamSize(size_t windowSize) + size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) + { + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); +@@ -1888,6 +2019,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); ++ assert(zds != NULL); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, +@@ -1918,7 +2050,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2063,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2079,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? 
op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2106,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2123,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2174,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2189,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2198,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2212,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? 
op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2236,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2249,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2287,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..710eb0ffd5a3 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. 
++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -124,7 +140,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); + + { const BYTE* const istart = (const BYTE*) src; +- symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -317,7 +359,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * - start from default distributions, present in /lib/common/zstd_internal.h + * - generate tables normally, using ZSTD_buildFSETable() + * - printout the content of tables +- * - pretify output, report below, test with fuzzer to ensure it's correct */ ++ * - prettify output, report below, test with fuzzer to ensure it's correct */ + + /* Default FSE distribution table for Literal Lengths */ + static const ZSTD_seqSymbol LL_defaultDTable[(1<=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. 
+- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,11 +719,19 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ +- { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); +- symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); +- symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ ++ { SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6); ++ SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3); ++ SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t
const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +-
const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. ++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + 
+ static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,50 +1928,40 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + #endif /* DYNAMIC_BMI2 */ + +-typedef size_t (*ZSTD_decompressSequences_t)( +- ZSTD_DCtx* dctx, +- void* dst, size_t maxDstSize, +- const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); +- + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const 
ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1976,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. 
++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); +- +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); ++ ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2091,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2115,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2127,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2187,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..becffbd89364 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..2a225d1811c4 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -135,7 +137,7 @@ struct ZSTD_DCtx_s + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ + size_t expected; +- ZSTD_frameHeader fParams; ++ ZSTD_FrameHeader fParams; + U64 processedCSize; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ +@@ -152,7 +154,8 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; +-#if DYNAMIC_BMI2 != 0 ++ int isFrameDecompression; ++#if DYNAMIC_BMI2 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif + +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. 
Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +@@ -199,11 +204,11 @@ struct ZSTD_DCtx_s + }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + + MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +-#if DYNAMIC_BMI2 != 0 +- return dctx->bmi2; ++#if DYNAMIC_BMI2 ++ return dctx->bmi2; + #else + (void)dctx; +- return 0; ++ return 0; + #endif + } + +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index bd8784449b31..a788ebfcb111 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,6 +16,7 @@ + + #include "common/zstd_deps.h" + #include "common/zstd_internal.h" ++#include "compress/zstd_compress_internal.h" + + #define ZSTD_FORWARD_IF_ERR(ret) \ + do { \ +@@ -85,6 +86,12 @@ zstd_parameters zstd_get_params(int level, + } + EXPORT_SYMBOL(zstd_get_params); + ++size_t zstd_cctx_set_param(zstd_cctx *cctx, ZSTD_cParameter param, int value) ++{ ++ return ZSTD_CCtx_setParameter(cctx, param, value); ++} ++EXPORT_SYMBOL(zstd_cctx_set_param); ++ + zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size) + { +@@ -98,6 +105,52 @@ size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams) + } + EXPORT_SYMBOL(zstd_cctx_workspace_bound); + ++// Used by zstd_cctx_workspace_bound_with_ext_seq_prod() ++static size_t dummy_external_sequence_producer( ++ void *sequenceProducerState, ++ ZSTD_Sequence *outSeqs, size_t outSeqsCapacity, ++ const void *src, size_t srcSize, ++ const void *dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize) ++{ ++ (void)sequenceProducerState; ++ (void)outSeqs; (void)outSeqsCapacity; ++ (void)src; (void)srcSize; ++ (void)dict; (void)dictSize; ++ (void)compressionLevel; ++ (void)windowSize; ++ return ZSTD_SEQUENCE_PRODUCER_ERROR; ++} ++ ++static void init_cctx_params_from_compress_params( ++ ZSTD_CCtx_params *cctx_params, ++ const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_parameters zstd_params; ++ memset(&zstd_params, 0, sizeof(zstd_params)); ++ zstd_params.cParams = *compress_params; ++ ZSTD_CCtxParams_init_advanced(cctx_params, zstd_params); ++} ++ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCCtxSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cctx_workspace_bound_with_ext_seq_prod); ++ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCStreamSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cstream_workspace_bound_with_ext_seq_prod); ++ + zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) + { + if (workspace == NULL) +@@ -209,5 +262,25 @@ size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output) + } + EXPORT_SYMBOL(zstd_end_stream); + ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++) { ++ ZSTD_registerSequenceProducer(cctx, sequence_producer_state, sequence_producer); ++} ++EXPORT_SYMBOL(zstd_register_sequence_producer); ++ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size) ++{ ++ return ZSTD_compressSequencesAndLiterals(cctx, dst, dst_capacity, in_seqs, ++ in_seqs_size, literals, lit_size, ++ lit_capacity, decompressed_size); ++} 
++EXPORT_SYMBOL(zstd_compress_sequences_and_literals); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Compressor"); +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index 469fc3059be0..0ae819f0c927 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -113,7 +113,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.14/0001-bore.patch b/sys-kernel/gentoo-sources-6.14/0001-bore.patch new file mode 100644 index 0000000..82b3dac --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/0001-bore.patch @@ -0,0 +1,1006 @@ +From 9c32e28fe484288e6ba87efd34914c1dcb3f3150 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Mon, 2 Jun 2025 20:44:49 +0200 +Subject: [PATCH] bore + +Signed-off-by: Piotr Gorski +--- + include/linux/sched.h | 18 ++ + include/linux/sched/bore.h | 40 ++++ + init/Kconfig | 17 ++ + kernel/Kconfig.hz | 17 ++ + kernel/fork.c | 6 + + kernel/sched/Makefile | 1 + + kernel/sched/bore.c | 443 +++++++++++++++++++++++++++++++++++++ + kernel/sched/core.c | 6 + + kernel/sched/debug.c | 61 ++++- + kernel/sched/fair.c | 73 +++++- + kernel/sched/sched.h | 9 + + 11 files changed, 686 insertions(+), 5 deletions(-) + create mode 100644 include/linux/sched/bore.h + create mode 100644 kernel/sched/bore.c + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 6e5c38718..77ac55985 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -550,6 +550,15 @@ struct sched_statistics { + #endif /* CONFIG_SCHEDSTATS */ + } ____cacheline_aligned; + ++#ifdef CONFIG_SCHED_BORE ++struct sched_burst_cache { ++ u8 score; ++ u32 count; ++ u64 timestamp; ++ spinlock_t lock; ++}; ++#endif // CONFIG_SCHED_BORE ++ + struct sched_entity { + /* For load-balancing: */ + struct load_weight load; +@@ -569,6 +578,15 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; + u64 vruntime; ++#ifdef CONFIG_SCHED_BORE ++ u64 burst_time; ++ u8 prev_burst_penalty; ++ u8 curr_burst_penalty; ++ u8 burst_penalty; ++ u8 burst_score; ++ struct sched_burst_cache child_burst; ++ struct sched_burst_cache group_burst; ++#endif // CONFIG_SCHED_BORE + s64 vlag; + u64 slice; + +diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h +new file mode 100644 +index 000000000..a8faabc28 +--- /dev/null ++++ b/include/linux/sched/bore.h +@@ -0,0 +1,40 @@ ++ ++#include ++#include ++ ++#ifndef _LINUX_SCHED_BORE_H ++#define _LINUX_SCHED_BORE_H ++#define SCHED_BORE_VERSION "5.9.6" ++ ++#ifdef CONFIG_SCHED_BORE ++extern u8 __read_mostly sched_bore; ++extern u8 __read_mostly sched_burst_exclude_kthreads; ++extern u8 __read_mostly sched_burst_smoothness_long; ++extern u8 __read_mostly sched_burst_smoothness_short; ++extern u8 __read_mostly sched_burst_fork_atavistic; ++extern u8 __read_mostly sched_burst_parity_threshold; ++extern u8 __read_mostly sched_burst_penalty_offset; ++extern uint __read_mostly sched_burst_penalty_scale; ++extern uint __read_mostly sched_burst_cache_stop_count; ++extern uint 
__read_mostly sched_burst_cache_lifetime; ++extern uint __read_mostly sched_deadline_boost_mask; ++ ++extern void update_burst_score(struct sched_entity *se); ++extern void update_burst_penalty(struct sched_entity *se); ++ ++extern void restart_burst(struct sched_entity *se); ++extern void restart_burst_rescale_deadline(struct sched_entity *se); ++ ++extern int sched_bore_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ ++extern void sched_clone_bore( ++ struct task_struct *p, struct task_struct *parent, u64 clone_flags, u64 now); ++ ++extern void reset_task_bore(struct task_struct *p); ++extern void sched_bore_init(void); ++ ++extern void reweight_entity( ++ struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); ++#endif // CONFIG_SCHED_BORE ++#endif // _LINUX_SCHED_BORE_H +diff --git a/init/Kconfig b/init/Kconfig +index 522fac299..13a48166e 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1363,6 +1363,23 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ If unsure, say Y here. ++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d068..253c566b5 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -55,5 +55,22 @@ config HZ + default 300 if HZ_300 + default 1000 if HZ_1000 + ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = ++ 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. ++ + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/fork.c b/kernel/fork.c +index 5e640468b..235fe18fe 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -112,6 +112,8 @@ + #include + #include + ++#include ++ + #include + + #define CREATE_TRACE_POINTS +@@ -2529,6 +2531,10 @@ __latent_entropy struct task_struct *copy_process( + p->start_time = ktime_get_ns(); + p->start_boottime = ktime_get_boottime_ns(); + ++#ifdef CONFIG_SCHED_BORE ++ if (likely(p->pid)) ++ sched_clone_bore(p, current, clone_flags, p->start_time); ++#endif // CONFIG_SCHED_BORE + /* + * Make it visible to the rest of the system, but dont wake it up yet. + * Need tasklist lock for parent etc handling! 
+diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 8ae86371d..ab9ad886a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -37,3 +37,4 @@ obj-y += core.o + obj-y += fair.o + obj-y += build_policy.o + obj-y += build_utility.o ++obj-y += bore.o +diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c +new file mode 100644 +index 000000000..23aeb5649 +--- /dev/null ++++ b/kernel/sched/bore.c +@@ -0,0 +1,443 @@ ++/* ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2024 Masahito Suzuki ++ */ ++#include ++#include ++#include ++#include "sched.h" ++ ++#ifdef CONFIG_SCHED_BORE ++u8 __read_mostly sched_bore = 1; ++u8 __read_mostly sched_burst_exclude_kthreads = 1; ++u8 __read_mostly sched_burst_smoothness_long = 1; ++u8 __read_mostly sched_burst_smoothness_short = 0; ++u8 __read_mostly sched_burst_fork_atavistic = 2; ++u8 __read_mostly sched_burst_parity_threshold = 2; ++u8 __read_mostly sched_burst_penalty_offset = 24; ++uint __read_mostly sched_burst_penalty_scale = 1280; ++uint __read_mostly sched_burst_cache_stop_count = 64; ++uint __read_mostly sched_burst_cache_lifetime = 75000000; ++uint __read_mostly sched_deadline_boost_mask = ENQUEUE_INITIAL ++ | ENQUEUE_WAKEUP; ++static int __maybe_unused sixty_four = 64; ++static int __maybe_unused maxval_u8 = 255; ++static int __maybe_unused maxval_12_bits = 4095; ++ ++#define MAX_BURST_PENALTY (39U <<2) ++ ++static inline u32 log2plus1_u64_u32f8(u64 v) { ++ u32 integral = fls64(v); ++ u8 fractional = v << (64 - integral) >> 55; ++ return integral << 8 | fractional; ++} ++ ++static inline u32 calc_burst_penalty(u64 burst_time) { ++ u32 greed, tolerance, penalty, scaled_penalty; ++ ++ greed = log2plus1_u64_u32f8(burst_time); ++ tolerance = sched_burst_penalty_offset << 8; ++ penalty = max(0, (s32)(greed - tolerance)); ++ scaled_penalty = penalty * sched_burst_penalty_scale >> 16; ++ ++ return min(MAX_BURST_PENALTY, scaled_penalty); ++} ++ ++static inline u64 __scale_slice(u64 delta, u8 score) ++{return mul_u64_u32_shr(delta, sched_prio_to_wmult[score], 22);} ++ ++static inline u64 __unscale_slice(u64 delta, u8 score) ++{return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10);} ++ ++static void reweight_task_by_prio(struct task_struct *p, int prio) { ++ struct sched_entity *se = &p->se; ++ unsigned long weight = scale_load(sched_prio_to_weight[prio]); ++ ++ reweight_entity(cfs_rq_of(se), se, weight); ++ se->load.inv_weight = sched_prio_to_wmult[prio]; ++} ++ ++static inline u8 effective_prio(struct task_struct *p) { ++ u8 prio = p->static_prio - MAX_RT_PRIO; ++ if (likely(sched_bore)) ++ prio += p->se.burst_score; ++ return min(39, prio); ++} ++ ++void update_burst_score(struct sched_entity *se) { ++ if (!entity_is_task(se)) return; ++ struct task_struct *p = task_of(se); ++ u8 prev_prio = effective_prio(p); ++ ++ u8 burst_score = 0; ++ if (!((p->flags & PF_KTHREAD) && likely(sched_burst_exclude_kthreads))) ++ burst_score = se->burst_penalty >> 2; ++ se->burst_score = burst_score; ++ ++ u8 new_prio = effective_prio(p); ++ if (new_prio != prev_prio) ++ reweight_task_by_prio(p, new_prio); ++} ++ ++void update_burst_penalty(struct sched_entity *se) { ++ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); ++ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); ++ update_burst_score(se); ++} ++ ++static inline u32 binary_smooth(u32 new, u32 old) { ++ int increment = new - old; ++ return (0 <= increment)? 
++ old + ( increment >> (int)sched_burst_smoothness_long): ++ old - (-increment >> (int)sched_burst_smoothness_short); ++} ++ ++static void revolve_burst_penalty(struct sched_entity *se) { ++ se->prev_burst_penalty = ++ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); ++ se->burst_time = 0; ++ se->curr_burst_penalty = 0; ++} ++ ++inline void restart_burst(struct sched_entity *se) { ++ revolve_burst_penalty(se); ++ se->burst_penalty = se->prev_burst_penalty; ++ update_burst_score(se); ++} ++ ++void restart_burst_rescale_deadline(struct sched_entity *se) { ++ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; ++ struct task_struct *p = task_of(se); ++ u8 prev_prio = effective_prio(p); ++ restart_burst(se); ++ u8 new_prio = effective_prio(p); ++ if (prev_prio > new_prio) { ++ wremain = __unscale_slice(abs(vremain), prev_prio); ++ vscaled = __scale_slice(wremain, new_prio); ++ if (unlikely(vremain < 0)) ++ vscaled = -vscaled; ++ se->deadline = se->vruntime + vscaled; ++ } ++} ++ ++static inline bool task_is_bore_eligible(struct task_struct *p) ++{return p && p->sched_class == &fair_sched_class && !p->exit_state;} ++ ++static void reset_task_weights_bore(void) { ++ struct task_struct *task; ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ write_lock_irq(&tasklist_lock); ++ for_each_process(task) { ++ if (!task_is_bore_eligible(task)) continue; ++ rq = task_rq(task); ++ rq_pin_lock(rq, &rf); ++ update_rq_clock(rq); ++ reweight_task_by_prio(task, effective_prio(task)); ++ rq_unpin_lock(rq, &rf); ++ } ++ write_unlock_irq(&tasklist_lock); ++} ++ ++int sched_bore_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) { ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ reset_task_weights_bore(); ++ ++ return 0; ++} ++ ++#define for_each_child(p, t) \ ++ list_for_each_entry(t, &(p)->children, sibling) ++ ++static u32 count_entries_upto2(struct list_head *head) { ++ struct list_head *next = head->next; ++ return (next != head) + (next->next != head); ++} ++ ++static inline void init_task_burst_cache_lock(struct task_struct *p) { ++ spin_lock_init(&p->se.child_burst.lock); ++ spin_lock_init(&p->se.group_burst.lock); ++} ++ ++static inline bool burst_cache_expired(struct sched_burst_cache *bc, u64 now) ++{return (s64)(bc->timestamp + sched_burst_cache_lifetime - now) < 0;} ++ ++static void update_burst_cache(struct sched_burst_cache *bc, ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ u8 avg = cnt ? 
sum / cnt : 0; ++ bc->score = max(avg, p->se.burst_penalty); ++ bc->count = cnt; ++ bc->timestamp = now; ++} ++ ++static inline void update_child_burst_direct(struct task_struct *p, u64 now) { ++ u32 cnt = 0, sum = 0; ++ struct task_struct *child; ++ ++ for_each_child(p, child) { ++ if (!task_is_bore_eligible(child)) continue; ++ cnt++; ++ sum += child->se.burst_penalty; ++ } ++ ++ update_burst_cache(&p->se.child_burst, p, cnt, sum, now); ++} ++ ++static inline u8 inherit_burst_direct( ++ struct task_struct *p, u64 now, u64 clone_flags) { ++ struct task_struct *parent = p; ++ struct sched_burst_cache *bc; ++ ++ if (clone_flags & CLONE_PARENT) ++ parent = parent->real_parent; ++ ++ bc = &parent->se.child_burst; ++ guard(spinlock)(&bc->lock); ++ if (burst_cache_expired(bc, now)) ++ update_child_burst_direct(parent, now); ++ ++ return bc->score; ++} ++ ++static void update_child_burst_topological( ++ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { ++ u32 cnt = 0, dcnt = 0, sum = 0; ++ struct task_struct *child, *dec; ++ struct sched_burst_cache *bc __maybe_unused; ++ ++ for_each_child(p, child) { ++ dec = child; ++ while ((dcnt = count_entries_upto2(&dec->children)) == 1) ++ dec = list_first_entry(&dec->children, struct task_struct, sibling); ++ ++ if (!dcnt || !depth) { ++ if (!task_is_bore_eligible(dec)) continue; ++ cnt++; ++ sum += dec->se.burst_penalty; ++ continue; ++ } ++ bc = &dec->se.child_burst; ++ spin_lock(&bc->lock); ++ if (!burst_cache_expired(bc, now)) { ++ cnt += bc->count; ++ sum += (u32)bc->score * bc->count; ++ if (sched_burst_cache_stop_count <= cnt) { ++ spin_unlock(&bc->lock); ++ break; ++ } ++ spin_unlock(&bc->lock); ++ continue; ++ } ++ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); ++ spin_unlock(&bc->lock); ++ } ++ ++ update_burst_cache(&p->se.child_burst, p, cnt, sum, now); ++ *acnt += cnt; ++ *asum += sum; ++} ++ ++static inline u8 inherit_burst_topological( ++ struct task_struct *p, u64 now, u64 clone_flags) { ++ struct task_struct *anc = p; ++ struct sched_burst_cache *bc; ++ u32 cnt = 0, sum = 0; ++ u32 base_child_cnt = 0; ++ ++ if (clone_flags & CLONE_PARENT) { ++ anc = anc->real_parent; ++ base_child_cnt = 1; ++ } ++ ++ for (struct task_struct *next; ++ anc != (next = anc->real_parent) && ++ count_entries_upto2(&anc->children) <= base_child_cnt;) { ++ anc = next; ++ base_child_cnt = 1; ++ } ++ ++ bc = &anc->se.child_burst; ++ guard(spinlock)(&bc->lock); ++ if (burst_cache_expired(bc, now)) ++ update_child_burst_topological( ++ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); ++ ++ return bc->score; ++} ++ ++static inline void update_tg_burst(struct task_struct *p, u64 now) { ++ struct task_struct *task; ++ u32 cnt = 0, sum = 0; ++ ++ for_each_thread(p, task) { ++ if (!task_is_bore_eligible(task)) continue; ++ cnt++; ++ sum += task->se.burst_penalty; ++ } ++ ++ update_burst_cache(&p->se.group_burst, p, cnt, sum, now); ++} ++ ++static inline u8 inherit_burst_tg(struct task_struct *p, u64 now) { ++ struct task_struct *parent = rcu_dereference(p->group_leader); ++ struct sched_burst_cache *bc = &parent->se.group_burst; ++ guard(spinlock)(&bc->lock); ++ if (burst_cache_expired(bc, now)) ++ update_tg_burst(parent, now); ++ ++ return bc->score; ++} ++ ++void sched_clone_bore(struct task_struct *p, ++ struct task_struct *parent, u64 clone_flags, u64 now) { ++ struct sched_entity *se = &p->se; ++ u8 penalty; ++ ++ init_task_burst_cache_lock(p); ++ ++ if (!task_is_bore_eligible(p)) return; ++ ++ if (clone_flags & CLONE_THREAD) 
{ ++ rcu_read_lock(); ++ penalty = inherit_burst_tg(parent, now); ++ rcu_read_unlock(); ++ } else { ++ read_lock(&tasklist_lock); ++ penalty = likely(sched_burst_fork_atavistic) ? ++ inherit_burst_topological(parent, now, clone_flags): ++ inherit_burst_direct(parent, now, clone_flags); ++ read_unlock(&tasklist_lock); ++ } ++ ++ revolve_burst_penalty(se); ++ se->burst_penalty = se->prev_burst_penalty = ++ max(se->prev_burst_penalty, penalty); ++ se->child_burst.timestamp = 0; ++ se->group_burst.timestamp = 0; ++} ++ ++void reset_task_bore(struct task_struct *p) { ++ p->se.burst_time = 0; ++ p->se.prev_burst_penalty = 0; ++ p->se.curr_burst_penalty = 0; ++ p->se.burst_penalty = 0; ++ p->se.burst_score = 0; ++ memset(&p->se.child_burst, 0, sizeof(struct sched_burst_cache)); ++ memset(&p->se.group_burst, 0, sizeof(struct sched_burst_cache)); ++} ++ ++void __init sched_bore_init(void) { ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification %s by Masahito Suzuki", SCHED_BORE_VERSION); ++ reset_task_bore(&init_task); ++ init_task_burst_cache_lock(&init_task); ++} ++ ++#ifdef CONFIG_SYSCTL ++static struct ctl_table sched_bore_sysctls[] = { ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = sched_bore_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_exclude_kthreads", ++ .data = &sched_burst_exclude_kthreads, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness_long", ++ .data = &sched_burst_smoothness_long, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness_short", ++ .data = &sched_burst_smoothness_short, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_fork_atavistic", ++ .data = &sched_burst_fork_atavistic, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_THREE, ++ }, ++ { ++ .procname = "sched_burst_parity_threshold", ++ .data = &sched_burst_parity_threshold, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_u8, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_cache_stop_count", ++ .data = &sched_burst_cache_stop_count, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_boost_mask", ++ .data = &sched_deadline_boost_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++}; ++ 
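To make the fixed-point arithmetic in log2plus1_u64_u32f8() and calc_burst_penalty() above concrete, here is a standalone rendering of the same math with the default tunables (penalty offset 24, scale 1280, MAX_BURST_PENALTY = 39 << 2). The 100 ms burst length is an arbitrary illustrative input, and fls64() is approximated with __builtin_clzll() for userspace:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's fls64(); valid for v != 0. */
static uint32_t fls64_user(uint64_t v)
{
	return 64 - __builtin_clzll(v);
}

static uint32_t log2plus1_u64_u32f8(uint64_t v)
{
	uint32_t integral = fls64_user(v);
	uint8_t fractional = (uint8_t)(v << (64 - integral) >> 55);

	return integral << 8 | fractional;
}

int main(void)
{
	uint64_t burst_ns = 100ULL * 1000 * 1000;	/* ran ~100 ms without sleeping */
	uint32_t greed = log2plus1_u64_u32f8(burst_ns);	/* ~27.5 in u24.8 fixed point */
	uint32_t tolerance = 24u << 8;			/* sched_burst_penalty_offset = 24 */
	int32_t over = (int32_t)(greed - tolerance);
	uint32_t penalty = over > 0 ? (uint32_t)over * 1280 >> 16 : 0; /* scale = 1280 */

	if (penalty > (39u << 2))			/* MAX_BURST_PENALTY */
		penalty = 39u << 2;

	printf("greed=%u penalty=%u burst_score=%u\n", greed, penalty, penalty >> 2);
	/* greed=7037 penalty=17 burst_score=4 */
	return 0;
}

update_burst_score() then keeps penalty >> 2 as burst_score, which effective_prio() adds to the task's static priority, so a task that has run roughly 100 ms without sleeping is weighted about four priority steps below an otherwise identical task that sleeps frequently.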
++static int __init sched_bore_sysctl_init(void) { ++ register_sysctl_init("kernel", sched_bore_sysctls); ++ return 0; ++} ++late_initcall(sched_bore_sysctl_init); ++#endif // CONFIG_SYSCTL ++#endif // CONFIG_SCHED_BORE +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3c7c942c7..f6a9189ff 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -97,6 +97,8 @@ + #include "../../io_uring/io-wq.h" + #include "../smpboot.h" + ++#include ++ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); + +@@ -8490,6 +8492,10 @@ void __init sched_init(void) + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_bore_init(); ++#endif // CONFIG_SCHED_BORE ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index a0893a483..1ee54165f 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -167,7 +167,53 @@ static const struct file_operations sched_feat_fops = { + }; + + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ ++static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int value; \ ++\ ++ if (cnt > 15) \ ++ cnt = 15; \ ++\ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++\ ++ if (kstrtouint(buf, 10, &value)) \ ++ return -EINVAL; \ ++\ ++ sysctl_sched_##name = value; \ ++ sched_update_##update_func(); \ ++\ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++\ ++static int sched_##name##_show(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", sysctl_sched_##name); \ ++ return 0; \ ++} \ ++\ ++static int sched_##name##_open(struct inode *inode, struct file *filp) \ ++{ \ ++ return single_open(filp, sched_##name##_show, NULL); \ ++} \ ++\ ++static const struct file_operations sched_##name##_fops = { \ ++ .open = sched_##name##_open, \ ++ .write = sched_##name##_write, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++}; ++ ++DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + ++#undef DEFINE_SYSCTL_SCHED_FUNC ++#else // !CONFIG_SCHED_BORE + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -213,7 +259,7 @@ static const struct file_operations sched_scaling_fops = { + .llseek = seq_lseek, + .release = single_release, + }; +- ++#endif // CONFIG_SCHED_BORE + #endif /* SMP */ + + #ifdef CONFIG_PREEMPT_DYNAMIC +@@ -505,13 +551,20 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); ++ debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); ++#else // !CONFIG_SCHED_BORE + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++#endif // CONFIG_SCHED_BORE + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); ++#endif // CONFIG_SCHED_BORE + debugfs_create_u32("migration_cost_ns", 0644, 
debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + +@@ -756,6 +809,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.burst_score); ++#endif // CONFIG_SCHED_BORE + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +@@ -1242,6 +1298,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + + P(se.load.weight); + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++ P(se.burst_score); ++#endif // CONFIG_SCHED_BORE + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index eb1165016..abecfa517 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -58,6 +58,8 @@ + #include "stats.h" + #include "autogroup.h" + ++#include ++ + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -67,17 +69,30 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant ++ * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++#endif // CONFIG_SCHED_BORE + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice ++ * (default min_base_slice = 2000000 constant, units: nanoseconds) ++ * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds + */ ++#ifdef CONFIG_SCHED_BORE ++static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_base_slice = 700000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; ++#endif // CONFIG_SCHED_BORE + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + +@@ -191,6 +206,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) + * + * This idea comes from the SD scheduler of Con Kolivas: + */ ++#ifdef CONFIG_SCHED_BORE ++static void update_sysctl(void) { ++ sysctl_sched_base_slice = nsecs_per_tick * ++ max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); ++} ++void sched_update_min_base_slice(void) { update_sysctl(); } ++#else // !CONFIG_SCHED_BORE + static unsigned int get_update_sysctl_factor(void) + { + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); +@@ -221,6 +243,7 @@ static void update_sysctl(void) + SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } ++#endif // CONFIG_SCHED_BORE + + void __init sched_init_granularity(void) + { +@@ -700,6 +723,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) + + vlag = avg_vruntime(cfs_rq) - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++#ifdef CONFIG_SCHED_BORE ++ limit >>= 
!!sched_bore; ++#endif // CONFIG_SCHED_BORE + + se->vlag = clamp(vlag, -limit, limit); + } +@@ -940,6 +966,10 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) + curr = NULL; + + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) ++#ifdef CONFIG_SCHED_BORE ++ if (!(likely(sched_bore) && likely(sched_burst_parity_threshold) && ++ sched_burst_parity_threshold < cfs_rq->nr_queued)) ++#endif // CONFIG_SCHED_BORE + return curr; + + /* Pick the leftmost entity if it's eligible */ +@@ -998,6 +1028,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + * Scheduling class statistics methods: + */ + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); +@@ -1009,6 +1040,7 @@ int sched_update_scaling(void) + + return 0; + } ++#endif // CONFIG_SCHED_BORE + #endif + #endif + +@@ -1239,6 +1271,10 @@ static void update_curr(struct cfs_rq *cfs_rq) + if (unlikely(delta_exec <= 0)) + return; + ++#ifdef CONFIG_SCHED_BORE ++ curr->burst_time += delta_exec; ++ update_burst_penalty(curr); ++#endif // CONFIG_SCHED_BORE + curr->vruntime += calc_delta_fair(delta_exec, curr); + resched = update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); +@@ -3786,7 +3822,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } + + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); + +-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, ++void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) + { + bool curr = cfs_rq->curr == se; +@@ -5292,7 +5328,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + se->rel_deadline = 0; + return; + } +- ++#ifdef CONFIG_SCHED_BORE ++ else if (likely(sched_bore)) ++ vslice >>= !!(flags & sched_deadline_boost_mask); ++ else ++#endif // CONFIG_SCHED_BORE + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -7187,6 +7227,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); ++#ifdef CONFIG_SCHED_BORE ++ struct cfs_rq *cfs_rq = &rq->cfs; ++ struct sched_entity *se = &p->se; ++ if (flags & DEQUEUE_SLEEP && entity_is_task(se)) { ++ if (cfs_rq->curr == se) ++ update_curr(cfs_rq); ++ restart_burst(se); ++ } ++#endif // CONFIG_SCHED_BORE + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + +@@ -9007,16 +9056,25 @@ static void yield_task_fair(struct rq *rq) + /* + * Are we the only task in the tree? + */ ++#if !defined(CONFIG_SCHED_BORE) + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. 
+ */ + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ restart_burst_rescale_deadline(se); ++ if (unlikely(rq->nr_running == 1)) ++ return; ++ ++ clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() +@@ -13130,6 +13188,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + static void task_fork_fair(struct task_struct *p) + { + set_task_max_allowed_capacity(p); ++#ifdef CONFIG_SCHED_BORE ++ update_burst_score(&p->se); ++#endif // CONFIG_SCHED_BORE + } + + /* +@@ -13240,6 +13301,10 @@ static void attach_task_cfs_rq(struct task_struct *p) + + static void switched_from_fair(struct rq *rq, struct task_struct *p) + { ++ p->se.rel_deadline = 0; ++#ifdef CONFIG_SCHED_BORE ++ reset_task_bore(p); ++#endif // CONFIG_SCHED_BORE + detach_task_cfs_rq(p); + } + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1aa65a0ac..fddf67b19 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2122,7 +2122,11 @@ static inline void update_sched_domain_debugfs(void) { } + static inline void dirty_sched_domain_sysctl(int cpu) { } + #endif + ++#ifdef CONFIG_SCHED_BORE ++extern void sched_update_min_base_slice(void); ++#else // !CONFIG_SCHED_BORE + extern int sched_update_scaling(void); ++#endif // CONFIG_SCHED_BORE + + static inline const struct cpumask *task_user_cpus(struct task_struct *p) + { +@@ -2846,7 +2850,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; + ++#ifdef CONFIG_SCHED_BORE ++extern unsigned int sysctl_sched_min_base_slice; ++extern __read_mostly uint sysctl_sched_base_slice; ++#else // !CONFIG_SCHED_BORE + extern unsigned int sysctl_sched_base_slice; ++#endif // CONFIG_SCHED_BORE + + #ifdef CONFIG_SCHED_DEBUG + extern int sysctl_resched_latency_warn_ms; +-- +2.49.0 + diff --git a/sys-kernel/gentoo-sources-6.14/0004-bbr3.patch b/sys-kernel/gentoo-sources-6.14/0004-bbr3.patch new file mode 100644 index 0000000..4b43326 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/0004-bbr3.patch @@ -0,0 +1,3387 @@ +From d221b4b9939f83a4df2ca8d037a2b73d49041a40 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 22 May 2025 16:19:46 +0200 +Subject: [PATCH 4/9] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2231 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 16 files changed, 1937 insertions(+), 554 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index f88daaa76d83..e569fd1ed7e8 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -243,7 +243,8 @@ struct tcp_sock { + /* OOO segments go in this rbtree. Socket lock must be held. 
*/ + struct rb_root out_of_order_queue; + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ +@@ -300,7 +301,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? */ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c7f42844c79a..170250145598 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 2d08473a6dc0..aa80dd0abe5a 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -376,6 +376,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -796,6 +798,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -901,6 +912,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -990,9 +1006,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1105,6 +1126,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1127,7 
+1149,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1147,10 +1173,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1161,7 +1190,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1185,8 +1216,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. 
(optional) +@@ -1252,6 +1286,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1271,6 +1313,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1283,6 +1326,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2434,7 +2492,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index 66c3903d29cf..dfdbc1c0b606 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -516,12 +516,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dbf896f3146c..92b6d6472951 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN enabled at conn init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 6d2c97f8e9ef..ddc116ef22cb 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
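Because the new TCPI_OPT_ECN_LOW bit is reported through tcp_info, userspace can tell whether a connection actually negotiated the low-latency ECN mode. A minimal sketch, assuming a kernel carrying this patch; the constant is re-defined locally since stock libc headers will not have it yet, and the helper name is made up for the example:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCPI_OPT_ECN_LOW
#define TCPI_OPT_ECN_LOW 128	/* value this patch adds to uapi/linux/tcp.h */
#endif

/* Hypothetical helper: did this connected socket negotiate low-latency ECN? */
static int tcp_uses_low_ecn(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
		return 0;

	return !!(info.tcpi_options & TCPI_OPT_ECN_LOW);
}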
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 554804774628..fb6ab6ca8440 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 57df7c1d2faa..47605d71f68b 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3398,6 +3398,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4124,6 +4125,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..516a5daac694 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
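The dual-bound model sketched in the header comment above is the central difference from the old single max-filter design: the long-term ceiling (bw_hi[], inflight_hi) tracks what the path has proven it can carry, while the short-term floor (bw_lo, inflight_lo) is pulled down by recent loss or ECN, and the sender runs at the more conservative of the two, as bbr_bw() later in this patch does. A toy illustration with made-up numbers:

#include <stdint.h>
#include <stdio.h>

static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }
static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

int main(void)
{
	/* Made-up samples, in arbitrary bandwidth units. */
	uint32_t bw_hi[2] = { 1200, 1150 };	/* long-term max filter (two windows) */
	uint32_t bw_lo = 900;			/* short-term bound after loss/ECN */

	uint32_t max_bw = max_u32(bw_hi[0], bw_hi[1]);	/* like bbr_max_bw() */
	uint32_t bw = min_u32(max_bw, bw_lo);		/* like bbr_bw() */

	printf("max_bw=%u bw_lo=%u -> pacing bw=%u\n", max_bw, bw_lo, bw);
	/* max_bw=1200 bw_lo=900 -> pacing bw=900 */
	return 0;
}

When nothing recent has pulled bw_lo down, it stays at its unset all-ones value, so the min() falls through and BBR paces at the long-term maximum.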
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +473,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +534,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +547,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +579,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +599,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +670,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +681,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +710,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +739,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +795,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +803,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +849,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +858,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +886,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +923,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +946,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +971,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; + +- bbr_update_model(sk, rs); +- +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less than that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++	case BBR_BW_PROBE_DOWN: ++		if (bbr_check_time_to_probe_bw(sk, rs)) ++			return;		/* already decided state transition */ ++		if (bbr_check_time_to_cruise(sk, inflight, bw)) ++			bbr_start_bw_probe_cruise(sk); ++		break; ++ ++	default: ++		WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++	} ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++	struct bbr *bbr = inet_csk_ca(sk); ++ ++	bbr_reset_lower_bounds(sk); ++	if (bbr_full_bw_reached(sk)) { ++		bbr->mode = BBR_PROBE_BW; ++		/* Raising inflight after PROBE_RTT may cause loss, so reset ++		 * the PROBE_BW clock and schedule the next bandwidth probe for ++		 * a friendly and randomized future point in time. ++		 */ ++		bbr_start_bw_probe_down(sk); ++		/* Since we are exiting PROBE_RTT, we know inflight is ++		 * below our estimated BDP, so it is reasonable to cruise. ++		 */ ++		bbr_start_bw_probe_cruise(sk); ++	} else { ++		bbr->mode = BBR_STARTUP; ++	} ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++				    const struct rate_sample *rs) ++{ ++	struct bbr *bbr = inet_csk_ca(sk); ++ ++	if (bbr_full_bw_reached(sk)) ++		return; ++ ++	/* For STARTUP exit, check the loss rate at the end of each round trip ++	 * of Recovery episodes in STARTUP. We check the loss rate at the end ++	 * of the round trip to filter out noisy/low loss and have a better ++	 * sense of inflight (extent of loss), so we can drain more accurately. ++	 */ ++	if (rs->losses && bbr->loss_events_in_round < 0xf) ++		bbr->loss_events_in_round++;  /* update saturating counter */ ++	if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++	    inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++	    bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++	    bbr_is_inflight_too_high(sk, rs)) { ++		bbr_handle_queue_too_high_in_startup(sk); ++		return; ++	} ++	if (bbr->loss_round_start) ++		bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++				      const struct rate_sample *rs, ++				      struct bbr_context *ctx) ++{ ++	struct bbr *bbr = inet_csk_ca(sk); ++	u32 bw_thresh, full_cnt, thresh; ++ ++	if (bbr->full_bw_now || rs->is_app_limited) ++		return; ++ ++	thresh = bbr_param(sk, full_bw_thresh); ++	full_cnt = bbr_param(sk, full_bw_cnt); ++	bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++	if (ctx->sample_bw >= bw_thresh) { ++		bbr_reset_full_bw(sk); ++		bbr->full_bw = ctx->sample_bw; ++		return; ++	} ++	if (!bbr->round_start) ++		return; ++	++bbr->full_bw_cnt; ++	bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++	bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++			    struct bbr_context *ctx) ++{ ++	struct bbr *bbr = inet_csk_ca(sk); ++ ++	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++		bbr->mode = BBR_DRAIN;	/* drain queue we created */ ++		/* Set ssthresh to export purely for monitoring, to signal ++		 * completion of initial STARTUP by setting to a non- ++		 * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++		 */ ++		tcp_sk(sk)->snd_ssthresh = ++				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++		bbr_reset_congestion_signals(sk); ++	}	/* fall through to check if in-flight is already small: */ ++	if (bbr->mode == BBR_DRAIN && ++	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++		bbr->mode = BBR_PROBE_BW; ++		bbr_start_bw_probe_down(sk); ++	} ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++			     struct bbr_context *ctx) ++{ ++	bbr_update_congestion_signals(sk, rs, ctx); ++	bbr_update_ack_aggregation(sk, rs); ++	bbr_check_loss_too_high_in_startup(sk, rs); ++	bbr_check_full_bw_reached(sk, rs, ctx); ++	bbr_check_drain(sk, rs, ctx); ++	bbr_update_cycle_phase(sk, rs, ctx); ++	bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides significant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well as control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2360,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2397,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 0cbf81bf3d45..7e8324f54563 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -376,7 +376,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -387,7 +387,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1126,7 +1126,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1507,6 +1512,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3832,7 +3848,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3849,6 +3866,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3859,6 +3877,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3967,6 +3990,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4041,7 +4065,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4065,6 +4089,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4084,7 +4109,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5764,13 +5789,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index dfdb7a4608a8..874e99902bba 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -471,6 +471,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index bc95d2a5924f..d4c45ca6fe06 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1606,7 +1609,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1681,6 +1684,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2038,13 +2065,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2770,6 +2796,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2982,6 +3009,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index b412ed88ccd9..d70f8b742b21 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -699,6 +699,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.49.0.634.g8613c2bb6c + diff --git a/sys-kernel/gentoo-sources-6.14/0006-crypto.patch b/sys-kernel/gentoo-sources-6.14/0006-crypto.patch new file mode 100644 index 0000000..ac617a2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/0006-crypto.patch @@ -0,0 +1,2495 @@ +From 54fcd81865473d94e2174586621d03006f85c68d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 22 May 2025 16:35:34 +0200 +Subject: [PATCH 6/9] crypto + +Signed-off-by: Peter Jung +--- + MAINTAINERS | 1 + + arch/x86/Kconfig | 2 +- + arch/x86/crypto/aesni-intel_glue.c | 22 +- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/kernel/cpu/intel.c | 22 ++ + arch/x86/lib/Makefile | 2 +- + arch/x86/lib/crc-pclmul-consts.h | 99 +++++ + arch/x86/lib/crc-pclmul-template.S | 584 ++++++++++++++++++++++++++++ + arch/x86/lib/crc-pclmul-template.h | 81 ++++ + arch/x86/lib/crc-t10dif-glue.c | 23 +- + arch/x86/lib/crc16-msb-pclmul.S | 6 + + arch/x86/lib/crc32-glue.c | 51 +-- + arch/x86/lib/crc32-pclmul.S | 219 +---------- + arch/x86/lib/crct10dif-pcl-asm_64.S | 332 ---------------- + drivers/nvme/host/Kconfig | 3 +- + drivers/nvme/host/tcp.c | 122 ++---- + drivers/nvme/target/tcp.c | 90 ++--- + include/linux/skbuff.h | 7 +- + net/core/datagram.c | 46 +-- + scripts/gen-crc-consts.py | 238 ++++++++++++ + 20 files changed, 1143 insertions(+), 808 deletions(-) + create mode 100644 arch/x86/lib/crc-pclmul-consts.h + create mode 100644 arch/x86/lib/crc-pclmul-template.S + create mode 
100644 arch/x86/lib/crc-pclmul-template.h + create mode 100644 arch/x86/lib/crc16-msb-pclmul.S + delete mode 100644 arch/x86/lib/crct10dif-pcl-asm_64.S + create mode 100755 scripts/gen-crc-consts.py + +diff --git a/MAINTAINERS b/MAINTAINERS +index c0d5232a473b..ed22cbce79af 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -6140,6 +6140,7 @@ F: Documentation/staging/crc* + F: arch/*/lib/crc* + F: include/linux/crc* + F: lib/crc* ++F: scripts/gen-crc-consts.py + + CREATIVE SB0540 + M: Bastien Nocera +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 473364353bd9..500584609508 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -77,7 +77,7 @@ config X86 + select ARCH_HAS_CPU_FINALIZE_INIT + select ARCH_HAS_CPU_PASID if IOMMU_SVA + select ARCH_HAS_CRC32 +- select ARCH_HAS_CRC_T10DIF if X86_64 ++ select ARCH_HAS_CRC_T10DIF + select ARCH_HAS_CURRENT_STACK_POINTER + select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index 11e95fc62636..3e9ab5cdade4 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -1536,26 +1536,6 @@ DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, + AES_GCM_KEY_AVX10_SIZE, 800); + #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + +-/* +- * This is a list of CPU models that are known to suffer from downclocking when +- * zmm registers (512-bit vectors) are used. On these CPUs, the AES mode +- * implementations with zmm registers won't be used by default. Implementations +- * with ymm registers (256-bit vectors) will be used by default instead. +- */ +-static const struct x86_cpu_id zmm_exclusion_list[] = { +- X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), +- X86_MATCH_VFM(INTEL_ICELAKE_X, 0), +- X86_MATCH_VFM(INTEL_ICELAKE_D, 0), +- X86_MATCH_VFM(INTEL_ICELAKE, 0), +- X86_MATCH_VFM(INTEL_ICELAKE_L, 0), +- X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0), +- X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0), +- X86_MATCH_VFM(INTEL_TIGERLAKE, 0), +- /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ +- /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). 
*/ +- {}, +-}; +- + static int __init register_avx_algs(void) + { + int err; +@@ -1600,7 +1580,7 @@ static int __init register_avx_algs(void) + if (err) + return err; + +- if (x86_match_cpu(zmm_exclusion_list)) { ++ if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { + int i; + + aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 4c38c9b9c69d..97d7617cab1e 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -485,6 +485,7 @@ + #define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ + #define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32 + 7) /* Workload Classification */ + #define X86_FEATURE_INDIRECT_THUNK_ITS (21*32 + 8) /* Use thunk for indirect branches in lower half of cacheline */ ++#define X86_FEATURE_PREFER_YMM (21*32 + 9) /* Avoid ZMM registers due to downclocking */ + + /* + * BUG word(s) +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index 134368a3f4b1..5fe563eeb17d 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -521,6 +521,25 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); + } + ++/* ++ * This is a list of Intel CPUs that are known to suffer from downclocking when ++ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel ++ * executes SIMD-optimized code such as cryptography functions or CRCs, it ++ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code. ++ */ ++static const struct x86_cpu_id zmm_exclusion_list[] = { ++ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE_X, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE_D, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE_L, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0), ++ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0), ++ X86_MATCH_VFM(INTEL_TIGERLAKE, 0), ++ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ ++ {}, ++}; ++ + static void init_intel(struct cpuinfo_x86 *c) + { + early_init_intel(c); +@@ -601,6 +620,9 @@ static void init_intel(struct cpuinfo_x86 *c) + } + #endif + ++ if (x86_match_cpu(zmm_exclusion_list)) ++ set_cpu_cap(c, X86_FEATURE_PREFER_YMM); ++ + /* Work around errata */ + srat_detect_node(c); + +diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile +index 8a59c61624c2..08496e221a7d 100644 +--- a/arch/x86/lib/Makefile ++++ b/arch/x86/lib/Makefile +@@ -43,7 +43,7 @@ crc32-x86-y := crc32-glue.o crc32-pclmul.o + crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o + + obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o +-crc-t10dif-x86-y := crc-t10dif-glue.o crct10dif-pcl-asm_64.o ++crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o + + obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o + obj-y += iomem.o +diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h +new file mode 100644 +index 000000000000..089954988f97 +--- /dev/null ++++ b/arch/x86/lib/crc-pclmul-consts.h +@@ -0,0 +1,99 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++/* ++ * CRC constants generated by: ++ * ++ * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320 ++ * ++ * Do not edit manually. 
++ */ ++ ++/* ++ * CRC folding constants generated for most-significant-bit-first CRC-16 using ++ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 ++ */ ++static const struct { ++ u8 bswap_mask[16]; ++ u64 fold_across_2048_bits_consts[2]; ++ u64 fold_across_1024_bits_consts[2]; ++ u64 fold_across_512_bits_consts[2]; ++ u64 fold_across_256_bits_consts[2]; ++ u64 fold_across_128_bits_consts[2]; ++ u8 shuf_table[48]; ++ u64 barrett_reduction_consts[2]; ++} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = { ++ .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, ++ .fold_across_2048_bits_consts = { ++ 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */ ++ 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */ ++ }, ++ .fold_across_1024_bits_consts = { ++ 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */ ++ 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */ ++ }, ++ .fold_across_512_bits_consts = { ++ 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */ ++ 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */ ++ }, ++ .fold_across_256_bits_consts = { ++ 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */ ++ 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */ ++ }, ++ .fold_across_128_bits_consts = { ++ 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */ ++ 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */ ++ }, ++ .shuf_table = { ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ }, ++ .barrett_reduction_consts = { ++ 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */ ++ 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */ ++ }, ++}; ++ ++/* ++ * CRC folding constants generated for least-significant-bit-first CRC-32 using ++ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + ++ * x^5 + x^4 + x^2 + x^1 + x^0 ++ */ ++static const struct { ++ u64 fold_across_2048_bits_consts[2]; ++ u64 fold_across_1024_bits_consts[2]; ++ u64 fold_across_512_bits_consts[2]; ++ u64 fold_across_256_bits_consts[2]; ++ u64 fold_across_128_bits_consts[2]; ++ u8 shuf_table[48]; ++ u64 barrett_reduction_consts[2]; ++} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = { ++ .fold_across_2048_bits_consts = { ++ 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */ ++ 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */ ++ }, ++ .fold_across_1024_bits_consts = { ++ 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */ ++ 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */ ++ }, ++ .fold_across_512_bits_consts = { ++ 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */ ++ 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */ ++ }, ++ .fold_across_256_bits_consts = { ++ 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */ ++ 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */ ++ }, ++ .fold_across_128_bits_consts = { ++ 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */ ++ 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */ ++ }, ++ .shuf_table = { ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ }, ++ .barrett_reduction_consts = { ++ 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */ ++ 
0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */ ++ }, ++}; +diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S +new file mode 100644 +index 000000000000..dc91cc074b30 +--- /dev/null ++++ b/arch/x86/lib/crc-pclmul-template.S +@@ -0,0 +1,584 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++// ++// Template to generate [V]PCLMULQDQ-based CRC functions for x86 ++// ++// Copyright 2025 Google LLC ++// ++// Author: Eric Biggers ++ ++#include ++ ++// Offsets within the generated constants table ++.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only ++.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next ++.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next ++.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next ++.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next ++.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0 ++.set OFFSETOF_SHUF_TABLE, 1*16 ++.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16 ++ ++// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the ++// corresponding non-VEX instruction plus any needed moves. The supported ++// instruction formats are: ++// ++// - Two-arg [src, dst], where the non-VEX format is the same. ++// - Three-arg [src1, src2, dst] where the non-VEX format is ++// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too. ++// ++// \insn gives the instruction without a "v" prefix and including any immediate ++// argument if needed to make the instruction follow one of the above formats. ++// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to ++// it first; this is needed when \arg1 is an unaligned mem operand. ++.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp ++.if AVX_LEVEL == 0 ++ // VEX not allowed. Emulate it. ++ .ifnb \arg3 // Three-arg [src1, src2, dst] ++ .ifc "\arg2", "\arg3" // src2 == dst? ++ .ifnb \unaligned_mem_tmp ++ movdqu \arg1, \unaligned_mem_tmp ++ \insn \unaligned_mem_tmp, \arg3 ++ .else ++ \insn \arg1, \arg3 ++ .endif ++ .else // src2 != dst ++ .ifc "\arg1", "\arg3" ++ .error "Can't have src1 == dst when src2 != dst" ++ .endif ++ .ifnb \unaligned_mem_tmp ++ movdqu \arg1, \unaligned_mem_tmp ++ movdqa \arg2, \arg3 ++ \insn \unaligned_mem_tmp, \arg3 ++ .else ++ movdqa \arg2, \arg3 ++ \insn \arg1, \arg3 ++ .endif ++ .endif ++ .else // Two-arg [src, dst] ++ .ifnb \unaligned_mem_tmp ++ movdqu \arg1, \unaligned_mem_tmp ++ \insn \unaligned_mem_tmp, \arg2 ++ .else ++ \insn \arg1, \arg2 ++ .endif ++ .endif ++.else ++ // VEX is allowed. Emit the desired instruction directly. ++ .ifnb \arg3 ++ v\insn \arg1, \arg2, \arg3 ++ .else ++ v\insn \arg1, \arg2 ++ .endif ++.endif ++.endm ++ ++// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector ++// register of length VL. ++.macro _vbroadcast src, dst ++.if VL == 16 ++ _cond_vex movdqa, \src, \dst ++.elseif VL == 32 ++ vbroadcasti128 \src, \dst ++.else ++ vbroadcasti32x4 \src, \dst ++.endif ++.endm ++ ++// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC ++// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane. 
++.macro _load_data vl, src, bswap_mask, dst ++.if \vl < 64 ++ _cond_vex movdqu, "\src", \dst ++.else ++ vmovdqu8 \src, \dst ++.endif ++.if !LSB_CRC ++ _cond_vex pshufb, \bswap_mask, \dst, \dst ++.endif ++.endm ++ ++.macro _prepare_v0 vl, v0, v1, bswap_mask ++.if LSB_CRC ++ .if \vl < 64 ++ _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1 ++ .else ++ vpxorq (BUF), \v0, \v0 ++ .endif ++.else ++ _load_data \vl, (BUF), \bswap_mask, \v1 ++ .if \vl < 64 ++ _cond_vex pxor, \v1, \v0, \v0 ++ .else ++ vpxorq \v1, \v0, \v0 ++ .endif ++.endif ++.endm ++ ++// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for ++// msb-first order or the physically high qword for lsb-first order ++#define LO64_TERMS 0 ++ ++// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high ++// qword for msb-first order or the physically low qword for lsb-first order ++#define HI64_TERMS 1 ++ ++// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given ++// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst. ++.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst ++ _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \ ++ \src1, \src2, \dst ++.endm ++ ++// Fold \acc into \data and store the result back into \acc. \data can be an ++// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no ++// byte-reflection is needed; otherwise it must be a vector register. \consts ++// is a vector register containing the needed fold constants, and \tmp is a ++// temporary vector register. All arguments must be the same length. ++.macro _fold_vec acc, data, consts, tmp ++ _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp ++ _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc ++.if AVX_LEVEL < 10 ++ _cond_vex pxor, \data, \tmp, \tmp ++ _cond_vex pxor, \tmp, \acc, \acc ++.else ++ vpternlogq $0x96, \data, \tmp, \acc ++.endif ++.endm ++ ++// Fold \acc into \data and store the result back into \acc. \data is an ++// unaligned mem operand, \consts is a vector register containing the needed ++// fold constants, \bswap_mask is a vector register containing the ++// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are ++// temporary vector registers. All arguments must have length \vl. ++.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2 ++.if AVX_LEVEL == 0 || !LSB_CRC ++ _load_data \vl, \data, \bswap_mask, \tmp1 ++ _fold_vec \acc, \tmp1, \consts, \tmp2 ++.else ++ _fold_vec \acc, \data, \consts, \tmp1 ++.endif ++.endm ++ ++// Load the constants for folding across 2**i vectors of length VL at a time ++// into all 128-bit lanes of the vector register CONSTS. ++.macro _load_vec_folding_consts i ++ _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \ ++ CONSTS ++.endm ++ ++// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store ++// the result back into \v0. If the remaining length mod \vl is nonzero, also ++// fold \vl data bytes from BUF. For both operations the fold distance is \vl. ++// \consts must be a register of length \vl containing the fold constants. 
++.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2 ++ _fold_vec \v0, \v1, \consts, \tmp1 ++ test $\vl, LEN8 ++ jz .Lfold_vec_final_done\@ ++ _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2 ++ add $\vl, BUF ++.Lfold_vec_final_done\@: ++.endm ++ ++// This macro generates the body of a CRC function with the following prototype: ++// ++// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts); ++// ++// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it. ++// |buf| is the data to checksum. |len| is the data length in bytes, which must ++// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts ++// field of the constants struct that was generated for the chosen CRC variant. ++// ++// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g. ++// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If ++// the file is compiled in i386 mode, then the maximum supported value is 32. ++// ++// \lsb_crc is 1 if the CRC processes the least significant bit of each byte ++// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0 ++// if the CRC processes the most significant bit of each byte first, i.e. maps ++// bit0 to x^0, bit1 to x^1, bit7 to x^7. ++// ++// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64. ++// ++// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or ++// 10 for AVX10 or AVX512. ++// ++// If \vl == 16 && \avx_level == 0, the generated code requires: ++// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.) ++// ++// If \vl == 32 && \avx_level == 2, the generated code requires: ++// VPCLMULQDQ && AVX2. ++// ++// If \vl == 32 && \avx_level == 10, the generated code requires: ++// VPCLMULQDQ && (AVX10/256 || (AVX512BW && AVX512VL)) ++// ++// If \vl == 64 && \avx_level == 10, the generated code requires: ++// VPCLMULQDQ && (AVX10/512 || (AVX512BW && AVX512VL)) ++// ++// Other \vl and \avx_level combinations are either not supported or not useful. ++.macro _crc_pclmul n, lsb_crc, vl, avx_level ++ .set LSB_CRC, \lsb_crc ++ .set VL, \vl ++ .set AVX_LEVEL, \avx_level ++ ++ // Define aliases for the xmm, ymm, or zmm registers according to VL. ++.irp i, 0,1,2,3,4,5,6,7 ++ .if VL == 16 ++ .set V\i, %xmm\i ++ .set LOG2_VL, 4 ++ .elseif VL == 32 ++ .set V\i, %ymm\i ++ .set LOG2_VL, 5 ++ .elseif VL == 64 ++ .set V\i, %zmm\i ++ .set LOG2_VL, 6 ++ .else ++ .error "Unsupported vector length" ++ .endif ++.endr ++ // Define aliases for the function parameters. ++ // Note: when crc_t is shorter than u32, zero-extension to 32 bits is ++ // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed ++ // when crc_t is shorter than u64. ++#ifdef __x86_64__ ++.if \n <= 32 ++ .set CRC, %edi ++.else ++ .set CRC, %rdi ++.endif ++ .set BUF, %rsi ++ .set LEN, %rdx ++ .set LEN32, %edx ++ .set LEN8, %dl ++ .set CONSTS_PTR, %rcx ++#else ++ // 32-bit support, assuming -mregparm=3 and not including support for ++ // CRC-64 (which would use both eax and edx to pass the crc parameter). ++ .set CRC, %eax ++ .set BUF, %edx ++ .set LEN, %ecx ++ .set LEN32, %ecx ++ .set LEN8, %cl ++ .set CONSTS_PTR, %ebx // Passed on stack ++#endif ++ ++ // Define aliases for some local variables. V0-V5 are used without ++ // aliases (for accumulators, data, temporary values, etc). 
Staying ++ // within the first 8 vector registers keeps the code 32-bit SSE ++ // compatible and reduces the size of 64-bit SSE code slightly. ++ .set BSWAP_MASK, V6 ++ .set BSWAP_MASK_YMM, %ymm6 ++ .set BSWAP_MASK_XMM, %xmm6 ++ .set CONSTS, V7 ++ .set CONSTS_YMM, %ymm7 ++ .set CONSTS_XMM, %xmm7 ++ ++#ifdef __i386__ ++ push CONSTS_PTR ++ mov 8(%esp), CONSTS_PTR ++#endif ++ ++ // Create a 128-bit vector that contains the initial CRC in the end ++ // representing the high-order polynomial coefficients, and the rest 0. ++ // If the CRC is msb-first, also load the byte-reflection table. ++.if \n <= 32 ++ _cond_vex movd, CRC, %xmm0 ++.else ++ _cond_vex movq, CRC, %xmm0 ++.endif ++.if !LSB_CRC ++ _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0 ++ _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK ++.endif ++ ++ // Load the first vector of data and XOR the initial CRC into the ++ // appropriate end of the first 128-bit lane of data. If LEN < VL, then ++ // use a short vector and jump ahead to the final reduction. (LEN >= 16 ++ // is guaranteed here but not necessarily LEN >= VL.) ++.if VL >= 32 ++ cmp $VL, LEN ++ jae .Lat_least_1vec\@ ++ .if VL == 64 ++ cmp $32, LEN32 ++ jb .Lless_than_32bytes\@ ++ _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM ++ add $32, BUF ++ jmp .Lreduce_256bits_to_128bits\@ ++.Lless_than_32bytes\@: ++ .endif ++ _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM ++ add $16, BUF ++ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM ++ jmp .Lcheck_for_partial_block\@ ++.Lat_least_1vec\@: ++.endif ++ _prepare_v0 VL, V0, V1, BSWAP_MASK ++ ++ // Handle VL <= LEN < 4*VL. ++ cmp $4*VL-1, LEN ++ ja .Lat_least_4vecs\@ ++ add $VL, BUF ++ // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector. ++ // If VL==16 then load fold_across_128_bits_consts first, as the final ++ // reduction depends on it and it won't be loaded anywhere else. ++ cmp $2*VL-1, LEN32 ++.if VL == 16 ++ _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM ++.endif ++ jbe .Lreduce_1vec_to_128bits\@ ++ // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to ++ // the reduction from 2 vectors. ++ _load_data VL, (BUF), BSWAP_MASK, V1 ++ add $VL, BUF ++ jmp .Lreduce_2vecs_to_1\@ ++ ++.Lat_least_4vecs\@: ++ // Load 3 more vectors of data. ++ _load_data VL, 1*VL(BUF), BSWAP_MASK, V1 ++ _load_data VL, 2*VL(BUF), BSWAP_MASK, V2 ++ _load_data VL, 3*VL(BUF), BSWAP_MASK, V3 ++ sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32 ++ add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32 ++ ++ // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next ++ // 4 vectors of data and write the result back to V0-V3. ++ cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32 ++ jbe .Lreduce_4vecs_to_2\@ ++ _load_vec_folding_consts 2 ++.Lfold_4vecs_loop\@: ++ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ sub $-4*VL, BUF ++ add $-4*VL, LEN ++ cmp $4*VL-1, LEN ++ ja .Lfold_4vecs_loop\@ ++ ++ // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold ++ // two more vectors of data from BUF, if at least that much remains. 
++.Lreduce_4vecs_to_2\@: ++ _load_vec_folding_consts 1 ++ _fold_vec V0, V2, CONSTS, V4 ++ _fold_vec V1, V3, CONSTS, V4 ++ test $2*VL, LEN8 ++ jz .Lreduce_2vecs_to_1\@ ++ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ sub $-2*VL, BUF ++ ++ // Fold V0 into V1 and write the result back to V0. Then fold one more ++ // vector of data from BUF, if at least that much remains. ++.Lreduce_2vecs_to_1\@: ++ _load_vec_folding_consts 0 ++ _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5 ++ ++.Lreduce_1vec_to_128bits\@: ++.if VL == 64 ++ // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of ++ // data from BUF, if at least that much remains. ++ vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM ++ vextracti64x4 $1, %zmm0, %ymm1 ++ _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5 ++.Lreduce_256bits_to_128bits\@: ++.endif ++.if VL >= 32 ++ // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of ++ // data from BUF, if at least that much remains. ++ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM ++ vextracti128 $1, %ymm0, %xmm1 ++ _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5 ++.Lcheck_for_partial_block\@: ++.endif ++ and $15, LEN32 ++ jz .Lreduce_128bits_to_crc\@ ++ ++ // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now ++ // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0 ++ // and B is the polynomial of the remaining LEN data bytes. To reduce ++ // this to 128 bits without needing fold constants for each possible ++ // LEN, rearrange this expression into C1*(x^128) + C2, where ++ // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128. ++ // Then fold C1 into C2, which is just another fold across 128 bits. ++ ++.if !LSB_CRC || AVX_LEVEL == 0 ++ // Load the last 16 data bytes. Note that originally LEN was >= 16. ++ _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2 ++.endif // Else will use vpblendvb mem operand later. ++.if !LSB_CRC ++ neg LEN // Needed for indexing shuf_table ++.endif ++ ++ // tmp = A*x^(8*LEN) mod x^128 ++ // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1] ++ // i.e. right-shift by LEN bytes. ++ // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN] ++ // i.e. left-shift by LEN bytes. ++ _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3 ++ _cond_vex pshufb, %xmm3, %xmm0, %xmm1 ++ ++ // C1 = floor(A / x^(128 - 8*LEN)) ++ // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1] ++ // i.e. left-shift by 16-LEN bytes. ++ // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1] ++ // i.e. right-shift by 16-LEN bytes. ++ _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \ ++ %xmm0, %xmm0, unaligned_mem_tmp=%xmm4 ++ ++ // C2 = tmp + B. This is just a blend of tmp with the last 16 data ++ // bytes (reflected if msb-first). The blend mask is the shuffle table ++ // that was used to create tmp. 0 selects tmp, and 1 last16databytes. ++.if AVX_LEVEL == 0 ++ movdqa %xmm0, %xmm4 ++ movdqa %xmm3, %xmm0 ++ pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand ++ movdqa %xmm4, %xmm0 ++.elseif LSB_CRC ++ vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1 ++.else ++ vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 ++.endif ++ ++ // Fold C1 into C2 and store the 128-bit result in %xmm0. ++ _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4 ++ ++.Lreduce_128bits_to_crc\@: ++ // Compute the CRC as %xmm0 * x^n mod G. 
Here %xmm0 means the 128-bit ++ // polynomial stored in %xmm0 (using either lsb-first or msb-first bit ++ // order according to LSB_CRC), and G is the CRC's generator polynomial. ++ ++ // First, multiply %xmm0 by x^n and reduce the result to 64+n bits: ++ // ++ // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) + ++ // x^n * (%xmm0 mod x^64) ++ // ++ // Store t0 * x^(64-n) in %xmm0. I.e., actually do: ++ // ++ // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) + ++ // x^64 * (%xmm0 mod x^64) ++ // ++ // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned ++ // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily ++ // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the ++ // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case ++ // (considering the extra factor of x that gets implicitly introduced by ++ // each pclmulqdq when using lsb-first order), is identical to the ++ // constant that was used earlier for folding the LO64_TERMS across 128 ++ // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM. ++ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1 ++.if LSB_CRC ++ _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) ++.else ++ _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) ++.endif ++ _cond_vex pxor, %xmm1, %xmm0, %xmm0 ++ // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n). ++ // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n). ++ ++ // First step of Barrett reduction: Compute floor(t0 / G). This is the ++ // polynomial by which G needs to be multiplied to cancel out the x^n ++ // and higher terms of t0, i.e. to reduce t0 mod G. First do: ++ // ++ // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n) ++ // ++ // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in ++ // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest ++ // value that makes enough precision be carried through the calculation. ++ // ++ // The '* x' makes it so the result is floor(t1 / x^64) rather than ++ // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it ++ // can be extracted much more easily in the next step. In the lsb-first ++ // case the '* x' happens implicitly. In the msb-first case it must be ++ // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the ++ // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and ++ // the multiplication by the x^64 term is handled using a pxor. The ++ // pxor causes the low 64 terms of t1 to be wrong, but they are unused. ++ _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM ++ _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1 ++.if !LSB_CRC ++ _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n) ++.endif ++ // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G). ++ ++ // Second step of Barrett reduction: Cancel out the x^n and higher terms ++ // of t0 by subtracting the needed multiple of G. This gives the CRC: ++ // ++ // crc := t0 - (G * floor(t0 / G)) ++ // ++ // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do: ++ // ++ // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n) ++ // ++ // Furthermore, since the resulting CRC is n-bit, if mod x^n is ++ // explicitly applied to it then the x^n term of G makes no difference ++ // in the result and can be omitted. This helps keep the constant ++ // multiplier in 64 bits in most cases. 
This gives the following: ++ // ++ // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G)) ++ // crc := (%xmm0 / x^(64-n)) mod x^n ++ // ++ // In the lsb-first case, each pclmulqdq implicitly introduces ++ // an extra factor of x, so in that case the constant that needs to be ++ // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63. ++ // For lsb-first CRCs where n=64, the extra factor of x cannot be as ++ // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to ++ // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC ++ // polynomials have nonzero x^n and x^0 terms.) It works out as: the ++ // CRC has be XORed with the physically low qword of %xmm1, representing ++ // floor(t0 / G). The most efficient way to do that is to move it to ++ // the physically high qword and use a ternlog to combine the two XORs. ++.if LSB_CRC && \n == 64 ++ _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2 ++ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 ++ .if AVX_LEVEL < 10 ++ _cond_vex pxor, %xmm2, %xmm0, %xmm0 ++ _cond_vex pxor, %xmm1, %xmm0, %xmm0 ++ .else ++ vpternlogq $0x96, %xmm2, %xmm1, %xmm0 ++ .endif ++ _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64 ++.else ++ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 ++ _cond_vex pxor, %xmm1, %xmm0, %xmm0 ++ .if \n == 8 ++ _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8 ++ .elseif \n == 16 ++ _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16 ++ .elseif \n == 32 ++ _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32 ++ .else // \n == 64 && !LSB_CRC ++ _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64 ++ .endif ++.endif ++ ++.if VL > 16 ++ vzeroupper // Needed when ymm or zmm registers may have been used. 
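As a cross-check of the two-step Barrett reduction described in the comments above, here is a hypothetical Python walk-through of the msb-first case for n = 32 (not part of the patch). The 128-bit input is an arbitrary value, the polynomial helpers are reimplemented equivalents of those in scripts/gen-crc-consts.py, and the lsb-first variant would additionally have to compensate for the implicit factor of x per pclmulqdq, as explained above.

    def clmul(a, b):                         # carryless (GF(2)) multiplication
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def polymod(a, g):                       # a mod g over GF(2)
        while a.bit_length() >= g.bit_length():
            a ^= g << (a.bit_length() - g.bit_length())
        return a

    def polydiv(a, g):                       # floor(a / g) over GF(2)
        q = 0
        while a.bit_length() >= g.bit_length():
            sh = a.bit_length() - g.bit_length()
            q ^= 1 << sh
            a ^= g << sh
        return q

    n = 32
    G = 0x104C11DB7                          # CRC-32 generator polynomial
    A = 0x0123456789ABCDEF0F1E2D3C4B5A6978   # the 128-bit value in %xmm0 (arbitrary)

    # Multiply by x^n and reduce to at most 64+n bits (first pclmulqdq + shift + xor).
    t0 = clmul(polymod(1 << (64 + n), G), A >> 64) ^ ((A & (2**64 - 1)) << n)

    # Barrett step 1: q = floor(t0 / G), using the precomputed floor(x^(63+n) / G).
    q = clmul(t0 >> n, polydiv(1 << (63 + n), G)) >> 63

    # Barrett step 2: cancel the x^n and higher terms; the low n bits are the CRC.
    crc = (t0 ^ clmul(q, G)) & ((1 << n) - 1)

    assert crc == polymod(A << n, G)         # matches reducing A * x^n directly
    print(hex(crc))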
++.endif ++#ifdef __i386__ ++ pop CONSTS_PTR ++#endif ++ RET ++.endm ++ ++#ifdef CONFIG_AS_VPCLMULQDQ ++#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ ++SYM_FUNC_START(prefix##_pclmul_sse); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ ++SYM_FUNC_END(prefix##_pclmul_sse); \ ++ \ ++SYM_FUNC_START(prefix##_vpclmul_avx2); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \ ++SYM_FUNC_END(prefix##_vpclmul_avx2); \ ++ \ ++SYM_FUNC_START(prefix##_vpclmul_avx10_256); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=10; \ ++SYM_FUNC_END(prefix##_vpclmul_avx10_256); \ ++ \ ++SYM_FUNC_START(prefix##_vpclmul_avx10_512); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=10; \ ++SYM_FUNC_END(prefix##_vpclmul_avx10_512); ++#else ++#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ ++SYM_FUNC_START(prefix##_pclmul_sse); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ ++SYM_FUNC_END(prefix##_pclmul_sse); ++#endif // !CONFIG_AS_VPCLMULQDQ +diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h +new file mode 100644 +index 000000000000..7b89f0edbc17 +--- /dev/null ++++ b/arch/x86/lib/crc-pclmul-template.h +@@ -0,0 +1,81 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++/* ++ * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are ++ * instantiated by crc-pclmul-template.S ++ * ++ * Copyright 2025 Google LLC ++ * ++ * Author: Eric Biggers ++ */ ++#ifndef _CRC_PCLMUL_TEMPLATE_H ++#define _CRC_PCLMUL_TEMPLATE_H ++ ++#include ++#include ++#include ++#include ++#include "crc-pclmul-consts.h" ++ ++#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \ ++crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \ ++ const void *consts_ptr); \ ++crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \ ++ const void *consts_ptr); \ ++crc_t prefix##_vpclmul_avx10_256(crc_t crc, const u8 *p, size_t len, \ ++ const void *consts_ptr); \ ++crc_t prefix##_vpclmul_avx10_512(crc_t crc, const u8 *p, size_t len, \ ++ const void *consts_ptr); \ ++DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse) ++ ++#define INIT_CRC_PCLMUL(prefix) \ ++do { \ ++ if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \ ++ boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \ ++ boot_cpu_has(X86_FEATURE_AVX2) && \ ++ cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \ ++ if (boot_cpu_has(X86_FEATURE_AVX512BW) && \ ++ boot_cpu_has(X86_FEATURE_AVX512VL) && \ ++ cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \ ++ if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) \ ++ static_call_update(prefix##_pclmul, \ ++ prefix##_vpclmul_avx10_256); \ ++ else \ ++ static_call_update(prefix##_pclmul, \ ++ prefix##_vpclmul_avx10_512); \ ++ } else { \ ++ static_call_update(prefix##_pclmul, \ ++ prefix##_vpclmul_avx2); \ ++ } \ ++ } \ ++} while (0) ++ ++/* ++ * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16 ++ * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD. ++ * ++ * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions. ++ * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(), ++ * varying by CPU and factors such as which parts of the "FPU" state userspace ++ * has touched, which could result in a larger cutoff being better. Indeed, a ++ * larger cutoff is usually better for a *single* message. However, the ++ * overhead of the FPU section gets amortized if multiple FPU sections get ++ * executed before returning to userspace, since the XSAVE and XRSTOR occur only ++ * once. 
Considering that and the fact that the [V]PCLMULQDQ code is lighter on ++ * the dcache than the table-based code is, a 16-byte cutoff seems to work well. ++ */ ++#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ ++do { \ ++ if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ ++ crypto_simd_usable()) { \ ++ const void *consts_ptr; \ ++ \ ++ consts_ptr = (consts).fold_across_128_bits_consts; \ ++ kernel_fpu_begin(); \ ++ crc = static_call(prefix##_pclmul)((crc), (p), (len), \ ++ consts_ptr); \ ++ kernel_fpu_end(); \ ++ return crc; \ ++ } \ ++} while (0) ++ ++#endif /* _CRC_PCLMUL_TEMPLATE_H */ +diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c +index 13f07ddc9122..6b09374b8355 100644 +--- a/arch/x86/lib/crc-t10dif-glue.c ++++ b/arch/x86/lib/crc-t10dif-glue.c +@@ -1,37 +1,32 @@ + // SPDX-License-Identifier: GPL-2.0-or-later + /* +- * CRC-T10DIF using PCLMULQDQ instructions ++ * CRC-T10DIF using [V]PCLMULQDQ instructions + * + * Copyright 2024 Google LLC + */ + +-#include +-#include +-#include + #include + #include ++#include "crc-pclmul-template.h" + + static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +-asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); ++DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); + + u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) + { +- if (len >= 16 && +- static_key_enabled(&have_pclmulqdq) && crypto_simd_usable()) { +- kernel_fpu_begin(); +- crc = crc_t10dif_pcl(crc, p, len); +- kernel_fpu_end(); +- return crc; +- } ++ CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, ++ have_pclmulqdq); + return crc_t10dif_generic(crc, p, len); + } + EXPORT_SYMBOL(crc_t10dif_arch); + + static int __init crc_t10dif_x86_init(void) + { +- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) ++ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); ++ INIT_CRC_PCLMUL(crc16_msb); ++ } + return 0; + } + arch_initcall(crc_t10dif_x86_init); +@@ -47,5 +42,5 @@ bool crc_t10dif_is_optimized(void) + } + EXPORT_SYMBOL(crc_t10dif_is_optimized); + +-MODULE_DESCRIPTION("CRC-T10DIF using PCLMULQDQ instructions"); ++MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions"); + MODULE_LICENSE("GPL"); +diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S +new file mode 100644 +index 000000000000..e9fe248093a8 +--- /dev/null ++++ b/arch/x86/lib/crc16-msb-pclmul.S +@@ -0,0 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++// Copyright 2025 Google LLC ++ ++#include "crc-pclmul-template.S" ++ ++DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0) +diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c +index 2dd18a886ded..5b2878c2f793 100644 +--- a/arch/x86/lib/crc32-glue.c ++++ b/arch/x86/lib/crc32-glue.c +@@ -7,43 +7,20 @@ + * Copyright 2024 Google LLC + */ + +-#include +-#include +-#include + #include +-#include + #include +- +-/* minimum size of buffer for crc32_pclmul_le_16 */ +-#define CRC32_PCLMUL_MIN_LEN 64 ++#include "crc-pclmul-template.h" + + static DEFINE_STATIC_KEY_FALSE(have_crc32); + static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +-u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); ++DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); + + u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) + { +- if (len >= CRC32_PCLMUL_MIN_LEN + 15 && +- static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { +- size_t n = -(uintptr_t)p & 15; +- +- /* align p to 16-byte boundary */ +- if (n) { +- crc = crc32_le_base(crc, p, n); 
+- p += n; +- len -= n; +- } +- n = round_down(len, 16); +- kernel_fpu_begin(); +- crc = crc32_pclmul_le_16(crc, p, n); +- kernel_fpu_end(); +- p += n; +- len -= n; +- } +- if (len) +- crc = crc32_le_base(crc, p, len); +- return crc; ++ CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, ++ have_pclmulqdq); ++ return crc32_le_base(crc, p, len); + } + EXPORT_SYMBOL(crc32_le_arch); + +@@ -78,10 +55,18 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len) + + for (num_longs = len / sizeof(unsigned long); + num_longs != 0; num_longs--, p += sizeof(unsigned long)) +- asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p)); ++ asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p)); + +- for (len %= sizeof(unsigned long); len; len--, p++) +- asm("crc32b %1, %0" : "+r" (crc) : "rm" (*p)); ++ if (sizeof(unsigned long) > 4 && (len & 4)) { ++ asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p)); ++ p += 4; ++ } ++ if (len & 2) { ++ asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p)); ++ p += 2; ++ } ++ if (len & 1) ++ asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p)); + + return crc; + } +@@ -97,8 +82,10 @@ static int __init crc32_x86_init(void) + { + if (boot_cpu_has(X86_FEATURE_XMM4_2)) + static_branch_enable(&have_crc32); +- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) ++ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); ++ INIT_CRC_PCLMUL(crc32_lsb); ++ } + return 0; + } + arch_initcall(crc32_x86_init); +diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S +index f9637789cac1..f20f40fb0172 100644 +--- a/arch/x86/lib/crc32-pclmul.S ++++ b/arch/x86/lib/crc32-pclmul.S +@@ -1,217 +1,6 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-/* +- * Copyright 2012 Xyratex Technology Limited +- * +- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 +- * calculation. 
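Aside (not from the patch): the reworked tail handling in crc32c_le_arch() above, which finishes the buffer with one 4-byte, one 2-byte and one 1-byte step instead of a per-byte loop, is valid because a CRC can be updated in arbitrarily sized chunks. A small bitwise reference model of CRC32C (Castagnoli polynomial, lsb-first) illustrates the property; the buffer contents and the chunk split are made up.

    CRC32C_POLY = 0x82F63B78          # reflected form of 0x1EDC6F41

    def crc32c_update(crc, data):     # one bit-at-a-time step per byte
        for byte in data:
            crc ^= byte
            for _ in range(8):
                crc = (crc >> 1) ^ (CRC32C_POLY if crc & 1 else 0)
        return crc

    def crc32c(data):                 # conventional init/final inversion
        return crc32c_update(0xFFFFFFFF, data) ^ 0xFFFFFFFF

    buf = bytes(range(1, 32))         # 31 bytes: three longs + 4 + 2 + 1 byte tail
    crc = 0xFFFFFFFF
    i = 0
    for size in (8, 8, 8, 4, 2, 1):   # mirrors the long/dword/word/byte split
        crc = crc32c_update(crc, buf[i:i + size])
        i += size
    assert (crc ^ 0xFFFFFFFF) == crc32c(buf)
    print(hex(crc ^ 0xFFFFFFFF))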
+- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) +- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found +- * at: +- * http://www.intel.com/products/processor/manuals/ +- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual +- * Volume 2B: Instruction Set Reference, N-Z +- * +- * Authors: Gregory Prestas +- * Alexander Boyko +- */ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++// Copyright 2025 Google LLC + +-#include ++#include "crc-pclmul-template.S" + +- +-.section .rodata +-.align 16 +-/* +- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 +- * #define CONSTANT_R1 0x154442bd4LL +- * +- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 +- * #define CONSTANT_R2 0x1c6e41596LL +- */ +-.Lconstant_R2R1: +- .octa 0x00000001c6e415960000000154442bd4 +-/* +- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 +- * #define CONSTANT_R3 0x1751997d0LL +- * +- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e +- * #define CONSTANT_R4 0x0ccaa009eLL +- */ +-.Lconstant_R4R3: +- .octa 0x00000000ccaa009e00000001751997d0 +-/* +- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 +- * #define CONSTANT_R5 0x163cd6124LL +- */ +-.Lconstant_R5: +- .octa 0x00000000000000000000000163cd6124 +-.Lconstant_mask32: +- .octa 0x000000000000000000000000FFFFFFFF +-/* +- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL +- * +- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL +- * #define CONSTANT_RU 0x1F7011641LL +- */ +-.Lconstant_RUpoly: +- .octa 0x00000001F701164100000001DB710641 +- +-#define CONSTANT %xmm0 +- +-#ifdef __x86_64__ +-#define CRC %edi +-#define BUF %rsi +-#define LEN %rdx +-#else +-#define CRC %eax +-#define BUF %edx +-#define LEN %ecx +-#endif +- +- +- +-.text +-/** +- * Calculate crc32 +- * CRC - initial crc32 +- * BUF - buffer (16 bytes aligned) +- * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63 +- * return %eax crc32 +- * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); +- */ +- +-SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ +- movdqa (BUF), %xmm1 +- movdqa 0x10(BUF), %xmm2 +- movdqa 0x20(BUF), %xmm3 +- movdqa 0x30(BUF), %xmm4 +- movd CRC, CONSTANT +- pxor CONSTANT, %xmm1 +- sub $0x40, LEN +- add $0x40, BUF +- cmp $0x40, LEN +- jb .Lless_64 +- +-#ifdef __x86_64__ +- movdqa .Lconstant_R2R1(%rip), CONSTANT +-#else +- movdqa .Lconstant_R2R1, CONSTANT +-#endif +- +-.Lloop_64:/* 64 bytes Full cache line folding */ +- prefetchnta 0x40(BUF) +- movdqa %xmm1, %xmm5 +- movdqa %xmm2, %xmm6 +- movdqa %xmm3, %xmm7 +-#ifdef __x86_64__ +- movdqa %xmm4, %xmm8 +-#endif +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x00, CONSTANT, %xmm2 +- pclmulqdq $0x00, CONSTANT, %xmm3 +-#ifdef __x86_64__ +- pclmulqdq $0x00, CONSTANT, %xmm4 +-#endif +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pclmulqdq $0x11, CONSTANT, %xmm6 +- pclmulqdq $0x11, CONSTANT, %xmm7 +-#ifdef __x86_64__ +- pclmulqdq $0x11, CONSTANT, %xmm8 +-#endif +- pxor %xmm5, %xmm1 +- pxor %xmm6, %xmm2 +- pxor %xmm7, %xmm3 +-#ifdef __x86_64__ +- pxor %xmm8, %xmm4 +-#else +- /* xmm8 unsupported for x32 */ +- movdqa %xmm4, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm4 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm4 +-#endif +- +- pxor (BUF), %xmm1 +- pxor 0x10(BUF), %xmm2 +- pxor 0x20(BUF), %xmm3 +- pxor 0x30(BUF), %xmm4 +- +- sub $0x40, LEN +- add $0x40, BUF +- cmp $0x40, LEN +- jge .Lloop_64 +-.Lless_64:/* Folding cache line into 128bit */ +-#ifdef __x86_64__ +- movdqa .Lconstant_R4R3(%rip), CONSTANT +-#else +- movdqa 
.Lconstant_R4R3, CONSTANT +-#endif +- prefetchnta (BUF) +- +- movdqa %xmm1, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm1 +- pxor %xmm2, %xmm1 +- +- movdqa %xmm1, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm1 +- pxor %xmm3, %xmm1 +- +- movdqa %xmm1, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm1 +- pxor %xmm4, %xmm1 +- +- cmp $0x10, LEN +- jb .Lfold_64 +-.Lloop_16:/* Folding rest buffer into 128bit */ +- movdqa %xmm1, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm1 +- pxor (BUF), %xmm1 +- sub $0x10, LEN +- add $0x10, BUF +- cmp $0x10, LEN +- jge .Lloop_16 +- +-.Lfold_64: +- /* perform the last 64 bit fold, also adds 32 zeroes +- * to the input stream */ +- pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ +- psrldq $0x08, %xmm1 +- pxor CONSTANT, %xmm1 +- +- /* final 32-bit fold */ +- movdqa %xmm1, %xmm2 +-#ifdef __x86_64__ +- movdqa .Lconstant_R5(%rip), CONSTANT +- movdqa .Lconstant_mask32(%rip), %xmm3 +-#else +- movdqa .Lconstant_R5, CONSTANT +- movdqa .Lconstant_mask32, %xmm3 +-#endif +- psrldq $0x04, %xmm2 +- pand %xmm3, %xmm1 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pxor %xmm2, %xmm1 +- +- /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +-#ifdef __x86_64__ +- movdqa .Lconstant_RUpoly(%rip), CONSTANT +-#else +- movdqa .Lconstant_RUpoly, CONSTANT +-#endif +- movdqa %xmm1, %xmm2 +- pand %xmm3, %xmm1 +- pclmulqdq $0x10, CONSTANT, %xmm1 +- pand %xmm3, %xmm1 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pxor %xmm2, %xmm1 +- pextrd $0x01, %xmm1, %eax +- +- RET +-SYM_FUNC_END(crc32_pclmul_le_16) ++DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1) +diff --git a/arch/x86/lib/crct10dif-pcl-asm_64.S b/arch/x86/lib/crct10dif-pcl-asm_64.S +deleted file mode 100644 +index 5286db5b8165..000000000000 +--- a/arch/x86/lib/crct10dif-pcl-asm_64.S ++++ /dev/null +@@ -1,332 +0,0 @@ +-######################################################################## +-# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +-# +-# Copyright (c) 2013, Intel Corporation +-# +-# Authors: +-# Erdinc Ozturk +-# Vinodh Gopal +-# James Guilford +-# Tim Chen +-# +-# This software is available to you under a choice of one of two +-# licenses. You may choose to be licensed under the terms of the GNU +-# General Public License (GPL) Version 2, available from the file +-# COPYING in the main directory of this source tree, or the +-# OpenIB.org BSD license below: +-# +-# Redistribution and use in source and binary forms, with or without +-# modification, are permitted provided that the following conditions are +-# met: +-# +-# * Redistributions of source code must retain the above copyright +-# notice, this list of conditions and the following disclaimer. +-# +-# * Redistributions in binary form must reproduce the above copyright +-# notice, this list of conditions and the following disclaimer in the +-# documentation and/or other materials provided with the +-# distribution. +-# +-# * Neither the name of the Intel Corporation nor the names of its +-# contributors may be used to endorse or promote products derived from +-# this software without specific prior written permission. 
+-# +-# +-# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +-# +-# Reference paper titled "Fast CRC Computation for Generic +-# Polynomials Using PCLMULQDQ Instruction" +-# URL: http://www.intel.com/content/dam/www/public/us/en/documents +-# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +-# +- +-#include +- +-.text +- +-#define init_crc %edi +-#define buf %rsi +-#define len %rdx +- +-#define FOLD_CONSTS %xmm10 +-#define BSWAP_MASK %xmm11 +- +-# Fold reg1, reg2 into the next 32 data bytes, storing the result back into +-# reg1, reg2. +-.macro fold_32_bytes offset, reg1, reg2 +- movdqu \offset(buf), %xmm9 +- movdqu \offset+16(buf), %xmm12 +- pshufb BSWAP_MASK, %xmm9 +- pshufb BSWAP_MASK, %xmm12 +- movdqa \reg1, %xmm8 +- movdqa \reg2, %xmm13 +- pclmulqdq $0x00, FOLD_CONSTS, \reg1 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm8 +- pclmulqdq $0x00, FOLD_CONSTS, \reg2 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm13 +- pxor %xmm9 , \reg1 +- xorps %xmm8 , \reg1 +- pxor %xmm12, \reg2 +- xorps %xmm13, \reg2 +-.endm +- +-# Fold src_reg into dst_reg. +-.macro fold_16_bytes src_reg, dst_reg +- movdqa \src_reg, %xmm8 +- pclmulqdq $0x11, FOLD_CONSTS, \src_reg +- pclmulqdq $0x00, FOLD_CONSTS, %xmm8 +- pxor %xmm8, \dst_reg +- xorps \src_reg, \dst_reg +-.endm +- +-# +-# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len); +-# +-# Assumes len >= 16. +-# +-SYM_FUNC_START(crc_t10dif_pcl) +- +- movdqa .Lbswap_mask(%rip), BSWAP_MASK +- +- # For sizes less than 256 bytes, we can't fold 128 bytes at a time. +- cmp $256, len +- jl .Lless_than_256_bytes +- +- # Load the first 128 data bytes. Byte swapping is necessary to make the +- # bit order match the polynomial coefficient order. +- movdqu 16*0(buf), %xmm0 +- movdqu 16*1(buf), %xmm1 +- movdqu 16*2(buf), %xmm2 +- movdqu 16*3(buf), %xmm3 +- movdqu 16*4(buf), %xmm4 +- movdqu 16*5(buf), %xmm5 +- movdqu 16*6(buf), %xmm6 +- movdqu 16*7(buf), %xmm7 +- add $128, buf +- pshufb BSWAP_MASK, %xmm0 +- pshufb BSWAP_MASK, %xmm1 +- pshufb BSWAP_MASK, %xmm2 +- pshufb BSWAP_MASK, %xmm3 +- pshufb BSWAP_MASK, %xmm4 +- pshufb BSWAP_MASK, %xmm5 +- pshufb BSWAP_MASK, %xmm6 +- pshufb BSWAP_MASK, %xmm7 +- +- # XOR the first 16 data *bits* with the initial CRC value. +- pxor %xmm8, %xmm8 +- pinsrw $7, init_crc, %xmm8 +- pxor %xmm8, %xmm0 +- +- movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS +- +- # Subtract 128 for the 128 data bytes just consumed. Subtract another +- # 128 to simplify the termination condition of the following loop. +- sub $256, len +- +- # While >= 128 data bytes remain (not counting xmm0-7), fold the 128 +- # bytes xmm0-7 into them, storing the result back into xmm0-7. 
+-.Lfold_128_bytes_loop: +- fold_32_bytes 0, %xmm0, %xmm1 +- fold_32_bytes 32, %xmm2, %xmm3 +- fold_32_bytes 64, %xmm4, %xmm5 +- fold_32_bytes 96, %xmm6, %xmm7 +- add $128, buf +- sub $128, len +- jge .Lfold_128_bytes_loop +- +- # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7. +- +- # Fold across 64 bytes. +- movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS +- fold_16_bytes %xmm0, %xmm4 +- fold_16_bytes %xmm1, %xmm5 +- fold_16_bytes %xmm2, %xmm6 +- fold_16_bytes %xmm3, %xmm7 +- # Fold across 32 bytes. +- movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS +- fold_16_bytes %xmm4, %xmm6 +- fold_16_bytes %xmm5, %xmm7 +- # Fold across 16 bytes. +- movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS +- fold_16_bytes %xmm6, %xmm7 +- +- # Add 128 to get the correct number of data bytes remaining in 0...127 +- # (not counting xmm7), following the previous extra subtraction by 128. +- # Then subtract 16 to simplify the termination condition of the +- # following loop. +- add $128-16, len +- +- # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes +- # xmm7 into them, storing the result back into xmm7. +- jl .Lfold_16_bytes_loop_done +-.Lfold_16_bytes_loop: +- movdqa %xmm7, %xmm8 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm7 +- pclmulqdq $0x00, FOLD_CONSTS, %xmm8 +- pxor %xmm8, %xmm7 +- movdqu (buf), %xmm0 +- pshufb BSWAP_MASK, %xmm0 +- pxor %xmm0 , %xmm7 +- add $16, buf +- sub $16, len +- jge .Lfold_16_bytes_loop +- +-.Lfold_16_bytes_loop_done: +- # Add 16 to get the correct number of data bytes remaining in 0...15 +- # (not counting xmm7), following the previous extra subtraction by 16. +- add $16, len +- je .Lreduce_final_16_bytes +- +-.Lhandle_partial_segment: +- # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16 +- # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do +- # this without needing a fold constant for each possible 'len', redivide +- # the bytes into a first chunk of 'len' bytes and a second chunk of 16 +- # bytes, then fold the first chunk into the second. +- +- movdqa %xmm7, %xmm2 +- +- # xmm1 = last 16 original data bytes +- movdqu -16(buf, len), %xmm1 +- pshufb BSWAP_MASK, %xmm1 +- +- # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes. +- lea .Lbyteshift_table+16(%rip), %rax +- sub len, %rax +- movdqu (%rax), %xmm0 +- pshufb %xmm0, %xmm2 +- +- # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes. +- pxor .Lmask1(%rip), %xmm0 +- pshufb %xmm0, %xmm7 +- +- # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes), +- # then '16-len' bytes from xmm2 (high-order bytes). +- pblendvb %xmm2, %xmm1 #xmm0 is implicit +- +- # Fold the first chunk into the second chunk, storing the result in xmm7. +- movdqa %xmm7, %xmm8 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm7 +- pclmulqdq $0x00, FOLD_CONSTS, %xmm8 +- pxor %xmm8, %xmm7 +- pxor %xmm1, %xmm7 +- +-.Lreduce_final_16_bytes: +- # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC +- +- # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. +- movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS +- +- # Fold the high 64 bits into the low 64 bits, while also multiplying by +- # x^64. This produces a 128-bit value congruent to x^64 * M(x) and +- # whose low 48 bits are 0. +- movdqa %xmm7, %xmm0 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x)) +- pslldq $8, %xmm0 +- pxor %xmm0, %xmm7 # + low bits * x^64 +- +- # Fold the high 32 bits into the low 96 bits. 
This produces a 96-bit +- # value congruent to x^64 * M(x) and whose low 48 bits are 0. +- movdqa %xmm7, %xmm0 +- pand .Lmask2(%rip), %xmm0 # zero high 32 bits +- psrldq $12, %xmm7 # extract high 32 bits +- pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x)) +- pxor %xmm0, %xmm7 # + low bits +- +- # Load G(x) and floor(x^48 / G(x)). +- movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS +- +- # Use Barrett reduction to compute the final CRC value. +- movdqa %xmm7, %xmm0 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x)) +- psrlq $32, %xmm7 # /= x^32 +- pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x) +- psrlq $48, %xmm0 +- pxor %xmm7, %xmm0 # + low 16 nonzero bits +- # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. +- +- pextrw $0, %xmm0, %eax +- RET +- +-.align 16 +-.Lless_than_256_bytes: +- # Checksumming a buffer of length 16...255 bytes +- +- # Load the first 16 data bytes. +- movdqu (buf), %xmm7 +- pshufb BSWAP_MASK, %xmm7 +- add $16, buf +- +- # XOR the first 16 data *bits* with the initial CRC value. +- pxor %xmm0, %xmm0 +- pinsrw $7, init_crc, %xmm0 +- pxor %xmm0, %xmm7 +- +- movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS +- cmp $16, len +- je .Lreduce_final_16_bytes # len == 16 +- sub $32, len +- jge .Lfold_16_bytes_loop # 32 <= len <= 255 +- add $16, len +- jmp .Lhandle_partial_segment # 17 <= len <= 31 +-SYM_FUNC_END(crc_t10dif_pcl) +- +-.section .rodata, "a", @progbits +-.align 16 +- +-# Fold constants precomputed from the polynomial 0x18bb7 +-# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +-.Lfold_across_128_bytes_consts: +- .quad 0x0000000000006123 # x^(8*128) mod G(x) +- .quad 0x0000000000002295 # x^(8*128+64) mod G(x) +-.Lfold_across_64_bytes_consts: +- .quad 0x0000000000001069 # x^(4*128) mod G(x) +- .quad 0x000000000000dd31 # x^(4*128+64) mod G(x) +-.Lfold_across_32_bytes_consts: +- .quad 0x000000000000857d # x^(2*128) mod G(x) +- .quad 0x0000000000007acc # x^(2*128+64) mod G(x) +-.Lfold_across_16_bytes_consts: +- .quad 0x000000000000a010 # x^(1*128) mod G(x) +- .quad 0x0000000000001faa # x^(1*128+64) mod G(x) +-.Lfinal_fold_consts: +- .quad 0x1368000000000000 # x^48 * (x^48 mod G(x)) +- .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x)) +-.Lbarrett_reduction_consts: +- .quad 0x0000000000018bb7 # G(x) +- .quad 0x00000001f65a57f8 # floor(x^48 / G(x)) +- +-.section .rodata.cst16.mask1, "aM", @progbits, 16 +-.align 16 +-.Lmask1: +- .octa 0x80808080808080808080808080808080 +- +-.section .rodata.cst16.mask2, "aM", @progbits, 16 +-.align 16 +-.Lmask2: +- .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF +- +-.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 +-.align 16 +-.Lbswap_mask: +- .octa 0x000102030405060708090A0B0C0D0E0F +- +-.section .rodata.cst32.byteshift_table, "aM", @progbits, 32 +-.align 16 +-# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len] +-# is the index vector to shift left by 'len' bytes, and is also {0x80, ..., +-# 0x80} XOR the index vector to shift right by '16 - len' bytes. 
+-.Lbyteshift_table: +- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 +- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f +- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 +- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 +diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig +index 09ed1f61c9a8..aa859464519e 100644 +--- a/drivers/nvme/host/Kconfig ++++ b/drivers/nvme/host/Kconfig +@@ -80,8 +80,7 @@ config NVME_TCP + depends on INET + depends on BLOCK + select NVME_FABRICS +- select CRYPTO +- select CRYPTO_CRC32C ++ select CRC32 + help + This provides support for the NVMe over Fabrics protocol using + the TCP transport. This allows you to use remote block devices +diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c +index d991baa82a1c..2a17b535bba6 100644 +--- a/drivers/nvme/host/tcp.c ++++ b/drivers/nvme/host/tcp.c +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -17,7 +18,6 @@ + #include + #include + #include +-#include + #include + #include + +@@ -169,8 +169,8 @@ struct nvme_tcp_queue { + bool hdr_digest; + bool data_digest; + bool tls_enabled; +- struct ahash_request *rcv_hash; +- struct ahash_request *snd_hash; ++ u32 rcv_crc; ++ u32 snd_crc; + __le32 exp_ddgst; + __le32 recv_ddgst; + struct completion tls_complete; +@@ -457,32 +457,29 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) + return req; + } + +-static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, +- __le32 *dgst) ++static inline void nvme_tcp_ddgst_init(u32 *crcp) + { +- ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); +- crypto_ahash_final(hash); ++ *crcp = ~0; + } + +-static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, ++static inline void nvme_tcp_ddgst_update(u32 *crcp, + struct page *page, off_t off, size_t len) + { +- struct scatterlist sg; ++ const void *virt = kmap_local_page(page + (off >> PAGE_SHIFT)); + +- sg_init_table(&sg, 1); +- sg_set_page(&sg, page, len, off); +- ahash_request_set_crypt(hash, &sg, NULL, len); +- crypto_ahash_update(hash); ++ *crcp = crc32c(*crcp, virt + (off & ~PAGE_MASK), len); ++ ++ kunmap_local(virt); + } + +-static inline void nvme_tcp_hdgst(struct ahash_request *hash, +- void *pdu, size_t len) ++static inline void nvme_tcp_ddgst_final(u32 *crcp, __le32 *dgst) + { +- struct scatterlist sg; ++ *dgst = cpu_to_le32(~*crcp); ++} + +- sg_init_one(&sg, pdu, len); +- ahash_request_set_crypt(hash, &sg, pdu + len, len); +- crypto_ahash_digest(hash); ++static inline void nvme_tcp_hdgst(void *pdu, size_t len) ++{ ++ put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len); + } + + static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, +@@ -500,7 +497,7 @@ static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); +- nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); ++ nvme_tcp_hdgst(pdu, pdu_len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + dev_err(queue->ctrl->ctrl.device, +@@ -527,7 +524,7 @@ static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) + nvme_tcp_queue_id(queue)); + return -EPROTO; + } +- crypto_ahash_init(queue->rcv_hash); ++ nvme_tcp_ddgst_init(&queue->rcv_crc); + + return 0; + } +@@ -890,6 +887,17 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status) + nvme_complete_rq(rq); + } + ++static size_t crc_and_copy_to_iter(const void *addr, size_t bytes, void *crcp_, ++ struct iov_iter *i) ++{ ++ u32 *crcp = crcp_; ++ size_t copied; ++ ++ 
copied = copy_to_iter(addr, bytes, i); ++ *crcp = crc32c(*crcp, addr, copied); ++ return copied; ++} ++ + static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) + { +@@ -927,8 +935,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + iov_iter_count(&req->iter)); + + if (queue->data_digest) +- ret = skb_copy_and_hash_datagram_iter(skb, *offset, +- &req->iter, recv_len, queue->rcv_hash); ++ ret = __skb_datagram_iter(skb, *offset, &req->iter, ++ recv_len, true, ++ crc_and_copy_to_iter, ++ &queue->rcv_crc); + else + ret = skb_copy_datagram_iter(skb, *offset, + &req->iter, recv_len); +@@ -946,7 +956,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + + if (!queue->data_remaining) { + if (queue->data_digest) { +- nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); ++ nvme_tcp_ddgst_final(&queue->rcv_crc, ++ &queue->exp_ddgst); + queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; + } else { + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { +@@ -1148,7 +1159,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) + return ret; + + if (queue->data_digest) +- nvme_tcp_ddgst_update(queue->snd_hash, page, ++ nvme_tcp_ddgst_update(&queue->snd_crc, page, + offset, ret); + + /* +@@ -1162,7 +1173,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) + /* fully successful last send in current PDU */ + if (last && ret == len) { + if (queue->data_digest) { +- nvme_tcp_ddgst_final(queue->snd_hash, ++ nvme_tcp_ddgst_final(&queue->snd_crc, + &req->ddgst); + req->state = NVME_TCP_SEND_DDGST; + req->offset = 0; +@@ -1195,7 +1206,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) + msg.msg_flags |= MSG_EOR; + + if (queue->hdr_digest && !req->offset) +- nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvme_tcp_hdgst(pdu, sizeof(*pdu)); + + bvec_set_virt(&bvec, (void *)pdu + req->offset, len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); +@@ -1208,7 +1219,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) + if (inline_data) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) +- crypto_ahash_init(queue->snd_hash); ++ nvme_tcp_ddgst_init(&queue->snd_crc); + } else { + nvme_tcp_done_send_req(queue); + } +@@ -1230,7 +1241,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) + int ret; + + if (queue->hdr_digest && !req->offset) +- nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvme_tcp_hdgst(pdu, sizeof(*pdu)); + + if (!req->h2cdata_left) + msg.msg_flags |= MSG_SPLICE_PAGES; +@@ -1245,7 +1256,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) + if (!len) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) +- crypto_ahash_init(queue->snd_hash); ++ nvme_tcp_ddgst_init(&queue->snd_crc); + return 1; + } + req->offset += ret; +@@ -1385,41 +1396,6 @@ static void nvme_tcp_io_work(struct work_struct *w) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + } + +-static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) +-{ +- struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); +- +- ahash_request_free(queue->rcv_hash); +- ahash_request_free(queue->snd_hash); +- crypto_free_ahash(tfm); +-} +- +-static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) +-{ +- struct crypto_ahash *tfm; +- +- tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); +- if (IS_ERR(tfm)) +- return PTR_ERR(tfm); +- +- 
queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); +- if (!queue->snd_hash) +- goto free_tfm; +- ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); +- +- queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); +- if (!queue->rcv_hash) +- goto free_snd_hash; +- ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); +- +- return 0; +-free_snd_hash: +- ahash_request_free(queue->snd_hash); +-free_tfm: +- crypto_free_ahash(tfm); +- return -ENOMEM; +-} +- + static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) + { + struct nvme_tcp_request *async = &ctrl->async_req; +@@ -1452,9 +1428,6 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) + if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + return; + +- if (queue->hdr_digest || queue->data_digest) +- nvme_tcp_free_crypto(queue); +- + page_frag_cache_drain(&queue->pf_cache); + + noreclaim_flag = memalloc_noreclaim_save(); +@@ -1865,21 +1838,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, + + queue->hdr_digest = nctrl->opts->hdr_digest; + queue->data_digest = nctrl->opts->data_digest; +- if (queue->hdr_digest || queue->data_digest) { +- ret = nvme_tcp_alloc_crypto(queue); +- if (ret) { +- dev_err(nctrl->device, +- "failed to allocate queue %d crypto\n", qid); +- goto err_sock; +- } +- } + + rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); + if (!queue->pdu) { + ret = -ENOMEM; +- goto err_crypto; ++ goto err_sock; + } + + dev_dbg(nctrl->device, "connecting queue %d\n", +@@ -1912,9 +1877,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + err_rcv_pdu: + kfree(queue->pdu); +-err_crypto: +- if (queue->hdr_digest || queue->data_digest) +- nvme_tcp_free_crypto(queue); + err_sock: + /* ->sock will be released by fput() */ + fput(queue->sock->file); +diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c +index 4f9cac8a5abe..cbedf61c8d0a 100644 +--- a/drivers/nvme/target/tcp.c ++++ b/drivers/nvme/target/tcp.c +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -18,7 +19,6 @@ + #include + #include + #include +-#include + #include + + #include "nvmet.h" +@@ -173,8 +173,8 @@ struct nvmet_tcp_queue { + /* digest state */ + bool hdr_digest; + bool data_digest; +- struct ahash_request *snd_hash; +- struct ahash_request *rcv_hash; ++ u32 snd_crc; ++ u32 rcv_crc; + + /* TLS state */ + key_serial_t tls_pskid; +@@ -295,14 +295,9 @@ static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) + return queue->data_digest ? 
NVME_TCP_DIGEST_LENGTH : 0; + } + +-static inline void nvmet_tcp_hdgst(struct ahash_request *hash, +- void *pdu, size_t len) ++static inline void nvmet_tcp_hdgst(void *pdu, size_t len) + { +- struct scatterlist sg; +- +- sg_init_one(&sg, pdu, len); +- ahash_request_set_crypt(hash, &sg, pdu + len, len); +- crypto_ahash_digest(hash); ++ put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len); + } + + static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, +@@ -319,7 +314,7 @@ static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); +- nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); ++ nvmet_tcp_hdgst(pdu, len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + pr_err("queue %d: header digest error: recv %#x expected %#x\n", +@@ -442,12 +437,20 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) + return NVME_SC_INTERNAL; + } + +-static void nvmet_tcp_calc_ddgst(struct ahash_request *hash, +- struct nvmet_tcp_cmd *cmd) ++static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd) + { +- ahash_request_set_crypt(hash, cmd->req.sg, +- (void *)&cmd->exp_ddgst, cmd->req.transfer_len); +- crypto_ahash_digest(hash); ++ size_t total_len = cmd->req.transfer_len; ++ struct scatterlist *sg = cmd->req.sg; ++ u32 crc = ~0; ++ ++ while (total_len) { ++ size_t len = min_t(size_t, total_len, sg->length); ++ ++ crc = crc32c(crc, sg_virt(sg), len); ++ total_len -= len; ++ sg = sg_next(sg); ++ } ++ cmd->exp_ddgst = cpu_to_le32(~crc); + } + + static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) +@@ -474,19 +477,18 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) + + if (queue->data_digest) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; +- nvmet_tcp_calc_ddgst(queue->snd_hash, cmd); ++ nvmet_tcp_calc_ddgst(cmd); + } + + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; +- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvmet_tcp_hdgst(pdu, sizeof(*pdu)); + } + } + + static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) + { + struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; +- struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; +@@ -504,14 +506,13 @@ static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) + pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; +- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvmet_tcp_hdgst(pdu, sizeof(*pdu)); + } + } + + static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) + { + struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; +- struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; +@@ -524,7 +525,7 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; +- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvmet_tcp_hdgst(pdu, sizeof(*pdu)); + } + } + +@@ -858,42 +859,6 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) + smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU); + } + +-static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) +-{ +- struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); +- +- ahash_request_free(queue->rcv_hash); +- ahash_request_free(queue->snd_hash); +- crypto_free_ahash(tfm); +-} +- +-static int 
nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) +-{ +- struct crypto_ahash *tfm; +- +- tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); +- if (IS_ERR(tfm)) +- return PTR_ERR(tfm); +- +- queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); +- if (!queue->snd_hash) +- goto free_tfm; +- ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); +- +- queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); +- if (!queue->rcv_hash) +- goto free_snd_hash; +- ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); +- +- return 0; +-free_snd_hash: +- ahash_request_free(queue->snd_hash); +-free_tfm: +- crypto_free_ahash(tfm); +- return -ENOMEM; +-} +- +- + static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) + { + struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; +@@ -922,11 +887,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) + + queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); + queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); +- if (queue->hdr_digest || queue->data_digest) { +- ret = nvmet_tcp_alloc_crypto(queue); +- if (ret) +- return ret; +- } + + memset(icresp, 0, sizeof(*icresp)); + icresp->hdr.type = nvme_tcp_icresp; +@@ -1247,7 +1207,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) + { + struct nvmet_tcp_queue *queue = cmd->queue; + +- nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd); ++ nvmet_tcp_calc_ddgst(cmd); + queue->offset = 0; + queue->left = NVME_TCP_DIGEST_LENGTH; + queue->rcv_state = NVMET_TCP_RECV_DDGST; +@@ -1616,8 +1576,6 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) + /* ->sock will be released by fput() */ + fput(queue->sock->file); + nvmet_tcp_free_cmds(queue); +- if (queue->hdr_digest || queue->data_digest) +- nvmet_tcp_free_crypto(queue); + ida_free(&nvmet_tcp_queue_ida, queue->idx); + page_frag_cache_drain(&queue->pf_cache); + kfree(queue); +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index bb2b751d274a..98804d51986c 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -4145,9 +4145,10 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, + } + int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, + struct msghdr *msg); +-int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, +- struct iov_iter *to, int len, +- struct ahash_request *hash); ++int __skb_datagram_iter(const struct sk_buff *skb, int offset, ++ struct iov_iter *to, int len, bool fault_short, ++ size_t (*cb)(const void *, size_t, void *, ++ struct iov_iter *), void *data); + int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, + struct iov_iter *from, int len); + int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); +diff --git a/net/core/datagram.c b/net/core/datagram.c +index f0693707aece..19304c7ce7a3 100644 +--- a/net/core/datagram.c ++++ b/net/core/datagram.c +@@ -61,7 +61,6 @@ + #include + #include + #include +-#include + + /* + * Is a socket 'connection oriented' ? 
+@@ -385,10 +384,10 @@ INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, + void *data __always_unused, + struct iov_iter *i)); + +-static int __skb_datagram_iter(const struct sk_buff *skb, int offset, +- struct iov_iter *to, int len, bool fault_short, +- size_t (*cb)(const void *, size_t, void *, +- struct iov_iter *), void *data) ++int __skb_datagram_iter(const struct sk_buff *skb, int offset, ++ struct iov_iter *to, int len, bool fault_short, ++ size_t (*cb)(const void *, size_t, void *, ++ struct iov_iter *), void *data) + { + int start = skb_headlen(skb); + int i, copy = start - offset, start_off = offset, n; +@@ -481,42 +480,7 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, + + return 0; + } +- +-static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, +- struct iov_iter *i) +-{ +-#ifdef CONFIG_CRYPTO_HASH +- struct ahash_request *hash = hashp; +- struct scatterlist sg; +- size_t copied; +- +- copied = copy_to_iter(addr, bytes, i); +- sg_init_one(&sg, addr, copied); +- ahash_request_set_crypt(hash, &sg, NULL, copied); +- crypto_ahash_update(hash); +- return copied; +-#else +- return 0; +-#endif +-} +- +-/** +- * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator +- * and update a hash. +- * @skb: buffer to copy +- * @offset: offset in the buffer to start copying from +- * @to: iovec iterator to copy to +- * @len: amount of data to copy from buffer to iovec +- * @hash: hash request to update +- */ +-int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, +- struct iov_iter *to, int len, +- struct ahash_request *hash) +-{ +- return __skb_datagram_iter(skb, offset, to, len, true, +- hash_and_copy_to_iter, hash); +-} +-EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); ++EXPORT_SYMBOL_GPL(__skb_datagram_iter); + + static size_t simple_copy_to_iter(const void *addr, size_t bytes, + void *data __always_unused, struct iov_iter *i) +diff --git a/scripts/gen-crc-consts.py b/scripts/gen-crc-consts.py +new file mode 100755 +index 000000000000..aa678a50897d +--- /dev/null ++++ b/scripts/gen-crc-consts.py +@@ -0,0 +1,238 @@ ++#!/usr/bin/env python3 ++# SPDX-License-Identifier: GPL-2.0-or-later ++# ++# Script that generates constants for computing the given CRC variant(s). ++# ++# Copyright 2025 Google LLC ++# ++# Author: Eric Biggers ++ ++import sys ++ ++# XOR (add) an iterable of polynomials. ++def xor(iterable): ++ res = 0 ++ for val in iterable: ++ res ^= val ++ return res ++ ++# Multiply two polynomials. ++def clmul(a, b): ++ return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0) ++ ++# Polynomial division floor(a / b). ++def div(a, b): ++ q = 0 ++ while a.bit_length() >= b.bit_length(): ++ q ^= 1 << (a.bit_length() - b.bit_length()) ++ a ^= b << (a.bit_length() - b.bit_length()) ++ return q ++ ++# Reduce the polynomial 'a' modulo the polynomial 'b'. ++def reduce(a, b): ++ return a ^ clmul(div(a, b), b) ++ ++# Reflect the bits of a polynomial. ++def bitreflect(poly, num_bits): ++ assert poly.bit_length() <= num_bits ++ return xor(((poly >> i) & 1) << (num_bits - 1 - i) for i in range(num_bits)) ++ ++# Format a polynomial as hex. Bit-reflect it if the CRC is lsb-first. ++def fmt_poly(variant, poly, num_bits): ++ if variant.lsb: ++ poly = bitreflect(poly, num_bits) ++ return f'0x{poly:0{2*num_bits//8}x}' ++ ++# Print a pair of 64-bit polynomial multipliers. They are always passed in the ++# order [HI64_TERMS, LO64_TERMS] but will be printed in the appropriate order. 
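The script works internally in msb-first form: bitreflect() above is used both to normalize lsb-first generators (in the CrcVariant constructor further down) and to emit lsb-first constants via fmt_poly(). A tiny standalone sanity check of that normalization (not part of the script; the reimplemented bitreflect is assumed equivalent): crc32_lsb_0xedb88320 ends up with the same internal polynomial as the conventional msb-first CRC-32 generator 0x04c11db7.

    def bitreflect(poly, num_bits):           # same semantics as the helper above
        assert poly.bit_length() <= num_bits
        res = 0
        for i in range(num_bits):
            res |= ((poly >> i) & 1) << (num_bits - 1 - i)
        return res

    lsb_generator = 0xEDB88320                # as given on the command line
    G = bitreflect(lsb_generator, 32) | (1 << 32)   # internal form incl. the x^32 term
    assert G == 0x104C11DB7                   # x^32 + x^26 + x^23 + ... + x + 1
    print(hex(G))

The constants header consumed by the glue code is presumably regenerated with an invocation along the lines of "scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320", matching the crc16_msb_0x8bb7_consts and crc32_lsb_0xedb88320_consts structs referenced from crc-t10dif-glue.c and crc32-glue.c; the exact output path is not shown in this hunk.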
++def print_mult_pair(variant, mults): ++ mults = list(mults if variant.lsb else reversed(mults)) ++ terms = ['HI64_TERMS', 'LO64_TERMS'] if variant.lsb else ['LO64_TERMS', 'HI64_TERMS'] ++ for i in range(2): ++ print(f'\t\t{fmt_poly(variant, mults[i]["val"], 64)},\t/* {terms[i]}: {mults[i]["desc"]} */') ++ ++# Pretty-print a polynomial. ++def pprint_poly(prefix, poly): ++ terms = [f'x^{i}' for i in reversed(range(poly.bit_length())) ++ if (poly & (1 << i)) != 0] ++ j = 0 ++ while j < len(terms): ++ s = prefix + terms[j] + (' +' if j < len(terms) - 1 else '') ++ j += 1 ++ while j < len(terms) and len(s) < 73: ++ s += ' ' + terms[j] + (' +' if j < len(terms) - 1 else '') ++ j += 1 ++ print(s) ++ prefix = ' * ' + (' ' * (len(prefix) - 3)) ++ ++# Print a comment describing constants generated for the given CRC variant. ++def print_header(variant, what): ++ print('/*') ++ s = f'{"least" if variant.lsb else "most"}-significant-bit-first CRC-{variant.bits}' ++ print(f' * {what} generated for {s} using') ++ pprint_poly(' * G(x) = ', variant.G) ++ print(' */') ++ ++class CrcVariant: ++ def __init__(self, bits, generator_poly, bit_order): ++ self.bits = bits ++ if bit_order not in ['lsb', 'msb']: ++ raise ValueError('Invalid value for bit_order') ++ self.lsb = bit_order == 'lsb' ++ self.name = f'crc{bits}_{bit_order}_0x{generator_poly:0{(2*bits+7)//8}x}' ++ if self.lsb: ++ generator_poly = bitreflect(generator_poly, bits) ++ self.G = generator_poly ^ (1 << bits) ++ ++# Generate tables for CRC computation using the "slice-by-N" method. ++# N=1 corresponds to the traditional byte-at-a-time table. ++def gen_slicebyN_tables(variants, n): ++ for v in variants: ++ print('') ++ print_header(v, f'Slice-by-{n} CRC table') ++ print(f'static const u{v.bits} __maybe_unused {v.name}_table[{256*n}] = {{') ++ s = '' ++ for i in range(256 * n): ++ # The i'th table entry is the CRC of the message consisting of byte ++ # i % 256 followed by i // 256 zero bytes. ++ poly = (bitreflect(i % 256, 8) if v.lsb else i % 256) << (v.bits + 8*(i//256)) ++ next_entry = fmt_poly(v, reduce(poly, v.G), v.bits) + ',' ++ if len(s + next_entry) > 71: ++ print(f'\t{s}') ++ s = '' ++ s += (' ' if s else '') + next_entry ++ if s: ++ print(f'\t{s}') ++ print('};') ++ ++# Generate constants for carryless multiplication based CRC computation. ++def gen_x86_pclmul_consts(variants): ++ # These are the distances, in bits, to generate folding constants for. ++ FOLD_DISTANCES = [2048, 1024, 512, 256, 128] ++ ++ for v in variants: ++ (G, n, lsb) = (v.G, v.bits, v.lsb) ++ print('') ++ print_header(v, 'CRC folding constants') ++ print('static const struct {') ++ if not lsb: ++ print('\tu8 bswap_mask[16];') ++ for i in FOLD_DISTANCES: ++ print(f'\tu64 fold_across_{i}_bits_consts[2];') ++ print('\tu8 shuf_table[48];') ++ print('\tu64 barrett_reduction_consts[2];') ++ print(f'}} {v.name}_consts ____cacheline_aligned __maybe_unused = {{') ++ ++ # Byte-reflection mask, needed for msb-first CRCs ++ if not lsb: ++ print('\t.bswap_mask = {' + ', '.join(str(i) for i in reversed(range(16))) + '},') ++ ++ # Fold constants for all distances down to 128 bits ++ for i in FOLD_DISTANCES: ++ print(f'\t.fold_across_{i}_bits_consts = {{') ++ # Given 64x64 => 128 bit carryless multiplication instructions, two ++ # 64-bit fold constants are needed per "fold distance" i: one for ++ # HI64_TERMS that is basically x^(i+64) mod G and one for LO64_TERMS ++ # that is basically x^i mod G. The exact values however undergo a ++ # couple adjustments, described below. 
++ mults = [] ++ for j in [64, 0]: ++ pow_of_x = i + j ++ if lsb: ++ # Each 64x64 => 128 bit carryless multiplication instruction ++ # actually generates a 127-bit product in physical bits 0 ++ # through 126, which in the lsb-first case represent the ++ # coefficients of x^1 through x^127, not x^0 through x^126. ++ # Thus in the lsb-first case, each such instruction ++ # implicitly adds an extra factor of x. The below removes a ++ # factor of x from each constant to compensate for this. ++ # For n < 64 the x could be removed from either the reduced ++ # part or unreduced part, but for n == 64 the reduced part ++ # is the only option. Just always use the reduced part. ++ pow_of_x -= 1 ++ # Make a factor of x^(64-n) be applied unreduced rather than ++ # reduced, to cause the product to use only the x^(64-n) and ++ # higher terms and always be zero in the lower terms. Usually ++ # this makes no difference as it does not affect the product's ++ # congruence class mod G and the constant remains 64-bit, but ++ # part of the final reduction from 128 bits does rely on this ++ # property when it reuses one of the constants. ++ pow_of_x -= 64 - n ++ mults.append({ 'val': reduce(1 << pow_of_x, G) << (64 - n), ++ 'desc': f'(x^{pow_of_x} mod G) * x^{64-n}' }) ++ print_mult_pair(v, mults) ++ print('\t},') ++ ++ # Shuffle table for handling 1..15 bytes at end ++ print('\t.shuf_table = {') ++ print('\t\t' + (16*'-1, ').rstrip()) ++ print('\t\t' + ''.join(f'{i:2}, ' for i in range(16)).rstrip()) ++ print('\t\t' + (16*'-1, ').rstrip()) ++ print('\t},') ++ ++ # Barrett reduction constants for reducing 128 bits to the final CRC ++ print('\t.barrett_reduction_consts = {') ++ mults = [] ++ ++ val = div(1 << (63+n), G) ++ desc = f'floor(x^{63+n} / G)' ++ if not lsb: ++ val = (val << 1) - (1 << 64) ++ desc = f'({desc} * x) - x^64' ++ mults.append({ 'val': val, 'desc': desc }) ++ ++ val = G - (1 << n) ++ desc = f'G - x^{n}' ++ if lsb and n == 64: ++ assert (val & 1) != 0 # The x^0 term should always be nonzero. ++ val >>= 1 ++ desc = f'({desc} - x^0) / x' ++ else: ++ pow_of_x = 64 - n - (1 if lsb else 0) ++ val <<= pow_of_x ++ desc = f'({desc}) * x^{pow_of_x}' ++ mults.append({ 'val': val, 'desc': desc }) ++ ++ print_mult_pair(v, mults) ++ print('\t},') ++ ++ print('};') ++ ++def parse_crc_variants(vars_string): ++ variants = [] ++ for var_string in vars_string.split(','): ++ bits, bit_order, generator_poly = var_string.split('_') ++ assert bits.startswith('crc') ++ bits = int(bits.removeprefix('crc')) ++ assert generator_poly.startswith('0x') ++ generator_poly = generator_poly.removeprefix('0x') ++ assert len(generator_poly) % 2 == 0 ++ generator_poly = int(generator_poly, 16) ++ variants.append(CrcVariant(bits, generator_poly, bit_order)) ++ return variants ++ ++if len(sys.argv) != 3: ++ sys.stderr.write(f'Usage: {sys.argv[0]} CONSTS_TYPE[,CONSTS_TYPE]... CRC_VARIANT[,CRC_VARIANT]...\n') ++ sys.stderr.write(' CONSTS_TYPE can be sliceby[1-8] or x86_pclmul\n') ++ sys.stderr.write(' CRC_VARIANT is crc${num_bits}_${bit_order}_${generator_poly_as_hex}\n') ++ sys.stderr.write(' E.g. 
crc16_msb_0x8bb7 or crc32_lsb_0xedb88320\n') ++ sys.stderr.write(' Polynomial must use the given bit_order and exclude x^{num_bits}\n') ++ sys.exit(1) ++ ++print('/* SPDX-License-Identifier: GPL-2.0-or-later */') ++print('/*') ++print(' * CRC constants generated by:') ++print(' *') ++print(f' *\t{sys.argv[0]} {" ".join(sys.argv[1:])}') ++print(' *') ++print(' * Do not edit manually.') ++print(' */') ++consts_types = sys.argv[1].split(',') ++variants = parse_crc_variants(sys.argv[2]) ++for consts_type in consts_types: ++ if consts_type.startswith('sliceby'): ++ gen_slicebyN_tables(variants, int(consts_type.removeprefix('sliceby'))) ++ elif consts_type == 'x86_pclmul': ++ gen_x86_pclmul_consts(variants) ++ else: ++ raise ValueError(f'Unknown consts_type: {consts_type}') +-- +2.49.0.634.g8613c2bb6c + diff --git a/sys-kernel/gentoo-sources-6.14/0009-zstd.patch b/sys-kernel/gentoo-sources-6.14/0009-zstd.patch new file mode 100644 index 0000000..b47cba6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/0009-zstd.patch @@ -0,0 +1,23554 @@ +From 2e674186d8b03209dca74867e2d4c885190243fe Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 22 May 2025 16:36:17 +0200 +Subject: [PATCH 9/9] zstd + +Signed-off-by: Peter Jung +--- + MAINTAINERS | 1 + + include/linux/zstd.h | 87 +- + include/linux/zstd_errors.h | 30 +- + include/linux/zstd_lib.h | 1123 ++++-- + lib/zstd/Makefile | 3 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 150 + + lib/zstd/common/bitstream.h | 155 +- + lib/zstd/common/compiler.h | 151 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 37 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 13 +- + lib/zstd/common/error_private.h | 88 +- + lib/zstd/common/fse.h | 103 +- + lib/zstd/common/fse_decompress.c | 132 +- + lib/zstd/common/huf.h | 240 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 51 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 153 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 13 +- + lib/zstd/compress/hist.h | 10 +- + lib/zstd/compress/huf_compress.c | 441 ++- + lib/zstd/compress/zstd_compress.c | 3293 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 621 +++- + lib/zstd/compress/zstd_compress_literals.c | 157 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 21 +- + lib/zstd/compress/zstd_compress_sequences.h | 16 +- + lib/zstd/compress/zstd_compress_superblock.c | 394 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 222 +- + lib/zstd/compress/zstd_double_fast.c | 245 +- + lib/zstd/compress/zstd_double_fast.h | 27 +- + lib/zstd/compress/zstd_fast.c | 703 +++- + lib/zstd/compress/zstd_fast.h | 16 +- + lib/zstd/compress/zstd_lazy.c | 840 +++-- + lib/zstd/compress/zstd_lazy.h | 195 +- + lib/zstd/compress/zstd_ldm.c | 102 +- + lib/zstd/compress/zstd_ldm.h | 17 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 571 +-- + lib/zstd/compress/zstd_opt.h | 55 +- + lib/zstd/compress/zstd_preSplit.c | 239 ++ + lib/zstd/compress/zstd_preSplit.h | 34 + + lib/zstd/decompress/huf_decompress.c | 887 +++-- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 377 +- + lib/zstd/decompress/zstd_decompress_block.c | 724 ++-- 
+ lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 19 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 75 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 61 files changed, 8755 insertions(+), 4384 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + create mode 100644 lib/zstd/compress/zstd_preSplit.c + create mode 100644 lib/zstd/compress/zstd_preSplit.h + +diff --git a/MAINTAINERS b/MAINTAINERS +index 161dd28ca25b..de1f3f463548 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -26310,6 +26310,7 @@ F: mm/zsmalloc.c + + ZSTD + M: Nick Terrell ++M: David Sterba + S: Maintained + B: https://github.com/facebook/zstd/issues + T: git https://github.com/terrelln/linux.git +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index b2c7cf310c8f..2f2a3c8b8a33 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -160,7 +160,6 @@ typedef ZSTD_parameters zstd_parameters; + zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size); + +- + /** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level + * @level: The compression level +@@ -173,9 +172,20 @@ zstd_parameters zstd_get_params(int level, + zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size); + +-/* ====== Single-pass Compression ====== */ +- + typedef ZSTD_CCtx zstd_cctx; ++typedef ZSTD_cParameter zstd_cparameter; ++ ++/** ++ * zstd_cctx_set_param() - sets a compression parameter ++ * @cctx: The context. Must have been initialized with zstd_init_cctx(). ++ * @param: The parameter to set. ++ * @value: The value to set the parameter to. ++ * ++ * Return: Zero or an error, which can be checked using zstd_is_error(). ++ */ ++size_t zstd_cctx_set_param(zstd_cctx *cctx, zstd_cparameter param, int value); ++ ++/* ====== Single-pass Compression ====== */ + + /** + * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx +@@ -190,6 +200,20 @@ typedef ZSTD_CCtx zstd_cctx; + */ + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters); + ++/** ++ * zstd_cctx_workspace_bound_with_ext_seq_prod() - max memory needed to ++ * initialize a zstd_cctx when using the block-level external sequence ++ * producer API. ++ * @parameters: The compression parameters to be used. ++ * ++ * If multiple compression parameters might be used, the caller must call ++ * this function for each set of parameters and use the maximum size. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cctx(). ++ */ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *parameters); ++ + /** + * zstd_init_cctx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. 
It must outlive +@@ -424,6 +448,16 @@ typedef ZSTD_CStream zstd_cstream; + */ + size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams); + ++/** ++ * zstd_cstream_workspace_bound_with_ext_seq_prod() - memory needed to initialize ++ * a zstd_cstream when using the block-level external sequence producer API. ++ * @cparams: The compression parameters to be used for compression. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cstream(). ++ */ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *cparams); ++ + /** + * zstd_init_cstream() - initialize a zstd streaming compression context + * @parameters The zstd parameters to use for compression. +@@ -583,6 +617,18 @@ size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, + */ + size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + ++/** ++ * zstd_register_sequence_producer() - exposes the zstd library function ++ * ZSTD_registerSequenceProducer(). This is used for the block-level external ++ * sequence producer API. See upstream zstd.h for detailed documentation. ++ */ ++typedef ZSTD_sequenceProducer_F zstd_sequence_producer_f; ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++); ++ + /** + * struct zstd_frame_params - zstd frame parameters stored in the frame header + * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not +@@ -596,7 +642,7 @@ size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + * + * See zstd_lib.h. + */ +-typedef ZSTD_frameHeader zstd_frame_header; ++typedef ZSTD_FrameHeader zstd_frame_header; + + /** + * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame +@@ -611,4 +657,35 @@ typedef ZSTD_frameHeader zstd_frame_header; + size_t zstd_get_frame_header(zstd_frame_header *params, const void *src, + size_t src_size); + ++/** ++ * struct zstd_sequence - a sequence of literals or a match ++ * ++ * @offset: The offset of the match ++ * @litLength: The literal length of the sequence ++ * @matchLength: The match length of the sequence ++ * @rep: Represents which repeat offset is used ++ */ ++typedef ZSTD_Sequence zstd_sequence; ++ ++/** ++ * zstd_compress_sequences_and_literals() - compress an array of zstd_sequence and literals ++ * ++ * @cctx: The zstd compression context. ++ * @dst: The buffer to compress the data into. ++ * @dst_capacity: The size of the destination buffer. ++ * @in_seqs: The array of zstd_sequence to compress. ++ * @in_seqs_size: The number of sequences in in_seqs. ++ * @literals: The literals associated to the sequences to be compressed. ++ * @lit_size: The size of the literals in the literals buffer. ++ * @lit_capacity: The size of the literals buffer. ++ * @decompressed_size: The size of the input data ++ * ++ * Return: The compressed size or an error, which can be checked using ++ * zstd_is_error(). 
++ */ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size); ++ + #endif /* LINUX_ZSTD_H */ +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..c307fb011132 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,13 +13,18 @@ + #define ZSTD_ERRORS_H_398273423 + + +-/*===== dependency =====*/ +-#include /* size_t */ ++/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ ++#define ZSTDERRORLIB_VISIBLE + ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif + +-/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +49,18 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_cannotProduce_uncompressedBlock = 49, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,18 +68,18 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +-/*! 
ZSTD_getErrorCode() : +- convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, +- which can be used to compare with enum list published above */ +-ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); + ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..e295d4125dde 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,47 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ +-#include /* INT_MAX */ ++ ++/* ====== Dependencies ======*/ + #include /* size_t */ + ++#include /* list of errors */ ++#if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) ++#include /* INT_MAX */ ++#endif /* ZSTD_STATIC_LINKING_ONLY */ ++ + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. ++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) || defined(__IAR_SYSTEMS_ICC__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +90,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 7 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -103,11 +128,12 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + + + /* ************************************* +-* Simple API ++* Simple Core API + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. 
+ * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -115,47 +141,55 @@ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + int compressionLevel); + + /*! ZSTD_decompress() : +- * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. +- * `dstCapacity` is an upper bound of originalSize to regenerate. +- * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. +- * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), +- * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ ++ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. ++ * Multiple compressed frames can be decompressed at once with this method. ++ * The result will be the concatenation of all decompressed frames, back to back. ++ * `dstCapacity` is an upper bound of originalSize to regenerate. ++ * First frame's decompressed size can be extracted using ZSTD_getFrameContentSize(). ++ * If maximum upper bound isn't known, prefer using streaming mode to decompress data. ++ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), ++ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + ++ ++/*====== Decompression helper functions ======*/ ++ + /*! ZSTD_getFrameContentSize() : requires v1.3.0+ +- * `src` should point to the start of a ZSTD encoded frame. +- * `srcSize` must be at least as large as the frame header. +- * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. +- * @return : - decompressed size of `src` frame content, if known +- * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined +- * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) +- * note 1 : a 0 return value means the frame is valid but "empty". +- * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. +- * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. +- * In which case, it's necessary to use streaming mode to decompress data. +- * Optionally, application can rely on some implicit limit, +- * as ZSTD_decompress() only needs an upper bound of decompressed size. +- * (For example, data could be necessarily cut into blocks <= 16 KB). +- * note 3 : decompressed size is always present when compression is completed using single-pass functions, +- * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). +- * note 4 : decompressed size can be very large (64-bits value), +- * potentially larger than what local system can handle as a single memory segment. +- * In which case, it's necessary to use streaming mode to decompress data. +- * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. +- * Always ensure return value fits within application's authorized limits. +- * Each application can set its own limits. +- * note 6 : This function replaces ZSTD_getDecompressedSize() */ ++ * `src` should point to the start of a ZSTD encoded frame. ++ * `srcSize` must be at least as large as the frame header. 
++ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. ++ * @return : - decompressed size of `src` frame content, if known ++ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined ++ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) ++ * note 1 : a 0 return value means the frame is valid but "empty". ++ * When invoking this method on a skippable frame, it will return 0. ++ * note 2 : decompressed size is an optional field, it may not be present (typically in streaming mode). ++ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * Optionally, application can rely on some implicit limit, ++ * as ZSTD_decompress() only needs an upper bound of decompressed size. ++ * (For example, data could be necessarily cut into blocks <= 16 KB). ++ * note 3 : decompressed size is always present when compression is completed using single-pass functions, ++ * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). ++ * note 4 : decompressed size can be very large (64-bits value), ++ * potentially larger than what local system can handle as a single memory segment. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. ++ * Always ensure return value fits within application's authorized limits. ++ * Each application can set its own limits. ++ * note 6 : This function replaces ZSTD_getDecompressedSize() */ + #define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) + #define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) + ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +-/*! ZSTD_getDecompressedSize() : +- * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). ++/*! ZSTD_getDecompressedSize() (obsolete): ++ * This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") + ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ +@@ -163,18 +197,50 @@ ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, +- * or an error code if input is invalid */ ++ * or an error code if input is invalid ++ * Note 1: this method is called _find*() because it's not enough to read the header, ++ * it may have to scan through the frame's content, to reach its end. ++ * Note 2: this method also works with Skippable Frames. In which case, ++ * it returns the size of the complete skippable frame, ++ * which is always equal to its content size + 8 bytes for headers. */ + ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +-/*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +-ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +-ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +-ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +-ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +-ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ ++/*====== Compression helper functions ======*/ ++ ++/*! ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()`, or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize >= ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). ++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize is too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++ ++ ++/*====== Error helper functions ======*/ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ ++ZSTDLIB_API unsigned ZSTD_isError(size_t result); /*!< tells if a `size_t` function result is an error code */ ++ZSTDLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); /* convert a result into an error code, which can be compared to error enum list */ ++ZSTDLIB_API const char* ZSTD_getErrorName(size_t result); /*!< provides readable string from a function result */ ++ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ ++ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ ++ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ + + + /* ************************************* +@@ -182,25 +248,25 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres + ***************************************/ + /*= Compression context + * When compressing many times, +- * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. 
+- * This will make workload friendlier for system's memory. ++ * it is recommended to allocate a compression context just once, ++ * and reuse it for each successive compression operation. ++ * This will make the workload easier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +- * Note 2 : In multi-threaded environments, +- * use one different context per thread for parallel execution. ++ * Note 2: For parallel execution in multi-threaded environments, ++ * use one different context per thread . + */ + typedef struct ZSTD_CCtx_s ZSTD_CCtx; + ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +-ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ ++ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* compatible with NULL pointer */ + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, +- * they will all be reset. Only `compressionLevel` remains. ++ * they will all be reset. Only @compressionLevel remains. + */ + ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -210,7 +276,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +286,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +302,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +390,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". 
*/ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximately targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,15 +482,18 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences +- * ZSTD_c_useBlockSplitter ++ * ZSTD_c_blockSplitterLevel ++ * ZSTD_c_splitAfterSequences + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -421,7 +503,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +512,12 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016, ++ ZSTD_c_experimentalParam20=1017 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +580,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +589,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. 
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +632,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +697,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. ++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +793,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. ++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +826,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. 
+- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +834,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +857,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be re-employed multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -768,16 +867,21 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * The function will update both `pos` fields. + * If `input.pos < input.size`, some input has not been consumed. + * It's up to the caller to present again remaining data. ++* + * The function tries to flush all data decoded immediately, respecting output buffer size. + * If `output.pos < output.size`, decoder has flushed everything it could. +-* But if `output.pos == output.size`, there might be some data left within internal buffers., ++* ++* However, when `output.pos == output.size`, it's more difficult to know. ++* If @return > 0, the frame is not complete, meaning ++* either there is still some data left to flush within internal buffers, ++* or there is more input to read to complete the frame (or both). + * In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. + * Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. + * @return : 0 when a frame is completely decoded and fully flushed, + * or an error code, which can be tested using ZSTD_isError(), + * or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : + * the return value is a suggested next input size (just a hint for better latency) +-* that will never request more than the remaining frame size. ++* that will never request more than the remaining content of the compressed frame. + * *******************************************************************************/ + + typedef ZSTD_DCtx ZSTD_DStream; /*< DCtx and DStream are now effectively same object (>= v1.3.0) */ +@@ -788,13 +892,38 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! 
ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder flushed internal output buffer. ++ * - `output.pos == output.size`, unflushed data potentially present in the internal buffers, ++ * check ZSTD_decompressStream() @return value, ++ * if > 0, invoke it again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1042,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1054,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1068,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". 
+- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1079,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1106,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1123,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1149,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). 
+ * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +@@ -1051,6 +1189,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); + ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); + ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + ++ + #endif /* ZSTD_H_235446 */ + + +@@ -1066,29 +1205,12 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) + #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + ++ + /* This can be overridden externally to hide static symbols. */ + #ifndef ZSTDLIB_STATIC_API + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1245,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1146,7 +1269,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + + /* Advanced parameter bounds */ +-#define ZSTD_TARGETCBLOCKSIZE_MIN 64 ++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ + #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX + #define ZSTD_SRCSIZEHINT_MIN 0 + #define ZSTD_SRCSIZEHINT_MAX INT_MAX +@@ -1188,7 +1311,7 @@ typedef struct { + * + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external +- * sequence provider's perspective. 
For example, ZSTD_compressSequences() does not ++ * sequence provider perspective. For example, ZSTD_compressSequences() does not + * use this 'rep' field at all (as of now). + */ + } ZSTD_Sequence; +@@ -1293,17 +1416,18 @@ typedef enum { + } ZSTD_literalCompressionMode_e; + + typedef enum { +- /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final +- * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable +- * or ZSTD_ps_disable allow for a force enable/disable the feature. ++ /* Note: This enum controls features which are conditionally beneficial. ++ * Zstd can take a decision on whether or not to enable the feature (ZSTD_ps_auto), ++ * but setting the switch to ZSTD_ps_enable or ZSTD_ps_disable force enable/disable the feature. + */ + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ + ZSTD_ps_enable = 1, /* Force-enable the feature */ + ZSTD_ps_disable = 2 /* Do not use the feature */ +-} ZSTD_paramSwitch_e; ++} ZSTD_ParamSwitch_e; ++#define ZSTD_paramSwitch_e ZSTD_ParamSwitch_e /* old name */ + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1345,34 +1469,130 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, + ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); + + /*! ZSTD_frameHeaderSize() : +- * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. ++ * srcSize must be large enough, aka >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_FrameType_e; ++#define ZSTD_frameType_e ZSTD_FrameType_e /* old name */ ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_FrameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; /* for ZSTD_skippableFrame, contains the skippable magic variant [0-15] */ ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_FrameHeader; ++#define ZSTD_frameHeader ZSTD_FrameHeader /* old name */ ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header into `zfhPtr`, or requires larger `srcSize`. ++ * @return : 0 => header is complete, `zfhPtr` is correctly filled, ++ * >0 => `srcSize` is too small, @return value is the wanted `srcSize` amount, `zfhPtr` is not filled, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize); ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. 
++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { +- ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ +- ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ +-} ZSTD_sequenceFormat_e; ++ ZSTD_sf_noBlockDelimiters = 0, /* ZSTD_Sequence[] has no block delimiters, just sequences */ ++ ZSTD_sf_explicitBlockDelimiters = 1 /* ZSTD_Sequence[] contains explicit block delimiters */ ++} ZSTD_SequenceFormat_e; ++#define ZSTD_sequenceFormat_e ZSTD_SequenceFormat_e /* old name */ ++ ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); + + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * WARNING: This function is meant for debugging and informational purposes ONLY! ++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). 
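The in-place layout sketched in the comment above can be exercised from user space roughly as follows. This is an illustrative sketch, not part of the patch: decompress_in_place() and its (size_t)-1 "allocation failed" convention are invented names, and it assumes an application built with ZSTD_STATIC_LINKING_ONLY so that ZSTD_decompressionMargin() is visible.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>
    #include <string.h>

    /* Decompress `src` (complete zstd frame(s), total decompressed size `dstSize`)
     * in place, using one allocation sized per the diagram above
     * (Output_Size + Margin), with the compressed input copied to its tail. */
    static size_t decompress_in_place(const void* src, size_t srcSize, size_t dstSize)
    {
        size_t const margin = ZSTD_decompressionMargin(src, srcSize);
        if (ZSTD_isError(margin)) return margin;

        size_t const bufSize = dstSize + margin;
        char* const buf = malloc(bufSize);
        ZSTD_DCtx* const dctx = ZSTD_createDCtx();
        size_t ret = (size_t)-1;                 /* illustrative "allocation failed" value */
        if (buf != NULL && dctx != NULL) {
            /* The compressed input must sit at the very end of the output buffer. */
            memcpy(buf + bufSize - srcSize, src, srcSize);
            ret = ZSTD_decompressDCtx(dctx, buf, bufSize,
                                      buf + bufSize - srcSize, srcSize);
            /* On success the first `ret` bytes of `buf` hold the regenerated data;
             * consume them here, before the buffer is released. */
        }
        ZSTD_freeDCtx(dctx);
        free(buf);
        return ret;
    }

When the decompressed size is not known up front, one way to obtain dstSize before sizing the buffer is ZSTD_getFrameContentSize() on the compressed input.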
Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsCapacity The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). + */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,8 +1608,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. +- * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals ++ * If a dictionary is included, then the cctx should reference the dict (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.). + * The entire source is compressed into a single frame. + * + * The compression behavior changes based on cctx params. In particular: +@@ -1398,11 +1620,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain +- * block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. ++ * valid block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. ++ * ++ * When ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, it's possible to decide generating repcodes ++ * using the advanced parameter ZSTD_c_repcodeResolution. Repcodes will improve compression ratio, though the benefit ++ * can vary greatly depending on Sequences. 
On the other hand, repcode resolution is an expensive operation. ++ * By default, it's disabled at low (<10) compression levels, and enabled above the threshold (>=10). ++ * ZSTD_c_repcodeResolution makes it possible to directly manage this processing in either direction. + * +- * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined +- * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for +- * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. ++ * If ZSTD_c_validateSequences == 0, this function blindly accepts the Sequences provided. Invalid Sequences cause undefined ++ * behavior. If ZSTD_c_validateSequences == 1, then the function will detect invalid Sequences (see doc/zstd_compression_format.md for ++ * specifics regarding offset/matchlength requirements) and then bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN. +@@ -1410,14 +1638,42 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset + * is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md + * +- * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. +- * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, +- * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. +- */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ * Note: Repcodes are, as of now, always re-calculated within this function, ZSTD_Sequence.rep is effectively unused. ++ * Dev Note: Once ability to ingest repcodes become available, the explicit block delims mode must respect those repcodes exactly, ++ * and cannot emit an RLE block that disagrees with the repcode history. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); ++ ++ ++/*! ZSTD_compressSequencesAndLiterals() : ++ * This is a variant of ZSTD_compressSequences() which, ++ * instead of receiving (src,srcSize) as input parameter, receives (literals,litSize), ++ * aka all the literals, already extracted and laid out into a single continuous buffer. ++ * This can be useful if the process generating the sequences also happens to generate the buffer of literals, ++ * thus skipping an extraction + caching stage. 
++ * It's a speed optimization, useful when the right conditions are met, ++ * but it also features the following limitations: ++ * - Only supports explicit delimiter mode ++ * - Currently does not support Sequences validation (so input Sequences are trusted) ++ * - Not compatible with frame checksum, which must be disabled ++ * - If any block is incompressible, will fail and return an error ++ * - @litSize must be == sum of all @.litLength fields in @inSeqs. Any discrepancy will generate an error. ++ * - @litBufCapacity is the size of the underlying buffer into which literals are written, starting at address @literals. ++ * @litBufCapacity must be at least 8 bytes larger than @litSize. ++ * - @decompressedSize must be correct, and correspond to the sum of all Sequences. Any discrepancy will generate an error. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t nbSequences, ++ const void* literals, size_t litSize, size_t litBufCapacity, ++ size_t decompressedSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1425,8 +1681,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds + * + * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, + * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. +- * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so +- * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. ++ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, ++ * so the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). +@@ -1434,26 +1690,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds + * @return : number of bytes written or a ZSTD error. + */ + ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, unsigned magicVariant); ++ const void* src, size_t srcSize, ++ unsigned magicVariant); + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves the content of a zstd skippable frame starting at @src, and writes it to @dst buffer. + * +- * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, +- * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested +- * in the magicVariant. ++ * The parameter @magicVariant will receive the magicVariant that was supplied when the frame was written, ++ * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. ++ * This can be NULL if the caller is not interested in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
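A minimal round trip through the sequence-level entry points documented above might look like the sketch below. It is illustrative only: roundtrip_sequences() is an invented helper, and it deliberately uses the deprecated, debug-only ZSTD_generateSequences() as its sequence source, which the documentation above warns against doing in production.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>

    /* Extract sequences from `src`, then feed them back through
     * ZSTD_compressSequences() in explicit block-delimiter mode,
     * with validation turned on for safety. */
    static size_t roundtrip_sequences(void* dst, size_t dstCapacity,
                                      const void* src, size_t srcSize)
    {
        ZSTD_CCtx* const cctx = ZSTD_createCCtx();
        size_t const seqCapacity = ZSTD_sequenceBound(srcSize);
        ZSTD_Sequence* const seqs = malloc(seqCapacity * sizeof(ZSTD_Sequence));
        size_t result = (size_t)-1;              /* illustrative "allocation failed" value */

        if (cctx != NULL && seqs != NULL) {
            size_t const nbSeqs = ZSTD_generateSequences(cctx, seqs, seqCapacity, src, srcSize);
            if (ZSTD_isError(nbSeqs)) {
                result = nbSeqs;                 /* propagate the error code */
            } else {
                /* The generated array contains block delimiters, so select that mode. */
                ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                                       ZSTD_sf_explicitBlockDelimiters);
                ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1);
                result = ZSTD_compressSequences(cctx, dst, dstCapacity,
                                                seqs, nbSeqs, src, srcSize);
            }
        }
        free(seqs);
        ZSTD_freeCCtx(cctx);
        return result;                           /* compressed size, or a ZSTD error code */
    }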
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, ++ const void* src, size_t srcSize); + + /*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +-ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); ++ZSTDLIB_STATIC_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + + + +@@ -1464,48 +1722,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. 
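A small usage sketch for the skippable-frame helpers declared above. Names and the fixed 1 KB buffers are illustrative; larger payloads would need appropriately sized buffers.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <string.h>

    /* Store a small metadata blob as a skippable frame (magic variant 3 is an
     * arbitrary choice in [0,15]) and read it back.
     * A skippable frame adds an 8-byte header (4-byte magic + 4-byte size). */
    static int skippable_roundtrip(const void* meta, size_t metaSize)
    {
        char frame[1024];
        char out[1024];
        unsigned variant = 0;

        size_t const fSize = ZSTD_writeSkippableFrame(frame, sizeof(frame),
                                                      meta, metaSize, 3);
        if (ZSTD_isError(fSize)) return -1;

        if (!ZSTD_isSkippableFrame(frame, fSize)) return -1;

        size_t const rSize = ZSTD_readSkippableFrame(out, sizeof(out), &variant,
                                                     frame, fSize);
        if (ZSTD_isError(rSize)) return -1;

        return (rSize == metaSize && variant == 3
                && memcmp(out, meta, metaSize) == 0) ? 0 : -1;
    }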
++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1568,7 +1837,15 @@ typedef void (*ZSTD_freeFunction) (void* opaque, void* address); + typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; + static + __attribute__((__unused__)) ++ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic push ++#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" ++#endif + ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /*< this constant defers to stdlib's functions */ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic pop ++#endif + + ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); + ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +@@ -1649,22 +1926,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! 
ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1725,7 +2025,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * See the comments on that enum for an explanation of the feature. */ + #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +-/* Controlled with ZSTD_paramSwitch_e enum. ++/* Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals +@@ -1737,11 +2037,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2103,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. 
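The new bulk setters make the migration away from the deprecated ZSTD_compress_advanced() fairly mechanical. A hedged sketch follows: compress_with_cparams() is an invented wrapper, and a complete replacement would also forward the frame parameters via ZSTD_CCtx_setFParams() or ZSTD_CCtx_setParams().

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Derive compression parameters, push them into the context with the new
     * bulk setter, load the dictionary, then compress with ZSTD_compress2(). */
    static size_t compress_with_cparams(ZSTD_CCtx* cctx,
                                        void* dst, size_t dstCapacity,
                                        const void* src, size_t srcSize,
                                        const void* dict, size_t dictSize,
                                        int level)
    {
        ZSTD_compressionParameters const cparams = ZSTD_getCParams(level, srcSize, dictSize);
        size_t ret = ZSTD_CCtx_setCParams(cctx, cparams);
        if (ZSTD_isError(ret)) return ret;
        ret = ZSTD_CCtx_loadDictionary(cctx, dict, dictSize);
        if (ZSTD_isError(ret)) return ret;
        return ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
    }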
+ * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2120,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1871,22 +2166,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + /* ZSTD_c_validateSequences + * Default is 0 == disabled. Set to 1 to enable sequence validation. + * +- * For use with sequence compression API: ZSTD_compressSequences(). +- * Designates whether or not we validate sequences provided to ZSTD_compressSequences() ++ * For use with sequence compression API: ZSTD_compressSequences*(). ++ * Designates whether or not provided sequences are validated within ZSTD_compressSequences*() + * during function execution. + * +- * Without validation, providing a sequence that does not conform to the zstd spec will cause +- * undefined behavior, and may produce a corrupted block. 
++ * When Sequence validation is disabled (default), Sequences are compressed as-is, ++ * so they must correct, otherwise it would result in a corruption error. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * Sequence validation adds some protection, by ensuring that all values respect boundary conditions. ++ * If a Sequence is detected invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. +- * + */ + #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 + +-/* ZSTD_c_useBlockSplitter +- * Controlled with ZSTD_paramSwitch_e enum. ++/* ZSTD_c_blockSplitterLevel ++ * note: this parameter only influences the first splitter stage, ++ * which is active before producing the sequences. ++ * ZSTD_c_splitAfterSequences controls the next splitter stage, ++ * which is active after sequence production. ++ * Note that both can be combined. ++ * Allowed values are between 0 and ZSTD_BLOCKSPLITTER_LEVEL_MAX included. ++ * 0 means "auto", which will select a value depending on current ZSTD_c_strategy. ++ * 1 means no splitting. ++ * Then, values from 2 to 6 are sorted in increasing cpu load order. ++ * ++ * Note that currently the first block is never split, ++ * to ensure expansion guarantees in presence of incompressible data. ++ */ ++#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6 ++#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20 ++ ++/* ZSTD_c_splitAfterSequences ++ * This is a stronger splitter algorithm, ++ * based on actual sequences previously produced by the selected parser. ++ * It's also slower, and as a consequence, mostly used for high compression levels. ++ * While the post-splitter does overlap with the pre-splitter, ++ * both can nonetheless be combined, ++ * notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX, ++ * resulting in higher compression ratio than just one of them. ++ * + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. +@@ -1894,10 +2213,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * block splitting based on the compression parameters. + */ +-#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 ++#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13 + + /* ZSTD_c_useRowMatchFinder +- * Controlled with ZSTD_paramSwitch_e enum. ++ * Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. +@@ -1928,6 +2247,80 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_ParamSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. 
++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_repcodeResolution ++ * This parameter only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters (may change in the future). ++ * ++ * This parameter affects how zstd parses external sequences, ++ * provided via the ZSTD_compressSequences*() API ++ * or from an external block-level sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets within ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences*() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level (currently: level<10 disables, level>=10 enables). 
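Plumbing-wise, all of the experimental switches documented above go through the ordinary ZSTD_CCtx_setParameter() path. The following sketch shows that plumbing with illustrative values only; the ZSTD_ps_auto defaults are usually the right choice.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Force a few of the switches documented above on one context. */
    static size_t tune_experimental_params(ZSTD_CCtx* cctx)
    {
        size_t ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_splitAfterSequences, ZSTD_ps_enable);
        if (ZSTD_isError(ret)) return ret;
        ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_prefetchCDictTables, ZSTD_ps_enable);
        if (ZSTD_isError(ret)) return ret;
        /* Cap block size at 64 KB; legal values are 1 KB .. ZSTD_BLOCKSIZE_MAX. */
        ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, 64 * 1024);
        if (ZSTD_isError(ret)) return ret;
        /* Only meaningful together with explicit block delimiters (see above). */
        return ZSTD_CCtx_setParameter(cctx, ZSTD_c_repcodeResolution, ZSTD_ps_enable);
    }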
++ */ ++#define ZSTD_c_repcodeResolution ZSTD_c_experimentalParam19 ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 /* older name */ ++ ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2477,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2530,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2565,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2602,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2620,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! 
ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2638,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2653,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2669,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2684,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2694,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
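Spelled out as real code, the substitution listed above for ZSTD_initCStream_advanced() becomes the following sketch (init_cstream_modern() is an invented name; ZSTD_CStream is a ZSTD_CCtx, so the CCtx setters apply directly):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Concrete form of the documented ZSTD_initCStream_advanced() replacement. */
    static size_t init_cstream_modern(ZSTD_CStream* zcs,
                                      const void* dict, size_t dictSize,
                                      ZSTD_parameters params,
                                      unsigned long long pledgedSrcSize)
    {
        size_t ret = ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
        if (ZSTD_isError(ret)) return ret;
        ret = ZSTD_CCtx_setParams(zcs, params);
        if (ZSTD_isError(ret)) return ret;
        ret = ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
        if (ZSTD_isError(ret)) return ret;
        return ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
    }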
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2740,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2751,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! +@@ -2339,18 +2760,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). 
The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. 
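As a sanity check of the contract above, the smallest conforming producer simply declares the whole block to be literals, which always satisfies the validity rules (sum of lengths == srcSize, final sequence with matchLength == 0 and offset == 0). It uses the ZSTD_sequenceProducer_F signature and registration entry point declared a little further down; allLiteralsProducer and register_all_literals_producer are illustrative names.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <string.h>

    /* Degenerate producer: propose no matches, emit the whole block as literals. */
    static size_t allLiteralsProducer(void* state,
                                      ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                      const void* src, size_t srcSize,
                                      const void* dict, size_t dictSize,
                                      int compressionLevel, size_t windowSize)
    {
        (void)state; (void)src; (void)dict; (void)dictSize;
        (void)compressionLevel; (void)windowSize;
        if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;
        memset(&outSeqs[0], 0, sizeof(outSeqs[0]));      /* offset == matchLength == 0 */
        outSeqs[0].litLength = (unsigned)srcSize;        /* blocks are <= ZSTD_BLOCKSIZE_MAX */
        return 1;
    }

    /* Registration is sticky until the next parameter reset of the CCtx. */
    static void register_all_literals_producer(ZSTD_CCtx* cctx)
    {
        ZSTD_registerSequenceProducer(cctx, NULL /* no producer state needed */,
                                      allLiteralsProducer);
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
    }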
++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! 
ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2963,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,39 +2984,49 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. ++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + +- It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, ++ It fills a ZSTD_FrameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. 
+ As a consequence, check that values remain within valid application range. +@@ -2428,7 +3042,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +3062,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +3085,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! 
ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +3096,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3104,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3131,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3147,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ +- +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..be218b5e0ed5 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +@@ -26,6 +26,7 @@ zstd_compress-y := \ + compress/zstd_lazy.o \ + compress/zstd_ldm.o \ + compress/zstd_opt.o \ ++ compress/zstd_preSplit.o \ + + zstd_decompress-y := \ + zstd_decompress_module.o \ +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..16c3d08e8d1a +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "compiler.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ +diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h +new file mode 100644 +index 000000000000..c5faaa3d7b08 +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,150 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++#else ++ return ZSTD_countTrailingZeros32_fallback(val); ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++#else ++ return ZSTD_countLeadingZeros32_fallback(val); ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..86439da0eea7 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,7 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ +- ++#include "bits.h" /* ZSTD_highbit32 */ + + /*========================================= + * Target specific +@@ -41,12 +42,13 @@ + /*-****************************************** + * bitStream encoding API (write forward) + ********************************************/ ++typedef size_t BitContainerType; + /* bitStream can mix input from multiple sources. + * A critical property of these streams is that they encode and decode in **reverse** direction. + * So the first bit sequence you add will be the last to be read, like a LIFO stack. + */ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitPos; + char* startPtr; + char* ptr; +@@ -54,7 +56,7 @@ typedef struct { + } BIT_CStream_t; + + MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); +-MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + +@@ -63,7 +65,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + * `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. + * + * bits are first added to a local register. +-* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems. ++* Local register is BitContainerType, 64-bits on 64-bits systems, or 32-bits on 32-bits systems. + * Writing data into memory is an explicit operation, performed by the flushBits function. + * Hence keep track how many bits are potentially stored into local register to avoid register overflow. + * After a flushBits, a maximum of 7 bits might still be stored into local register. 
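For reviewers unfamiliar with this internal API, here is a minimal usage sketch of the write-forward encoder described in the comments above. It is illustration only, not part of the applied diff: encode5bit is a hypothetical helper, it assumes bitstream.h can be included on its own, and it only exercises the declarations visible in this hunk (BIT_initCStream, BIT_addBits, BIT_flushBits, BIT_closeCStream).

    #include "bitstream.h"  /* BIT_CStream_t, BIT_addBits, BIT_flushBits */

    /* Pack an array of 5-bit symbols into dst. Bits accumulate in the local
     * register (now BitContainerType) and reach memory only on
     * BIT_flushBits() / BIT_closeCStream(). */
    static size_t encode5bit(void* dst, size_t dstCapacity,
                             const unsigned* symbols, size_t count)
    {
        BIT_CStream_t cs;
        size_t i;
        if (BIT_initCStream(&cs, dst, dstCapacity) != 0)
            return 0;                        /* dstCapacity too small: error code returned */
        for (i = 0; i < count; i++) {
            BIT_addBits(&cs, symbols[i], 5); /* up to 31 bits per call */
            BIT_flushBits(&cs);              /* at most 7 bits stay in the register */
        }
        return BIT_closeCStream(&cs);        /* 0 on overflow, else bytes written */
    }

Flushing after every symbol is the conservative pattern; callers that know their symbol widths may batch several BIT_addBits() calls between flushes, as long as the local register cannot overflow.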
+@@ -80,28 +82,28 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + * bitStream decoding API (read backward) + **********************************************/ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); +-MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); ++MEM_STATIC BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); + MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); + MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. + * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. 
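The matching read-backward loop, again as an editor-added sketch rather than patch content: decode5bit is hypothetical, assumes the data was produced by a writer like the one above, and relies on the caller knowing how many fields were encoded (the buffer-less bitstream carries no such metadata). It uses only the declarations visible in these hunks.

    #include "bitstream.h"  /* BIT_DStream_t, BIT_readBits, BIT_reloadDStream */

    /* Read back `count` 5-bit symbols. Fields come out in reverse order of
     * writing (LIFO), as the API comments above state. */
    static int decode5bit(unsigned* symbols, size_t count,
                          const void* src, size_t srcSize)
    {
        BIT_DStream_t ds;
        size_t n;
        if (BIT_initDStream(&ds, src, srcSize) != srcSize)
            return -1;                       /* error code returned instead of srcSize */
        for (n = 0; n < count; n++) {
            symbols[n] = (unsigned)BIT_readBits(&ds, 5);
            if (BIT_reloadDStream(&ds) == BIT_DStream_overflow)
                return -1;                   /* asked for more bits than the stream holds */
        }
        /* BIT_endOfDStream(&ds) may be checked here to confirm bit-exact consumption. */
        return 0;
    }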
+@@ -113,7 +115,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + /*-**************************************** + * unsafe API + ******************************************/ +-MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + /* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + + MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,16 +153,22 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getLowerBits(BitContainerType bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ + MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -195,7 +176,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ + MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +@@ -242,7 +223,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); + } + + +@@ -266,35 +247,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 
8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +284,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -318,26 +299,20 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +-#if defined(__x86_64__) || defined(_M_X86) ++#if defined(__x86_64__) || defined(_M_X64) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); + #else + return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. 
+ * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. + * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -353,14 +328,14 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U3 + + /*! BIT_lookBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +-MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) ++MEM_STATIC BitContainerType BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + { + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,23 +344,38 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { +- size_t const value = BIT_lookBits(bitD, nbBits); ++ BitContainerType const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ +-MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) ++ * unsafe version; only works if nbBits >= 1 */ ++MEM_STATIC BitContainerType BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { +- size_t const value = BIT_lookBitsFast(bitD, nbBits); ++ BitContainerType const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); + BIT_skipBits(bitD, nbBits); + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +386,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . 
+- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { +- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ ++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ ++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { ++ static const BitContainerType zeroFilled = 0; ++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ ++ /* overflow detected, erroneous scenario or end of stream: no update */ + return BIT_DStream_overflow; ++ } ++ ++ assert(bitD->ptr >= bitD->start); + + if (bitD->ptr >= bitD->limitPtr) { +- return BIT_reloadDStreamFast(bitD); ++ return BIT_reloadDStream_internal(bitD); + } + if (bitD->ptr == bitD->start) { ++ /* reached end of bitStream => no update */ + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } +- /* start < ptr < limitPtr */ ++ /* start < ptr < limitPtr => cautious update */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { +@@ -442,5 +436,4 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); + } + +- + #endif /* BITSTREAM_H_MODULE */ +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..dc9bd15e174e 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,6 +12,8 @@ + #ifndef ZSTD_COMPILER_H + #define ZSTD_COMPILER_H + ++#include ++ + #include "portability_macros.h" + + /*-******************************************************* +@@ -41,12 +44,15 @@ + */ + #define WIN_CDECL + ++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ ++#define UNUSED_ATTR __attribute__((unused)) ++ + /* + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR ++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR + /* + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers +@@ -61,11 +67,21 @@ + #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 + # define HINT_INLINE static INLINE_KEYWORD + #else +-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR ++# define HINT_INLINE FORCE_INLINE_TEMPLATE + #endif + +-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. 
*/ +-#define UNUSED_ATTR __attribute__((unused)) ++/* "soft" inline : ++ * The compiler is free to select if it's a good idea to inline or not. ++ * The main objective is to silence compiler warnings ++ * when a defined function in included but not used. ++ * ++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. ++ * Updating the prefix is probably preferable, but requires a fairly large codemod, ++ * since this name is used everywhere. ++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,16 +143,13 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ + +-/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ +- +- + /* compile time determination of SIMD support */ + + /* C-language Attributes are added in C23. 
*/ +@@ -158,9 +172,15 @@ + #define ZSTD_FALLTHROUGH fallthrough + + /*-************************************************************** +-* Alignment check ++* Alignment + *****************************************************************/ + ++/* @return 1 if @u is a 2^n value, 0 otherwise ++ * useful to check a value is valid for alignment restrictions */ ++MEM_STATIC int ZSTD_isPower2(size_t u) { ++ return (u & (u-1)) == 0; ++} ++ + /* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, +@@ -175,10 +195,95 @@ + + #endif /* ZSTD_ALIGNOF */ + ++#ifndef ZSTD_ALIGNED ++/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ ++#define ZSTD_ALIGNED(a) __attribute__((aligned(a))) ++#endif /* ZSTD_ALIGNED */ ++ ++ + /*-************************************************************** + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without triggering ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. ++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. 
*/ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..8eb6aa9a3b20 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..c8a10281f112 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -33,7 +34,6 @@ + #define DEBUG_H_12987983217 + + +- + /* static assert is triggered at compile time, leaving no runtime artefact. + * static assert only works with compile-time constants. + * Also, this variant can only be used inside a function. */ +@@ -82,20 +82,27 @@ extern int g_debuglevel; /* the variable is only declared, + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) 
do { } while (0) /* disabled */ + #endif + +- +- + #endif /* DEBUG_H_12987983217 */ +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..6c3dbad838b6 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,23 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(cannotProduce_uncompressedBlock): return "This mode cannot generate an uncompressed block"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..08ee87b68cca 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,8 +14,6 @@ + #ifndef ERROR_H_MODULE + #define ERROR_H_MODULE + +- +- + /* **************************************** + * Dependencies + ******************************************/ +@@ -23,7 +22,6 @@ + #include "debug.h" + #include "zstd_deps.h" /* size_t */ + +- + /* **************************************** + * Compiler-specific + ******************************************/ +@@ -49,8 +47,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +87,12 @@ void _force_has_format_string(const char *format, ...) { + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +103,49 @@ void _force_has_format_string(const char *format, ...) { + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) 
\ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); +- ++#define FORWARD_IF_ERROR(err, ...) \ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + #endif /* ERROR_H_MODULE */ +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..b36ce7a2a8c3 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -11,8 +12,6 @@ + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ +- +- + #ifndef FSE_H + #define FSE_H + +@@ -22,7 +21,6 @@ + ******************************************/ + #include "zstd_deps.h" /* size_t, ptrdiff_t */ + +- + /*-***************************************** + * FSE_PUBLIC_API : control library symbols visibility + ******************************************/ +@@ -50,34 +48,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. 
+-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +58,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +117,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +192,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! 
+ Tutorial : +@@ -286,13 +224,11 @@ If there is an error, the function will return an error code, which can be teste + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY +- +-/* *** Dependency *** */ + #include "bitstream.h" + +- + /* ***************************************** + * Static allocation + *******************************************/ +@@ -317,16 +253,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +270,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. 
*/ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +457,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +@@ -705,7 +623,4 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) + + #define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3) + +- + #endif /* FSE_STATIC_LINKING_ONLY */ +- +- +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..15081d8dc607 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +252,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +272,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +313,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..49736dcd8f49 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -12,105 +13,26 @@ + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ + +- + #ifndef HUF_H_298734234 + #define HUF_H_298734234 + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. 
+- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). +- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); +- ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + + +-/* *** Advanced function *** */ +- +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +73,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +128,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +142,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +153,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +185,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +193,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +236,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +252,27 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); +- +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ +- ++#endif /* HUF_H_298734234 */ +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index c22a2e69bf46..d9bd752fe17b 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0dde8bf56595..efae9465d57d 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. 
It MUST not contain any C code. + * +@@ -45,30 +46,37 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif + ++/* Compile time determination of BMI2 support */ ++ ++ + /* Enable runtime BMI2 dispatch based on the CPU. +- * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. ++ * Enabled for clang & gcc >= 11.4 on x86 when BMI2 isn't enabled by default. ++ * Disabled for gcc < 11.4 because of a segfault while compiling ++ * HUF_compress1X_usingCTable_internal_body(). + */ + #ifndef DYNAMIC_BMI2 +- #if ((defined(__clang__) && __has_attribute(__target__)) \ ++# if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ +- && (__GNUC__ >= 11))) \ +- && (defined(__x86_64__) || defined(_M_X64)) \ ++ && (__GNUC__ >= 12 || (__GNUC__ == 11 && __GNUC_MINOR__ >= 4)))) \ ++ && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)) \ + && !defined(__BMI2__) +- # define DYNAMIC_BMI2 1 +- #else +- # define DYNAMIC_BMI2 0 +- #endif ++# define DYNAMIC_BMI2 1 ++# else ++# define DYNAMIC_BMI2 0 ++# endif + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNU C compatible compilers, + * because other platforms may not support GAS assembly syntax. + * +- * Only enable assembly for Linux / MacOS, other platforms may ++ * Only enable assembly for Linux / MacOS / Win32, other platforms may + * work, but they haven't been tested. This could likely be + * extended to BSD systems. + * +@@ -90,4 +98,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..44b95b25344a 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! 
ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 2c34e8a33a1c..f931f7d0e294 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. ++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..52a79435caf6 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,12 +29,10 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 + +- + /* ---- static assert (debug) --- */ + #define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) + #define ZSTD_isError ERR_isError /* for inlining */ +@@ -83,16 +82,17 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 +-typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; ++typedef enum { set_basic, set_rle, set_compressed, set_repeat } SymbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 + + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +227,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +236,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -273,62 +268,6 @@ typedef enum { + /*-******************************************* + * Private declarations + *********************************************/ +-typedef struct seqDef_s { +- U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ +- U16 litLength; +- U16 mlBase; /* mlBase == matchLength - MINMATCH */ +-} seqDef; +- +-/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */ +-typedef enum { +- ZSTD_llt_none = 0, /* no longLengthType */ +- ZSTD_llt_literalLength = 1, /* represents a long literal */ +- ZSTD_llt_matchLength = 2 /* represents a long match */ +-} ZSTD_longLengthType_e; +- +-typedef struct { +- seqDef* sequencesStart; +- seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; +- size_t maxNbSeq; +- size_t maxNbLit; +- +- /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength +- * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment +- * the existing value of the litLength or matchLength by 0x10000. +- */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ +-} seqStore_t; +- +-typedef struct { +- U32 litLength; +- U32 matchLength; +-} ZSTD_sequenceLength; +- +-/* +- * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences +- * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. 
+- */ +-MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) +-{ +- ZSTD_sequenceLength seqLen; +- seqLen.litLength = seq->litLength; +- seqLen.matchLength = seq->mlBase + MINMATCH; +- if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { +- if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; +- } +- if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; +- } +- } +- return seqLen; +-} + + /* + * Contains the compressed frame size and an upper-bound for the decompressed frame size. +@@ -337,74 +276,11 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} +- +- + /* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; +@@ -420,13 +296,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +@@ -439,5 +315,4 @@ MEM_STATIC int ZSTD_cpuSupportsBmi2(void) + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); + } + +- + #endif /* ZSTD_CCOMMON_H_MODULE */ +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..44a3c10becf2 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* 
round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. 
+- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). 
++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) 
return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart 
= (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new 
huffman table */ +@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index 16bb995bc6c4..c41a747413e0 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,13 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" ++#include "../common/error_private.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +29,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -44,7 +47,7 @@ + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. +- * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3, ++ * The actual (selected) size of the hash table is then stored in ZSTD_MatchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ + #ifndef ZSTD_HASHLOG3_MAX +@@ -55,14 +58,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -75,12 +81,12 @@ struct ZSTD_CDict_s { + ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ + U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + ZSTD_cwksp workspace; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + ZSTD_compressedBlockState_t cBlockState; + ZSTD_customMem customMem; + U32 dictID; + int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ +- ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use ++ ZSTD_ParamSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. + */ +@@ -130,11 +136,12 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; + +- /* statically sized space. 
entropyWorkspace never moves (but prev/next block swap places) */ +- if (!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; ++ /* statically sized space. tmpWorkspace never moves (but prev/next block swap places) */ ++ if (!ZSTD_cwksp_check_available(&cctx->workspace, TMP_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); +- cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE); ++ cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); ++ cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; + } +@@ -168,15 +175,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -205,7 +210,7 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) + } + + /* private API call, for dictBuilder only */ +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + + /* Returns true if the strategy supports using a row based matchfinder */ + static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { +@@ -215,32 +220,27 @@ static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + /* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. + */ +-static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { ++static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); + } + + /* Returns row matchfinder usage given an initial mode and cParams */ +-static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +-#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) +- int const kHasSIMD128 = 1; +-#else +- int const kHasSIMD128 = 0; +-#endif ++ /* The Linux Kernel does not use SIMD, and 128KB is a very common size, e.g. in BtrFS. ++ * The row match finder is slower for this size without SIMD, so disable it. 
++ */ ++ const unsigned kWindowLogLowerBound = 17; + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; +- if (kHasSIMD128) { +- if (cParams->windowLog > 14) mode = ZSTD_ps_enable; +- } else { +- if (cParams->windowLog > 17) mode = ZSTD_ps_enable; +- } ++ if (cParams->windowLog > kWindowLogLowerBound) mode = ZSTD_ps_enable; + return mode; + } + + /* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +-static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; +@@ -248,7 +248,7 @@ static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, + + /* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ + static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. +@@ -257,16 +257,44 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ +-static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveEnableLdm(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. */ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_ParamSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_ParamSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. 
*/ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -282,8 +310,12 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); + assert(cctxParams.ldmParams.hashRateLog < 32); + } +- cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); ++ cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +361,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -343,10 +378,13 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + */ + cctxParams->compressionLevel = compressionLevel; + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); +- cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); ++ cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", +- cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); ++ cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm); + } + + size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) +@@ -359,7 +397,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. 
++ * @param params Validated zstd parameters. + */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +493,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -534,11 +572,16 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + ++ case ZSTD_c_blockSplitterLevel: ++ bounds.lowerBound = 0; ++ bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX; ++ return bounds; ++ + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; +@@ -549,6 +592,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_repcodeResolution: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +630,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -584,6 +648,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: ++ case ZSTD_c_blockSplitterLevel: + return 1; + + case ZSTD_c_format: +@@ -610,9 +675,13 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_repcodeResolution: + default: + return 0; + } +@@ -625,7 +694,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -665,9 +734,14 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case 
ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: ++ case ZSTD_c_blockSplitterLevel: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_repcodeResolution: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +797,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +815,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +829,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { +- const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ const ZSTD_ParamSwitch_e lcm = (ZSTD_ParamSwitch_e)value; ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +863,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : +- CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); ++ CCtxParams->ldmParams.enableLdm = (ZSTD_ParamSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- 
CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -843,28 +920,55 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_blockDelimiters: + BOUNDCHECK(ZSTD_c_blockDelimiters, value); +- CCtxParams->blockDelimiters = (ZSTD_sequenceFormat_e)value; ++ CCtxParams->blockDelimiters = (ZSTD_SequenceFormat_e)value; + return CCtxParams->blockDelimiters; + + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; ++ ++ case ZSTD_c_splitAfterSequences: ++ BOUNDCHECK(ZSTD_c_splitAfterSequences, value); ++ CCtxParams->postBlockSplitter = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->postBlockSplitter; + +- case ZSTD_c_useBlockSplitter: +- BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +- CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value; +- return CCtxParams->useBlockSplitter; ++ case ZSTD_c_blockSplitterLevel: ++ BOUNDCHECK(ZSTD_c_blockSplitterLevel, value); ++ CCtxParams->preBlockSplitter_level = value; ++ return (size_t)CCtxParams->preBlockSplitter_level; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); +- CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value; ++ CCtxParams->useRowMatchFinder = (ZSTD_ParamSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ assert(value>=0); ++ CCtxParams->maxBlockSize = (size_t)value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_repcodeResolution: ++ 
BOUNDCHECK(ZSTD_c_repcodeResolution, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -881,7 +985,7 @@ size_t ZSTD_CCtxParams_getParameter( + switch(param) + { + case ZSTD_c_format : +- *value = CCtxParams->format; ++ *value = (int)CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; +@@ -896,16 +1000,16 @@ size_t ZSTD_CCtxParams_getParameter( + *value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : +- *value = CCtxParams->cParams.searchLog; ++ *value = (int)CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : +- *value = CCtxParams->cParams.minMatch; ++ *value = (int)CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : +- *value = CCtxParams->cParams.targetLength; ++ *value = (int)CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : +- *value = (unsigned)CCtxParams->cParams.strategy; ++ *value = (int)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; +@@ -920,10 +1024,10 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : +- *value = CCtxParams->attachDictPref; ++ *value = (int)CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : +- *value = CCtxParams->literalCompressionMode; ++ *value = (int)CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : + assert(CCtxParams->nbWorkers == 0); +@@ -939,19 +1043,19 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->enableDedicatedDictSearch; + break; + case ZSTD_c_enableLongDistanceMatching : +- *value = CCtxParams->ldmParams.enableLdm; ++ *value = (int)CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : +- *value = CCtxParams->ldmParams.hashLog; ++ *value = (int)CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : +- *value = CCtxParams->ldmParams.minMatchLength; ++ *value = (int)CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : +- *value = CCtxParams->ldmParams.bucketSizeLog; ++ *value = (int)CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : +- *value = CCtxParams->ldmParams.hashRateLog; ++ *value = (int)CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; +@@ -971,8 +1075,11 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_validateSequences : + *value = (int)CCtxParams->validateSequences; + break; +- case ZSTD_c_useBlockSplitter : +- *value = (int)CCtxParams->useBlockSplitter; ++ case ZSTD_c_splitAfterSequences : ++ *value = (int)CCtxParams->postBlockSplitter; ++ break; ++ case ZSTD_c_blockSplitterLevel : ++ *value = CCtxParams->preBlockSplitter_level; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; +@@ -980,6 +1087,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_repcodeResolution: ++ *value = 
(int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1125,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, (int)cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, (int)cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, (int)cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, (int)cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, (int)cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, (int)cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, (int)cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1181,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1196,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1217,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,7 +1310,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't reset parameters only when not in init stage."); ++ "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } +@@ -1168,7 +1329,7 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) + BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog); + BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch); + BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength); +- BOUNDCHECK(ZSTD_c_strategy, cParams.strategy); ++ BOUNDCHECK(ZSTD_c_strategy, (int)cParams.strategy); + return 0; + } + +@@ -1178,11 +1339,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) + static ZSTD_compressionParameters + ZSTD_clampCParams(ZSTD_compressionParameters cParams) + { +-# define CLAMP_TYPE(cParam, val, type) { \ +- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ +- if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ +- else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ +- } ++# define CLAMP_TYPE(cParam, val, type) \ ++ do { \ ++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ ++ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ ++ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ ++ } while (0) + # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); +@@
-1240,19 +1402,62 @@ static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. +- * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`. ++ * `mode` is the mode for parameter adjustment. See docs for `ZSTD_CParamMode_e`. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ + static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_CParamMode_e mode, ++ ZSTD_ParamSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1486,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1505,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. 
++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,11 +1551,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); ++static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + static void ZSTD_overrideCParams( + ZSTD_compressionParameters* cParams, +@@ -1330,24 +1571,25 @@ static void ZSTD_overrideCParams( + } + + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { +- srcSizeHint = CCtxParams->srcSizeHint; ++ assert(CCtxParams->srcSizeHint>=0); ++ srcSizeHint = (U64)CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t + ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, +- const ZSTD_paramSwitch_e useRowMatchFinder, +- const U32 enableDedicatedDictSearch, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, ++ const int enableDedicatedDictSearch, + const U32 forCCtx) + { + /* chain table size should be 0 for fast or row-hash strategies */ +@@ -1363,14 +1605,14 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + hSize * sizeof(U32) + + h3Size * sizeof(U32); + size_t const optPotentialSpace = +- 
ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) +- + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); ++ ZSTD_cwksp_aligned64_alloc_size((MaxML+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((MaxLL+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((MaxOff+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((1<<Litbits) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)) ++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); + size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned64_alloc_size(hSize) : 0; size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) ? optPotentialSpace +@@ -1386,30 +1628,38 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, + const int isStatic, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) +- + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) ++ + ZSTD_cwksp_aligned64_alloc_size(maxNbSeq * sizeof(SeqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +- size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); ++ size_t const tmpWorkSpace = ZSTD_cwksp_alloc_size(TMP_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); + + size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); + size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? +- ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; ++ ZSTD_cwksp_aligned64_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + + + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) +@@ -1417,15 +1667,21 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ?
ZSTD_cwksp_aligned64_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + +- entropySpace + ++ tmpWorkSpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1435,7 +1691,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + { + ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, + &cParams); + + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); +@@ -1443,7 +1699,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,18 +1749,18 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? ((size_t)1 << cParams.windowLog) + blockSize + : 0; + size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams); ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams); + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + } + +@@ -1600,7 +1856,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) + * Invalidate all the matches in the match finder tables. + * Requires nextSrc and base to be set (can be NULL).
+ */ +-static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) ++static void ZSTD_invalidateMatchState(ZSTD_MatchState_t* ms) + { + ZSTD_window_clear(&ms->window); + +@@ -1637,12 +1893,25 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_MatchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t +-ZSTD_reset_matchState(ZSTD_matchState_t* ms, ++ZSTD_reset_matchState(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + const ZSTD_compressionParameters* cParams, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const ZSTD_compResetPolicy_e crp, + const ZSTD_indexResetPolicy_e forceResetIndex, + const ZSTD_resetTarget_e forWho) +@@ -1664,6 +1933,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,22 +1955,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp_clean_tables(ws); + } + +- /* opt parser space */ +- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { +- DEBUGLOG(4, "reserving optimal parser space"); +- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); +- ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); +- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); +- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); +- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); +- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); +- } +- + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned64(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +@@ -1709,6 +1976,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + } + ++ /* opt parser space */ ++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { ++ DEBUGLOG(4, "reserving optimal parser space"); ++ ms->opt.litFreq =
(unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (1<<Litbits) * sizeof(unsigned)); ++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxLL+1) * sizeof(unsigned)); ++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ } ++ + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1754,7 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + { + ZSTD_cwksp* const ws = &zc->workspace; + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d", +- (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter); ++ (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + + zc->isFirstBlock = 1; +@@ -1766,8 +2044,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + params = &zc->appliedParams; + + assert(params->useRowMatchFinder != ZSTD_ps_auto); +- assert(params->useBlockSplitter != ZSTD_ps_auto); ++ assert(params->postBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams); +@@ -1776,9 +2055,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ?
ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,8 +2073,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); +- int resizeWorkspace; ++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + +@@ -1805,7 +2082,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); +- resizeWorkspace = workspaceTooSmall || workspaceWasteful; ++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + +@@ -1823,21 +2100,23 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + DEBUGLOG(5, "reserving object space"); + /* Statically sized space. +- * entropyWorkspace never moves, ++ * tmpWorkspace never moves, + * though prev/next block swap places */ + assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t))); + zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); + zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); +- zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE); +- RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); ++ zc->tmpWorkspace = ZSTD_cwksp_reserve_object(ws, TMP_WORKSPACE_SIZE); ++ RETURN_ERROR_IF(zc->tmpWorkspace == NULL, memory_allocation, "couldn't allocate tmpWorkspace"); ++ zc->tmpWkspSize = TMP_WORKSPACE_SIZE; + } } + + ZSTD_cwksp_clear(ws); + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1845,7 +2124,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->appliedParams.fParams.contentSizeFlag = 0; + DEBUGLOG(4, "pledged content size : %u ; flag : %u", + (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); +- zc->blockSize = blockSize; ++ zc->blockSizeMax = blockSize; + + xxh64_reset(&zc->xxhState, 0); + zc->stage = ZSTDcs_init; +@@ -1854,13 +2133,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ &params->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (SeqDef*)ZSTD_cwksp_reserve_aligned64(ws, maxNbSeq * sizeof(SeqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { ++ /* TODO: avoid memset?
*/ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned64(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned64(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (ZSTD_hasExtSeqProd(params)) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->extSeqBufCapacity = maxNbExternalSeq; ++ zc->extSeqBuf = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned64(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2195,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- &params->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? */ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2269,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize, +@@ -2019,6 +2309,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present.
++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,26 +2360,29 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + + /* Zero the hashTable3, since the cdict never fills it */ +- { int const h3log = cctx->blockState.matchState.hashLog3; ++ assert(cctx->blockState.matchState.hashLog3 <= 31); ++ { U32 const h3log = cctx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + assert(cdict->matchState.hashLog3 == 0); + ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); +@@ -2082,8 +2391,8 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + ZSTD_cwksp_mark_tables_clean(&cctx->workspace); + + /* copy dictionary offsets */ +- { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; +- ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; ++ { ZSTD_MatchState_t const* srcMatchState = &cdict->matchState; ++ ZSTD_MatchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2141,12 +2450,13 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + /* Copy only compression parameters related to tables. 
*/ + params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); +- assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto); ++ assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; +- params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; ++ params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2166,7 +2476,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; +- int const h3log = srcCCtx->blockState.matchState.hashLog3; ++ U32 const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, +@@ -2184,8 +2494,8 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + + /* copy dictionary offsets */ + { +- const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; +- ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; ++ const ZSTD_MatchState_t* srcMatchState = &srcCCtx->blockState.matchState; ++ ZSTD_MatchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2234,7 +2544,7 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ +- assert(size < (1U<<31)); /* can be casted to int */ ++ assert(size < (1U<<31)); /* can be cast to int */ + + + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { +@@ -2267,7 +2577,7 @@ static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const + + /*!
ZSTD_reduceIndex() : + * rescale all indexes to avoid future overflow (indexes are U32) */ +-static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) ++static void ZSTD_reduceIndex (ZSTD_MatchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) + { + { U32 const hSize = (U32)1 << params->cParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); +@@ -2294,26 +2604,32 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr) + { +- const seqDef* const sequences = seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; + BYTE* const ofCodeTable = seqStorePtr->ofCode; + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2333,9 +2649,9 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) + * Returns 1 if true, 0 otherwise. */ + static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) + { +- DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter); +- assert(cctxParams->useBlockSplitter != ZSTD_ps_auto); +- return (cctxParams->useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter); ++ assert(cctxParams->postBlockSplitter != ZSTD_ps_auto); ++ return (cctxParams->postBlockSplitter == ZSTD_ps_enable); + } + + /* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types +@@ -2347,6 +2663,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2674,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const SeqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2694,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2392,7 +2711,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, ++ CTable_LitLength, LLFSELog, (SymbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, +@@ -2413,7 +2732,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ +- ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; ++ ZSTD_DefaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, +@@ -2424,7 +2743,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, ++ CTable_OffsetBits, OffFSELog, (SymbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, +@@ -2454,7 +2773,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, ++ CTable_MatchLength, MLFSELog, (SymbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, +@@ -2480,22 +2799,23 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; +- const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2823,28 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
+ + /* Compress literals */ +- { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ { size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ +- unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); +- size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ int const suspectUncompressible = (numSequences == 0) || (litSize / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2870,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2882,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2597,104 +2916,146 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + return (size_t)(op - ostart); + } + +-MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++static size_t ++ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ size_t blockSize, ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( +- seqStorePtr, prevEntropy, nextEntropy, cctxParams, + dst, dstCapacity, ++ literals, litSize, ++ seqStorePtr, prevEntropy, nextEntropy, cctxParams, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (blockSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ +- { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); ++ { size_t const maxCSize = blockSize - ZSTD_minGain(blockSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + ++static size_t ++ZSTD_entropyCompressSeqStore( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) ++{ ++ return ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ dst, dstCapacity, ++ seqStorePtr->litStart, (size_t)(seqStorePtr->lit - seqStorePtr->litStart), ++ srcSize, ++ seqStorePtr, ++ prevEntropy, nextEntropy, ++ cctxParams, ++ entropyWorkspace, entropyWkspSize, ++ bmi2); ++} ++ + /* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) + { +- static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { ++ static const ZSTD_BlockCompressor_f blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ 
ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, + NULL } + }; +- ZSTD_blockCompressor selectedCompressor; ++ ZSTD_BlockCompressor_f selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); +- DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); ++ DEBUGLOG(5, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { +- static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ static const ZSTD_BlockCompressor_f rowBasedBlockCompressors[4][3] = { ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; +- DEBUGLOG(4, "Selecting a row-based matchfinder"); ++ DEBUGLOG(5, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { +@@ -2704,30 +3065,126 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + return selectedCompressor; + } + +-static void 
ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, ++static void ZSTD_storeLastLiterals(SeqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) + { + ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr) ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr) + { + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthType = ZSTD_llt_none; + } + +-typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ ++/* ++ * Function to validate sequences produced by a block compressor. ++ */ ++static void ZSTD_validateSeqStore(const SeqStore_t* seqStore, const ZSTD_compressionParameters* cParams) ++{ ++#if DEBUGLEVEL >= 1 ++ const SeqDef* seq = seqStore->sequencesStart; ++ const SeqDef* const seqEnd = seqStore->sequences; ++ size_t const matchLenLowerBound = cParams->minMatch == 3 ? 
3 : 4; ++ for (; seq < seqEnd; ++seq) { ++ const ZSTD_SequenceLength seqLength = ZSTD_getSequenceLength(seqStore, seq); ++ assert(seqLength.matchLength >= matchLenLowerBound); ++ (void)seqLength; ++ (void)matchLenLowerBound; ++ } ++#else ++ (void)seqStore; ++ (void)cParams; ++#endif ++} ++ ++static size_t ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_BuildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + { +- ZSTD_matchState_t* const ms = &zc->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3220,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2772,7 +3238,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { +- rawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ RawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; +@@ -2788,42 +3262,116 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_SequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_transferSequences_wBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_BlockCompressor_f const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } ++ ZSTD_validateSeqStore(&zc->seqStore, &zc->appliedParams.cParams); + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const SeqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; ++ const SeqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = (size_t)(seqStore->sequences - inSeqs); ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ Repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; +- +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. 
++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,46 +3380,75 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. */ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) + { + const size_t dstCapacity = ZSTD_compressBound(srcSize); +- void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); ++ void* dst; /* Make C90 happy. 
*/ + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + ++ dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + + seqCollector.collectSequences = 1; +@@ -2880,8 +3457,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); ++ } ++ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3491,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2930,7 +3509,7 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +-static int ZSTD_maybeRLE(seqStore_t const* seqStore) ++static int ZSTD_maybeRLE(SeqStore_t const* seqStore) + { + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); +@@ -2938,7 +3517,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,12 +3526,14 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? 
+ lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); +- DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); ++ DEBUGLOG(5, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); + } + + /* ZSTD_buildBlockEntropyStats_literals() : +@@ -2959,13 +3541,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3558,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3575,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3655,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3668,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3103,9 +3695,9 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); +- fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; +- fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; +- fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; ++ fseMetadata->llType = (SymbolEncodingType_e) stats.LLtype; ++ fseMetadata->ofType = (SymbolEncodingType_e) stats.Offtype; ++ fseMetadata->mlType = (SymbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; + } +@@ -3114,23 +3706,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3740,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3767,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(SymbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,116 +3805,121 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const 
ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(SeqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->tmpWorkspace, zc->tmpWkspSize), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->tmpWorkspace, zc->tmpWkspSize, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const SeqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const SeqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + + /* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. 
+ */ +-static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, +- const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++static void ZSTD_deriveSeqStoreChunk(SeqStore_t* resultSeqStore, ++ const SeqStore_t* originalSeqStore, ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ +@@ -3328,13 +3932,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3945,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). 
++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3980,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(Repcodes_t* const dRepcodes, Repcodes_t* const cRepcodes, ++ const SeqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { +- seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ SeqDef* const seq = seqStore->sequencesStart + idx; ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +4016,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. 
+ */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, +- repcodes_t* const dRep, repcodes_t* const cRep, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const SeqStore_t* const seqStore, ++ Repcodes_t* const dRep, Repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3417,7 +4030,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ +- repcodes_t const dRepOriginal = *dRep; ++ Repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); +@@ -3428,7 +4041,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + +@@ -3442,8 +4055,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3451,18 +4065,18 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) +@@ -3481,45 +4095,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. 
++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, +- ZSTD_CCtx* zc, const seqStore_t* origSeqStore) ++ ZSTD_CCtx* zc, const SeqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ SeqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ SeqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ SeqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4145,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end 
+ } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). ++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4172,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ SeqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ SeqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3577,36 +4201,37 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
+ */ +- repcodes_t dRep; +- repcodes_t cRep; +- ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); ++ Repcodes_t dRep; ++ Repcodes_t cRep; ++ ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memset(nextSeqStore, 0, sizeof(SeqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSizeMax + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4246,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,12 +4255,12 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSizeMax + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ +- ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(Repcodes_t)); + return cSize; + } + +@@ -3643,21 +4269,20 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +- assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock"); ++ assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); +@@ -3673,9 +4298,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4312,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3702,7 +4331,7 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + &zc->appliedParams, + dst, dstCapacity, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && +@@ -3767,10 +4396,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. 
+ */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4408,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3807,7 +4437,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + return cSize; + } + +-static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, ++static void ZSTD_overflowCorrectIfNeeded(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, +@@ -3831,39 +4461,82 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + } + } + ++#include "zstd_preSplit.h" ++ ++static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings) ++{ ++ /* split level based on compression strategy, from `fast` to `btultra2` */ ++ static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 }; ++ /* note: conservatively only split full blocks (128 KB) currently. ++ * While it's possible to go lower, let's keep it simple for a first implementation. ++ * Besides, benefits of splitting are reduced when blocks are already small. ++ */ ++ if (srcSize < 128 KB || blockSizeMax < 128 KB) ++ return MIN(srcSize, blockSizeMax); ++ /* do not split incompressible data though: ++ * require verified savings to allow pre-splitting. ++ * Note: as a consequence, the first full block is not split. ++ */ ++ if (savings < 3) { ++ DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings); ++ return 128 KB; ++ } ++ /* apply @splitLevel, or use default value (which depends on @strat). ++ * note that splitting heuristic is still conditioned by @savings >= 3, ++ * so the first block will not reach this code path */ ++ if (splitLevel == 1) return 128 KB; ++ if (splitLevel == 0) { ++ assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2); ++ splitLevel = splitLevels[strat]; ++ } else { ++ assert(2 <= splitLevel && splitLevel <= 6); ++ splitLevel -= 2; ++ } ++ return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize); ++} ++ + /*! ZSTD_compress_frameChunk() : + * Compress a chunk of data into one or multiple blocks. + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. 
+ * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastFrameChunk) + { +- size_t blockSize = cctx->blockSize; ++ size_t blockSizeMax = cctx->blockSizeMax; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; ++ S64 savings = (S64)cctx->consumedSrcSize - (S64)cctx->producedCSize; + + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + +- DEBUGLOG(4, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); ++ DEBUGLOG(5, "ZSTD_compress_frameChunk (srcSize=%u, blockSizeMax=%u)", (unsigned)srcSize, (unsigned)blockSizeMax); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + xxh64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; +- U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); +- +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; ++ size_t const blockSize = ZSTD_optimalBlockSize(cctx, ++ ip, remaining, ++ blockSizeMax, ++ cctx->appliedParams.preBlockSplitter_level, ++ cctx->appliedParams.cParams.strategy, ++ savings); ++ U32 const lastBlock = lastFrameChunk & (blockSize == remaining); ++ assert(blockSize <= remaining); ++ ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); +- if (remaining < blockSize) blockSize = remaining; + + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); +@@ -3899,8 +4572,23 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } +- ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ ++ ++ /* @savings is employed to ensure that splitting doesn't worsen expansion of incompressible data. ++ * Without splitting, the maximum expansion is 3 bytes per full block. ++ * An adversarial input could attempt to fudge the split detector, ++ * and make it split incompressible data, resulting in more block headers. ++ * Note that, since ZSTD_COMPRESSBOUND() assumes a worst case scenario of 1KB per block, ++ * and the splitter never creates blocks that small (current lower limit is 8 KB), ++ * there is already no risk to expand beyond ZSTD_COMPRESSBOUND() limit. ++ * But if the goal is to not expand by more than 3-bytes per 128 KB full block, ++ * then yes, it becomes possible to make the block splitter oversplit incompressible data. ++ * Using @savings, we enforce an even more conservative condition, ++ * requiring the presence of enough savings (at least 3 bytes) to authorize splitting, ++ * otherwise only full blocks are used. 
++ * But being conservative is fine, ++ * since splitting barely compressible blocks is not fruitful anyway */ ++ savings += (S64)blockSize - (S64)cSize; + + ip += blockSize; + assert(remaining >= blockSize); +@@ -3919,8 +4607,10 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + + + static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, +- const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) +-{ BYTE* const op = (BYTE*)dst; ++ const ZSTD_CCtx_params* params, ++ U64 pledgedSrcSize, U32 dictID) ++{ ++ BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; +@@ -4001,19 +4691,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4022,7 +4708,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + const void* src, size_t srcSize, + U32 frame, U32 lastFrameChunk) + { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; + size_t fhSize = 0; + + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", +@@ -4057,7 +4743,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + src, (BYTE const*)src + srcSize); + } + +- DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize); ++ DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSizeMax); + { size_t const cSize = frame ? 
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); +@@ -4078,58 +4764,90 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! 
ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +-static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, +- ldmState_t* ls, +- ZSTD_cwksp* ws, +- ZSTD_CCtx_params const* params, +- const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++static size_t ++ZSTD_loadDictionaryContent(ZSTD_MatchState_t* ms, ++ ldmState_t* ls, ++ ZSTD_cwksp* ws, ++ ZSTD_CCtx_params const* params, ++ const void* src, size_t srcSize, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4856,59 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. */ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: Trigger loadLdmDict"); + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: ZSTD_ldm_fillHashTable completes"); ++ } ++ ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ { U32 maxDictSize = 1U << MIN(MAX(params->cParams.hashLog + 3, params->cParams.chainLog + 1), 31); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } + } + ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4916,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4925,24 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); ++ DEBUGLOG(4, "Fill %u bytes into the Binary Tree", (unsigned)srcSize); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + default: +@@ -4233,20 +4985,19 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, +- dictEnd-dictPtr, &hasZeroWeights); ++ (size_t)(dictEnd-dictPtr), &hasZeroWeights); + + /* We only set the loaded table as valid if it contains all non-zero + * weights. 
Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + + { unsigned offcodeLog; +- size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); ++ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* fill all offset symbols to avoid garbage at end of table */ +@@ -4261,7 +5012,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; +- size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4275,7 +5026,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; +- size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4309,7 +5060,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + +- return dictPtr - (const BYTE*)dict; ++ return (size_t)(dictPtr - (const BYTE*)dict); + } + + /* Dictionary format : +@@ -4322,11 +5073,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + * dictSize supposed >= 8 + */ + static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +5097,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4354,13 +5106,14 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + * @return : dictID, or an error code */ + static size_t + ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +- 
ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + const ZSTD_CCtx_params* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +5126,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5140,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5180,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->tmpWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5225,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5237,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5256,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5272,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4528,7 +5288,7 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ +- return op-ostart; ++ return (size_t)(op-ostart); + } + + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +@@ -4537,9 +5297,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5323,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5359,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, 
dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5477,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4719,14 +5487,16 @@ static size_t ZSTD_initCDict_internal( + return 0; + } + +-static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, +- ZSTD_compressionParameters cParams, +- ZSTD_paramSwitch_e useRowMatchFinder, +- U32 enableDedicatedDictSearch, +- ZSTD_customMem customMem) ++static ZSTD_CDict* ++ZSTD_createCDict_advanced_internal(size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_compressionParameters cParams, ++ ZSTD_ParamSwitch_e useRowMatchFinder, ++ int enableDedicatedDictSearch, ++ ZSTD_customMem customMem) + { + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; ++ DEBUGLOG(3, "ZSTD_createCDict_advanced_internal (dictSize=%u)", (unsigned)dictSize); + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + +@@ -4763,6 +5533,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + { + ZSTD_CCtx_params cctxParams; + ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + ZSTD_CCtxParams_init(&cctxParams, 0); + cctxParams.cParams = cParams; + cctxParams.customMem = customMem; +@@ -4783,7 +5554,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + ZSTD_compressionParameters cParams; + ZSTD_CDict* cdict; + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2, mode %u", (unsigned)dictContentType); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + if (cctxParams.enableDedicatedDictSearch) { +@@ -4802,7 +5573,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2: DedicatedDictSearch=%u", cctxParams.enableDedicatedDictSearch); + cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + +@@ -4810,10 +5581,8 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); +- if (!cdict) +- return NULL; + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4867,7 +5636,7 @@ size_t ZSTD_freeCDict(ZSTD_CDict* cdict) + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level +- * into its relevants cParams. ++ * into its relevant cParams. 
+ * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. +@@ -4879,7 +5648,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) + { +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +@@ -4890,6 +5659,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_CDict* cdict; + ZSTD_CCtx_params params; + ++ DEBUGLOG(4, "ZSTD_initStaticCDict (dictSize==%u)", (unsigned)dictSize); + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { +@@ -4900,14 +5670,13 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + +- DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", +- (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + ZSTD_CCtxParams_init(¶ms, 0); + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4987,12 +5756,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +@@ -5002,7 +5776,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! 
ZSTD_compress_usingCDict_advanced(): +@@ -5068,7 +5842,7 @@ size_t ZSTD_CStreamOutSize(void) + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; + } + +-static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) ++static ZSTD_CParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) + { + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; +@@ -5199,30 +5973,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSizeMax - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSizeMax; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5231,8 +6016,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5243,12 +6030,13 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + + case zcss_load: + if ( (flushMode == ZSTD_e_end) +- && ( (size_t)(oend-op) >= ZSTD_compressBound(iend-ip) /* Enough output space */ ++ && ( (size_t)(oend-op) >= ZSTD_compressBound((size_t)(iend-ip)) /* Enough output space */ + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, +- op, oend-op, ip, iend-ip); ++ size_t const cSize = ZSTD_compressEnd_public(zcs, ++ op, (size_t)(oend-op), ++ ip, (size_t)(iend-ip)); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); + ip = iend; +@@ -5262,10 +6050,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, +- ip, iend-ip); ++ ip, (size_t)(iend-ip)); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5276,16 +6063,29 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSizeMax) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); + { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); + void* cDst; + size_t cSize; +- size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t oSize = (size_t)(oend-op); ++ size_t const iSize = inputBuffered ? 
zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSizeMax); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5293,34 +6093,31 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ +- zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; ++ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSizeMax; + if (zcs->inBuffTarget > zcs->inBuffSize) +- zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; ++ zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSizeMax; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5369,8 +6166,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + } + } + +- input->pos = ip - istart; +- output->pos = op - ostart; ++ input->pos = (size_t)(ip - istart); ++ output->pos = (size_t)(op - ostart); + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); + } +@@ -5390,8 +6187,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. 
+ */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5410,22 +6209,27 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + ++/* ++ * If @endOp == ZSTD_e_end, @inSize becomes pledgedSrcSize. ++ * Otherwise, it's ignored. ++ * @return: 0 on success, or a ZSTD_error code otherwise. ++ */ + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ +@@ -5438,21 +6242,24 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } +- DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ DEBUGLOG(4, "ZSTD_CCtx_init_compressStream2 : transparent init stage"); ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); +- ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); ++ ZSTD_CParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } + +- params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); ++ params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5468,7 +6275,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + /* for small input: avoid automatic flush on reaching end of block, since + * it would require to add a 3-bytes null block to end frame + */ +- cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize); ++ cctx->inBuffTarget = cctx->blockSizeMax + (cctx->blockSizeMax == pledgedSrcSize); + } else { + cctx->inBuffTarget = 0; + } +@@ -5479,6 +6286,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5493,8 +6302,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait 
for the first block of flush() order, for better parameters adaptation */ ++ return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5512,13 +6340,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5541,6 +6376,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5551,64 +6387,67 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : +- * @offCode : is presumed to follow format required by ZSTD_storeSeq() ++ * @offBase : must use the format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offBase, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offBase > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++/* This function scans through an array of ZSTD_Sequence, ++ * storing the sequences it reads, until it reaches a block delimiter. ++ * Note that the block delimiter includes the last literals of the block. ++ * @blockSize must be == sum(sequence_lengths). ++ * @returns @blockSize on success, and a ZSTD_error otherwise. + */ + static size_t +-ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +- ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; +- repcodes_t updatedRepcodes; ++ Repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_transferSequences_wBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5616,27 +6455,60 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + } else { + dictSize = 0; + } +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = 
ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, ++ seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ++ ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found."); ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + if (inSeqs[idx].litLength) { + DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); +@@ -5644,37 +6516,43 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; +- return 0; ++ return blockSize; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. ++/* ++ * This function attempts to scan through @blockSize bytes in @src ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. + * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. ++ * Occasionally, we may want to reduce the actual number of bytes consumed from @src ++ * to avoid splitting a match, notably if it would produce a match smaller than MINMATCH. 
+ * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ * @returns the number of bytes consumed from @src, necessarily <= @blockSize. ++ * Otherwise, it may return a ZSTD error if something went wrong. + */ + static size_t +-ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ZSTD_transferSequences_noDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; + U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; + size_t dictSize; +- BYTE const* ip = (BYTE const*)(src); +- BYTE const* iend = ip + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ +- repcodes_t updatedRepcodes; ++ const BYTE* const istart = (const BYTE*)(src); ++ const BYTE* ip = istart; ++ const BYTE* iend = istart + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ ++ Repcodes_t updatedRepcodes; + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5682,15 +6560,15 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_transferSequences_noDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { + const ZSTD_Sequence currSeq = inSeqs[idx]; + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5704,7 +6582,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. 
So, we have to split the sequence */ +@@ -5744,58 +6621,113 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); + seqPos->idx = idx; + seqPos->posInSequence = endPosInSequence; +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + iend -= bytesAdjustment; + if (ip != iend) { + /* Store any last literals */ +- U32 lastLLSize = (U32)(iend - ip); ++ U32 const lastLLSize = (U32)(iend - ip); + assert(ip <= iend); + DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize); + seqPos->posInSrc += lastLLSize; + } + +- return bytesAdjustment; ++ return (size_t)(iend-istart); + } + +-typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); +-static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) ++/* @seqPos represents a position within @inSeqs, ++ * it is read and updated by this function, ++ * once the goal to produce a block of size @blockSize is reached. ++ * @return: nb of bytes consumed from @src, necessarily <= @blockSize. 
++ */ ++typedef size_t (*ZSTD_SequenceCopier_f)(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++static ZSTD_SequenceCopier_f ZSTD_selectSequenceCopier(ZSTD_SequenceFormat_e mode) + { +- ZSTD_sequenceCopier sequenceCopier = NULL; +- assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, (int)mode)); + if (mode == ZSTD_sf_explicitBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreExplicitBlockDelim; +- } else if (mode == ZSTD_sf_noBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreNoBlockDelim; ++ return ZSTD_transferSequences_wBlockDelim; ++ } ++ assert(mode == ZSTD_sf_noBlockDelimiters); ++ return ZSTD_transferSequences_noDelim; ++} ++ ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; + } +- assert(sequenceCopier != NULL); +- return sequenceCopier; ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; + } + +-/* Compress, block-by-block, all of the sequences given. ++static size_t determine_blockSize(ZSTD_SequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ ZSTD_SequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) { ++ /* Note: more a "target" block size */ ++ return MIN(remaining, blockSize); ++ } ++ assert(mode == ZSTD_sf_explicitBlockDelimiters); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ ++/* Compress all provided sequences, block-by-block. + * + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error. 
+@@ -5807,15 +6739,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; +- ZSTD_sequencePosition seqPos = {0, 0, 0}; ++ ZSTD_SequencePosition seqPos = {0, 0, 0}; + +- BYTE const* ip = (BYTE const*)src; ++ const BYTE* ip = (BYTE const*)src; + BYTE* op = (BYTE*)dst; +- ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); ++ ZSTD_SequenceCopier_f const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + + DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); + /* Special case: empty frame */ +@@ -5829,22 +6758,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; +- size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSizeMax, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); +- FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); +- blockSize -= additionalByteAdjustment; ++ blockSize = sequenceCopier(cctx, ++ &seqPos, inSeqs, inSeqsSize, ++ ip, blockSize, ++ cctx->appliedParams.searchForExternalRepcodes); ++ FORWARD_IF_ERROR(blockSize, "Bad sequence copy"); + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. 
We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small (%zu): data remains uncompressed: cSize=%zu", blockSize, cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5853,35 +6789,36 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + blockSize, +- cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { +- /* We don't want to emit our first block as a RLE even if it qualifies because +- * doing so will cause the decoder (cli only) to throw a "should consume all input error." +- * This is only an issue for zstd <= v1.4.3 +- */ ++ ZSTD_isRLE(ip, blockSize)) { ++ /* Note: don't emit the first block as RLE even if it qualifies because ++ * doing so will cause the decoder (cli <= v1.4.3 only) to throw an (invalid) error ++ * "should consume all input error." 
++ */ + compressedSeqsSize = 1; + } + + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5893,11 +6830,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5908,41 +6844,50 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { + BYTE* op = (BYTE*)dst; + size_t cSize = 0; +- size_t compressedBlocksSize = 0; +- size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (nbSeqs=%zu,dstCapacity=%zu)", inSeqsSize, dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); ++ + /* Begin writing output, starting with frame header */ +- frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID); +- op += frameHeaderSize; +- dstCapacity -= frameHeaderSize; +- cSize += frameHeaderSize; ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, srcSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } + if (cctx->appliedParams.fParams.checksumFlag && srcSize) { + xxh64_update(&cctx->xxhState, src, srcSize); + } +- /* cSize includes block header size and compressed sequences size */ +- compressedBlocksSize = ZSTD_compressSequences_internal(cctx, ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequences_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + src, srcSize); +- FORWARD_IF_ERROR(compressedBlocksSize, "Compressing blocks failed!"); +- cSize += compressedBlocksSize; +- dstCapacity -= compressedBlocksSize; ++ 
FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } ++ ++ /* Complete with frame checksum, if needed */ + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) xxh64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); +@@ -5951,26 +6896,557 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); ++ return cSize; ++} ++ ++ ++#if defined(__AVX2__) ++ ++#include <immintrin.h> /* AVX2 intrinsics */ ++ ++/* ++ * Convert 2 sequences per iteration, using AVX2 intrinsics: ++ * - offset -> offBase = offset + 2 ++ * - litLength -> (U16) litLength ++ * - matchLength -> (U16)(matchLength - 3) ++ * - rep is ignored ++ * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). ++ * ++ * At the end, instead of extracting two __m128i, ++ * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, ++ * then store the lower 16 bytes in one go. ++ * ++ * @returns 0 on success, with no long length detected ++ * @returns > 0 if there is one long length (> 65535), ++ * indicating the position, and type. ++ */ ++static size_t convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ /* ++ * addition: ++ * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) ++ */ ++ const __m256i addition = _mm256_setr_epi32( ++ ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ ++ ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ ++ ); ++ ++ /* limit: check if there is a long length */ ++ const __m256i limit = _mm256_set1_epi32(65535); ++ ++ /* ++ * shuffle mask for byte-level rearrangement in each 128-bit half: ++ * ++ * Input layout (after addition) per 128-bit half: ++ * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] ++ * We only need: ++ * offBase (4 bytes) = offset+2 ++ * litLength (2 bytes) = low 2 bytes of litLength ++ * mlBase (2 bytes) = low 2 bytes of (matchLength) ++ * => Bytes [0..3, 4..5, 8..9], zero the rest. ++ */ ++ const __m256i mask = _mm256_setr_epi8( ++ /* For the lower 128 bits => sequence i */ ++ 0, 1, 2, 3, /* offset+2 */ ++ 4, 5, /* litLength (16 bits) */ ++ 8, 9, /* matchLength (16 bits) */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ ++ /* For the upper 128 bits => sequence i+1 */ ++ 16,17,18,19, /* offset+2 */ ++ 20,21, /* litLength */ ++ 24,25, /* matchLength */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 ++ ); ++ ++ /* ++ * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). ++ * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. ++ * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1.
++ */ ++#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ ++ ++ size_t longLen = 0, i = 0; ++ ++ /* AVX permutation depends on the specific definition of target structures */ ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); ++ ++ /* Process 2 sequences per loop iteration */ ++ for (; i + 1 < nbSequences; i += 2) { ++ /* Load 2 ZSTD_Sequence (32 bytes) */ ++ __m256i vin = _mm256_loadu_si256((const __m256i*)(const void*)&inSeqs[i]); ++ ++ /* Add {2, 0, -3, 0} in each 128-bit half */ ++ __m256i vadd = _mm256_add_epi32(vin, addition); ++ ++ /* Check for long length */ ++ __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ ++ int ll_res = _mm256_movemask_epi8(ll_cmp); ++ ++ /* Shuffle bytes so each half gives us the 8 bytes we need */ ++ __m256i vshf = _mm256_shuffle_epi8(vadd, mask); ++ /* ++ * Now: ++ * Lane0 = seq0's 8 bytes ++ * Lane1 = 0 ++ * Lane2 = seq1's 8 bytes ++ * Lane3 = 0 ++ */ ++ ++ /* Permute 64-bit lanes => move Lane2 down into Lane1. */ ++ __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); ++ /* ++ * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. ++ * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. ++ */ ++ ++ /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ ++ _mm_storeu_si128((__m128i *)(void*)&dstSeqs[i], _mm256_castsi256_si128(vperm)); ++ /* ++ * This writes out 16 bytes total: ++ * - offset 0..7 => seq0 (offBase, litLength, mlBase) ++ * - offset 8..15 => seq1 (offBase, litLength, mlBase) ++ */ ++ ++ /* check (unlikely) long lengths > 65535 ++ * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] ++ * => combined mask = 0x0FF00FF0 ++ */ ++ if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { ++ /* long length detected: let's figure out which one*/ ++ if (inSeqs[i].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (inSeqs[i].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ if (inSeqs[i+1].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1 + 1; ++ } ++ if (inSeqs[i+1].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + 1 + nbSequences + 1; ++ } ++ } ++ } ++ ++ /* Handle leftover if @nbSequences is odd */ ++ if (i < nbSequences) { ++ /* process last sequence */ ++ assert(i == nbSequences - 1); ++ dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); ++ dstSeqs[i].litLength = (U16)inSeqs[i].litLength; ++ dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); ++ /* check (unlikely) long lengths > 65535 */ ++ if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (UNLIKELY(inSeqs[i].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ } ++ ++ return longLen; ++} ++ ++/* the vector implementation could also be ported to SSSE3, ++ * but since this implementation is targeting modern systems (>= Sapphire Rapid), ++ * it's not useful to develop and maintain code for older pre-AVX2 platforms */ ++ ++#else /* no AVX2 */ ++ ++static size_t 
convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ size_t longLen = 0; ++ size_t n; ++ for (n=0; n<nbSequences; n++) { ++ dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset); ++ dstSeqs[n].litLength = (U16)inSeqs[n].litLength; ++ dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); ++ /* check for long length > 65535 */ ++ if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = n + 1; ++ } ++ if (UNLIKELY(inSeqs[n].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = n + nbSequences + 1; ++ } ++ } ++ return longLen; ++} ++ ++#endif ++ ++/* ++ * Precondition: Sequences must end on an explicit Block Delimiter ++ * @return: 0 on success, or an error code. ++ * Note: Sequence validation functionality has been disabled (removed). ++ * This is helpful to generate a lean main pipeline, improving performance. ++ * It may be re-inserted later. ++ */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int repcodeResolution) ++{ ++ Repcodes_t updatedRepcodes; ++ size_t seqNb = 0; ++ ++ DEBUGLOG(5, "ZSTD_convertBlockSequences (nbSequences = %zu)", nbSequences); ++ ++ RETURN_ERROR_IF(nbSequences >= cctx->seqStore.maxNbSeq, externalSequences_invalid, ++ "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); ++ ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ++ /* check end condition */ ++ assert(nbSequences >= 1); ++ assert(inSeqs[nbSequences-1].matchLength == 0); ++ assert(inSeqs[nbSequences-1].offset == 0); ++ ++ /* Convert Sequences from public format to internal format */ ++ if (!repcodeResolution) { ++ size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); ++ cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; ++ if (longl) { ++ DEBUGLOG(5, "long length"); ++ assert(cctx->seqStore.longLengthType == ZSTD_llt_none); ++ if (longl <= nbSequences-1) { ++ DEBUGLOG(5, "long match length detected at pos %zu", longl-1); ++ cctx->seqStore.longLengthType = ZSTD_llt_matchLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-1); ++ } else { ++ DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); ++ assert(longl <= 2* (nbSequences-1)); ++ cctx->seqStore.longLengthType = ZSTD_llt_literalLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); ++ } ++ } ++ } else { ++ for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { ++ U32 const litLength = inSeqs[seqNb].litLength; ++ U32 const matchLength = inSeqs[seqNb].matchLength; ++ U32 const ll0 = (litLength == 0); ++ U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ if (!repcodeResolution && nbSequences > 1) { ++ U32* const rep = updatedRepcodes.rep; ++ ++ if (nbSequences >= 4) { ++ U32 lastSeqIdx = (U32)nbSequences - 2; /* index of last full sequence */ ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (nbSequences == 3) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[0].offset; ++ rep[0] = inSeqs[1].offset; ++ } else { ++ assert(nbSequences == 2); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[0].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep,
sizeof(Repcodes_t)); ++ ++ return 0; ++} ++ ++#if defined(ZSTD_ARCH_X86_AVX2) ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t i; ++ __m256i const zeroVec = _mm256_setzero_si256(); ++ __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ ++ ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ ++ size_t mSum = 0, lSum = 0; ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ++ /* Process 2 structs (32 bytes) at a time */ ++ for (i = 0; i + 2 <= nbSeqs; i += 2) { ++ /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ ++ __m256i data = _mm256_loadu_si256((const __m256i*)(const void*)&seqs[i]); ++ /* check end of block signal */ ++ __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); ++ int cmp_res = _mm256_movemask_epi8(cmp); ++ /* indices for match lengths correspond to bits [8..11], [24..27] ++ * => combined mask = 0x0F000F00 */ ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ if (cmp_res & 0x0F000F00) break; ++ /* Accumulate in sumVec */ ++ sumVec = _mm256_add_epi32(sumVec, data); ++ } ++ ++ /* Horizontal reduction */ ++ _mm256_store_si256((__m256i*)tmp, sumVec); ++ lSum = tmp[1] + tmp[5]; ++ mSum = tmp[2] + tmp[6]; ++ ++ /* Handle the leftover */ ++ for (; i < nbSeqs; i++) { ++ lSum += seqs[i].litLength; ++ mSum += seqs[i].matchLength; ++ if (seqs[i].matchLength == 0) break; /* end of block */ ++ } ++ ++ if (i==nbSeqs) { ++ /* reaching end of sequences: end of block signal was not present */ ++ BlockSummary bs; ++ bs.nbSequences = ERROR(externalSequences_invalid); ++ return bs; ++ } ++ { BlockSummary bs; ++ bs.nbSequences = i+1; ++ bs.blockSize = lSum + mSum; ++ bs.litSize = lSum; ++ return bs; ++ } ++} ++ ++#else ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t totalMatchSize = 0; ++ size_t litSize = 0; ++ size_t n; ++ assert(seqs); ++ for (n=0; n<nbSeqs; n++) { ++ totalMatchSize += seqs[n].matchLength; ++ litSize += seqs[n].litLength; ++ if (seqs[n].matchLength == 0) { ++ assert(seqs[n].offset == 0); ++ break; ++ } ++ } ++ if (n==nbSeqs) { ++ /* reaching end of sequences: end of block signal was not present */ ++ BlockSummary bs; ++ bs.nbSequences = ERROR(externalSequences_invalid); ++ return bs; ++ } ++ { BlockSummary bs; ++ bs.nbSequences = n+1; ++ bs.blockSize = litSize + totalMatchSize; ++ bs.litSize = litSize; ++ return bs; ++ } ++} ++ ++#endif ++ ++static size_t ++ZSTD_compressSequencesAndLiterals_internal(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t nbSequences, ++ const void* literals, size_t litSize, size_t srcSize) ++{ ++ size_t remaining = srcSize; ++ size_t cSize = 0; ++ BYTE* op = (BYTE*)dst; ++ int const repcodeResolution = (cctx->appliedParams.searchForExternalRepcodes == ZSTD_ps_enable); ++ assert(cctx->appliedParams.searchForExternalRepcodes != ZSTD_ps_auto); ++ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals_internal: nbSeqs=%zu, litSize=%zu", nbSequences, litSize); ++ RETURN_ERROR_IF(nbSequences == 0, externalSequences_invalid, "Requires at least 1 end-of-block"); ++ ++ /* Special case: empty frame */ ++ if ((nbSequences == 1) && (inSeqs[0].litLength == 0)) { ++ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "No room for empty frame block header"); ++ MEM_writeLE24(op, cBlockHeader24); ++ op += ZSTD_blockHeaderSize; ++ dstCapacity -= ZSTD_blockHeaderSize; ++ cSize += ZSTD_blockHeaderSize; ++ } ++ ++ while (nbSequences) { ++ size_t compressedSeqsSize, cBlockSize, conversionStatus; ++ BlockSummary const block = ZSTD_get1BlockSummary(inSeqs, nbSequences); ++ U32 const lastBlock = (block.nbSequences == nbSequences); ++ FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); ++ assert(block.nbSequences <= nbSequences); ++ RETURN_ERROR_IF(block.litSize > litSize, externalSequences_invalid, "discrepancy: Sequences require more literals than present in buffer"); ++ ZSTD_resetSeqStore(&cctx->seqStore); ++ ++ conversionStatus = ZSTD_convertBlockSequences(cctx, ++ inSeqs, block.nbSequences, ++ repcodeResolution); ++ FORWARD_IF_ERROR(conversionStatus, "Bad sequence conversion"); ++ inSeqs += block.nbSequences; ++ nbSequences -= block.nbSequences; ++ remaining -= block.blockSize; ++ ++ /* Note: when blockSize is very
small, other variant send it uncompressed. ++ * Here, we still send the sequences, because we don't have the original source to send it uncompressed. ++ * One could imagine in theory reproducing the source from the sequences, ++ * but that's complex and costly memory intensive, and goes against the objectives of this variant. */ ++ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); ++ ++ compressedSeqsSize = ZSTD_entropyCompressSeqStore_internal( ++ op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, ++ literals, block.litSize, ++ &cctx->seqStore, ++ &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, ++ &cctx->appliedParams, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, ++ cctx->bmi2); ++ FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); ++ /* note: the spec forbids for any compressed block to be larger than maximum block size */ ++ if (compressedSeqsSize > cctx->blockSizeMax) compressedSeqsSize = 0; ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); ++ litSize -= block.litSize; ++ literals = (const char*)literals + block.litSize; ++ ++ /* Note: difficult to check source for RLE block when only Literals are provided, ++ * but it could be considered from analyzing the sequence directly */ ++ ++ if (compressedSeqsSize == 0) { ++ /* Sending uncompressed blocks is out of reach, because the source is not provided. ++ * In theory, one could use the sequences to regenerate the source, like a decompressor, ++ * but it's complex, and memory hungry, killing the purpose of this variant. ++ * Current outcome: generate an error code. 
++ */ ++ RETURN_ERROR(cannotProduce_uncompressedBlock, "ZSTD_compressSequencesAndLiterals cannot generate an uncompressed block"); ++ } else { ++ U32 cBlockHeader; ++ assert(compressedSeqsSize > 1); /* no RLE */ ++ /* Error checking and repcodes update */ ++ ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); ++ if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) ++ cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; ++ ++ /* Write block header into beginning of block*/ ++ cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); ++ MEM_writeLE24(op, cBlockHeader); ++ cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); ++ } ++ ++ cSize += cBlockSize; ++ op += cBlockSize; ++ dstCapacity -= cBlockSize; ++ cctx->isFirstBlock = 0; ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); ++ ++ if (lastBlock) { ++ assert(nbSequences == 0); ++ break; ++ } ++ } ++ ++ RETURN_ERROR_IF(litSize != 0, externalSequences_invalid, "literals must be entirely and exactly consumed"); ++ RETURN_ERROR_IF(remaining != 0, externalSequences_invalid, "Sequences must represent a total of exactly srcSize=%zu", srcSize); ++ DEBUGLOG(4, "cSize final total: %zu", cSize); ++ return cSize; ++} ++ ++size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* literals, size_t litSize, size_t litCapacity, ++ size_t decompressedSize) ++{ ++ BYTE* op = (BYTE*)dst; ++ size_t cSize = 0; ++ ++ /* Transparent initialization stage, same as compressStream2() */ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals (dstCapacity=%zu)", dstCapacity); ++ assert(cctx != NULL); ++ if (litCapacity < litSize) { ++ RETURN_ERROR(workSpace_tooSmall, "literals buffer is not large enough: must be at least 8 bytes larger than litSize (risk of read out-of-bound)"); ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, decompressedSize), "CCtx initialization failed"); ++ ++ if (cctx->appliedParams.blockDelimiters == ZSTD_sf_noBlockDelimiters) { ++ RETURN_ERROR(frameParameter_unsupported, "This mode is only compatible with explicit delimiters"); ++ } ++ if (cctx->appliedParams.validateSequences) { ++ RETURN_ERROR(parameter_unsupported, "This mode is not compatible with Sequence validation"); ++ } ++ if (cctx->appliedParams.fParams.checksumFlag) { ++ RETURN_ERROR(frameParameter_unsupported, "this mode is not compatible with frame checksum"); ++ } ++ ++ /* Begin writing output, starting with frame header */ ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, decompressedSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequencesAndLiterals_internal(cctx, ++ op, dstCapacity, ++ inSeqs, inSeqsSize, ++ literals, litSize, decompressedSize); ++ FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } ++ ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const 
ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + +- + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6046,7 +7522,7 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + } + } + +-static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + switch (mode) { + case ZSTD_cpm_unknown: +@@ -6070,8 +7546,8 @@ static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMo + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. +- * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. */ +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_CParamMode_e`. */ ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); +@@ -6092,7 +7568,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6109,7 +7585,9 @@ ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long l + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). 
+ * Fields of `ZSTD_frameParameters` are set to default values */ +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) { ++static ZSTD_parameters ++ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) ++{ + ZSTD_parameters params; + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); + DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); +@@ -6123,7 +7601,34 @@ static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned lo + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +-ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { ++ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) ++{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..b10978385876 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,7 +21,8 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" +- ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ ++#include "zstd_preSplit.h" /* ZSTD_SLIPBLOCK_WORKSPACESIZE */ + + /*-************************************* + * Constants +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. ++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. 
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -75,6 +77,70 @@ typedef struct { + ZSTD_fseCTables_t fse; + } ZSTD_entropyCTables_t; + ++/* ********************************************* ++* Sequences * ++***********************************************/ ++typedef struct SeqDef_s { ++ U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ ++ U16 litLength; ++ U16 mlBase; /* mlBase == matchLength - MINMATCH */ ++} SeqDef; ++ ++/* Controls whether seqStore has a single "long" litLength or matchLength. See SeqStore_t. */ ++typedef enum { ++ ZSTD_llt_none = 0, /* no longLengthType */ ++ ZSTD_llt_literalLength = 1, /* represents a long literal */ ++ ZSTD_llt_matchLength = 2 /* represents a long match */ ++} ZSTD_longLengthType_e; ++ ++typedef struct { ++ SeqDef* sequencesStart; ++ SeqDef* sequences; /* ptr to end of sequences */ ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; ++ size_t maxNbSeq; ++ size_t maxNbLit; ++ ++ /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength ++ * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment ++ * the existing value of the litLength or matchLength by 0x10000. ++ */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++} SeqStore_t; ++ ++typedef struct { ++ U32 litLength; ++ U32 matchLength; ++} ZSTD_SequenceLength; ++ ++/* ++ * Returns the ZSTD_SequenceLength for the given sequences. It handles the decoding of long sequences ++ * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. ++ */ ++MEM_STATIC ZSTD_SequenceLength ZSTD_getSequenceLength(SeqStore_t const* seqStore, SeqDef const* seq) ++{ ++ ZSTD_SequenceLength seqLen; ++ seqLen.litLength = seq->litLength; ++ seqLen.matchLength = seq->mlBase + MINMATCH; ++ if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { ++ if (seqStore->longLengthType == ZSTD_llt_literalLength) { ++ seqLen.litLength += 0x10000; ++ } ++ if (seqStore->longLengthType == ZSTD_llt_matchLength) { ++ seqLen.matchLength += 0x10000; ++ } ++ } ++ return seqLen; ++} ++ ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ ++ ++ + /* ********************************************* + * Entropy buffer statistics structs and funcs * + ***********************************************/ +@@ -84,7 +150,7 @@ typedef struct { + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ + typedef struct { +- symbolEncodingType_e hType; ++ SymbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; + } ZSTD_hufCTablesMetadata_t; +@@ -95,9 +161,9 @@ typedef struct { + * fseTablesSize refers to the size of fse tables in bytes. 
+ * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ + typedef struct { +- symbolEncodingType_e llType; +- symbolEncodingType_e ofType; +- symbolEncodingType_e mlType; ++ SymbolEncodingType_e llType; ++ SymbolEncodingType_e ofType; ++ SymbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +@@ -111,12 +177,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -140,28 +207,29 @@ typedef struct { + stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */ + size_t size; /* The number of sequences. <= capacity. */ + size_t capacity; /* The capacity starting from `seq` pointer */ +-} rawSeqStore_t; ++} RawSeqStore_t; + +-UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; ++UNUSED_ATTR static const RawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -173,7 +241,7 @@ typedef struct { + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + } optState_t; + + typedef struct { +@@ -195,11 +263,11 @@ typedef struct { + 
+ #define ZSTD_WINDOW_START_INDEX 2 + +-typedef struct ZSTD_matchState_t ZSTD_matchState_t; ++typedef struct ZSTD_MatchState_t ZSTD_MatchState_t; + + #define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + +-struct ZSTD_matchState_t { ++struct ZSTD_MatchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary, within context's referential. + * When loadedDictEnd != 0, a dictionary is in use, and still valid. +@@ -212,28 +280,42 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; + U32* chainTable; + +- U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ ++ int forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + + int dedicatedDictSearch; /* Indicates whether this matchState is using the + * dedicated dictionary search structure. + */ + optState_t opt; /* optimal parser state */ +- const ZSTD_matchState_t* dictMatchState; ++ const ZSTD_MatchState_t* dictMatchState; + ZSTD_compressionParameters cParams; +- const rawSeqStore_t* ldmSeqStore; ++ const RawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + } ZSTD_blockState_t; + + typedef struct { +@@ -260,7 +342,7 @@ typedef struct { + } ldmState_t; + + typedef struct { +- ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */ ++ ZSTD_ParamSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. 
ZSTD_ps_auto by default */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ +@@ -291,7 +373,7 @@ struct ZSTD_CCtx_params_s { + * There is no guarantee that hint is close to actual source size */ + + ZSTD_dictAttachPref_e attachDictPref; +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + + /* Multithreading: used to pass parameters to mtctx */ + int nbWorkers; +@@ -310,24 +392,54 @@ struct ZSTD_CCtx_params_s { + ZSTD_bufferMode_e outBufferMode; + + /* Sequence compression API */ +- ZSTD_sequenceFormat_e blockDelimiters; ++ ZSTD_SequenceFormat_e blockDelimiters; + int validateSequences; + +- /* Block splitting */ +- ZSTD_paramSwitch_e useBlockSplitter; ++ /* Block splitting ++ * @postBlockSplitter executes split analysis after sequences are produced, ++ * it's more accurate but consumes more resources. ++ * @preBlockSplitter_level splits before knowing sequences, ++ * it's more approximative but also cheaper. ++ * Valid @preBlockSplitter_level values range from 0 to 6 (included). ++ * 0 means auto, 1 means do not split, ++ * then levels are sorted in increasing cpu budget, from 2 (fastest) to 6 (slowest). ++ * Highest @preBlockSplitter_level combines well with @postBlockSplitter. ++ */ ++ ZSTD_ParamSwitch_e postBlockSplitter; ++ int preBlockSplitter_level; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; + + /* Param for deciding whether to use row-based matchfinder */ +- ZSTD_paramSwitch_e useRowMatchFinder; ++ ZSTD_ParamSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_ParamSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. 
*/ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_ParamSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) + #define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE) ++#define TMP_WORKSPACE_SIZE (MAX(ENTROPY_WORKSPACE_SIZE, ZSTD_SLIPBLOCK_WORKSPACESIZE)) + + /* + * Indicates whether this compression proceeds directly from user-provided +@@ -345,11 +457,11 @@ typedef enum { + */ + #define ZSTD_MAX_NB_BLOCK_SPLITS 196 + typedef struct { +- seqStore_t fullSeqStoreChunk; +- seqStore_t firstHalfSeqStore; +- seqStore_t secondHalfSeqStore; +- seqStore_t currSeqStore; +- seqStore_t nextSeqStore; ++ SeqStore_t fullSeqStoreChunk; ++ SeqStore_t firstHalfSeqStore; ++ SeqStore_t secondHalfSeqStore; ++ SeqStore_t currSeqStore; ++ SeqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +@@ -366,7 +478,7 @@ struct ZSTD_CCtx_s { + size_t dictContentSize; + + ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */ +- size_t blockSize; ++ size_t blockSizeMax; + unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */ + unsigned long long consumedSrcSize; + unsigned long long producedCSize; +@@ -378,13 +490,14 @@ struct ZSTD_CCtx_s { + int isFirstBlock; + int initialized; + +- seqStore_t seqStore; /* sequences storage ptrs */ ++ SeqStore_t seqStore; /* sequences storage ptrs */ + ldmState_t ldmState; /* long distance matching state */ + rawSeq* ldmSequences; /* Storage for the ldm output sequences */ + size_t maxNbLdmSequences; +- rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ ++ RawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ + ZSTD_blockState_t blockState; +- U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ ++ void* tmpWorkspace; /* used as substitute of stack space - must be aligned for S64 type */ ++ size_t tmpWkspSize; + + /* Whether we are streaming or not */ + ZSTD_buffered_policy_e bufferedPolicy; +@@ -404,6 +517,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +531,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,17 +560,17 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. 
+ */ +-} ZSTD_cParamMode_e; ++} ZSTD_CParamMode_e; + +-typedef size_t (*ZSTD_blockCompressor) ( +- ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++typedef size_t (*ZSTD_BlockCompressor_f) ( ++ ZSTD_MatchState_t* bs, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); + + + MEM_STATIC U32 ZSTD_LLcode(U32 litLength) +@@ -497,12 +616,33 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + return 1; + } + ++/* ZSTD_selectAddr: ++ * @return index >= lowLimit ? candidate : backup, ++ * tries to force branchless codegen. */ ++MEM_STATIC const BYTE* ++ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* candidate, const BYTE* backup) ++{ ++#if defined(__x86_64__) ++ __asm__ ( ++ "cmp %1, %2\n" ++ "cmova %3, %0\n" ++ : "+r"(candidate) ++ : "r"(index), "r"(lowLimit), "r"(backup) ++ ); ++ return candidate; ++#else ++ return index >= lowLimit ? candidate : backup; ++#endif ++} ++ + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +650,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +670,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +706,68 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ ++ ++/*! ZSTD_storeSeqOnly() : ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. ++ * Literals themselves are not copied, but @litPtr is updated. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). ++ * @matchLength : must be >= MINMATCH ++*/ ++HINT_INLINE UNUSED_ATTR void ++ZSTD_storeSeqOnly(SeqStore_t* seqStorePtr, ++ size_t litLength, ++ U32 offBase, ++ size_t matchLength) ++{ ++ assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); ++ ++ /* literal Length */ ++ assert(litLength <= ZSTD_BLOCKSIZE_MAX); ++ if (UNLIKELY(litLength>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_literalLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].litLength = (U16)litLength; ++ ++ /* match offset */ ++ seqStorePtr->sequences[0].offBase = offBase; ++ ++ /* match Length */ ++ assert(matchLength <= ZSTD_BLOCKSIZE_MAX); ++ assert(matchLength >= MINMATCH); ++ { size_t const mlBase = matchLength - MINMATCH; ++ if (UNLIKELY(mlBase>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_matchLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].mlBase = (U16)mlBase; ++ } ++ ++ seqStorePtr->sequences++; ++} + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. 
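
A short aside on the offBase encoding: the OFFBASE macros just above fold two cases into a single U32, where values 1..ZSTD_REP_NUM stand for repcodes and anything larger is a real offset shifted up by ZSTD_REP_NUM. The standalone C sketch below merely restates that encoding outside the patch; every *_sketch name and the sample values are invented for illustration and are not part of zstd.

    #include <assert.h>
    #include <stdio.h>

    #define ZSTD_REP_NUM_SKETCH 3  /* mirrors ZSTD_REP_NUM */

    /* repcodes 1..3 are stored as-is; real offsets are stored as offset + ZSTD_REP_NUM */
    static unsigned repcodeToOffBase_sketch(unsigned r)  { assert(r >= 1 && r <= ZSTD_REP_NUM_SKETCH); return r; }
    static unsigned offsetToOffBase_sketch(unsigned o)   { assert(o > 0); return o + ZSTD_REP_NUM_SKETCH; }
    static int      offBaseIsRepcode_sketch(unsigned ob) { return 1 <= ob && ob <= ZSTD_REP_NUM_SKETCH; }
    static unsigned offBaseToOffset_sketch(unsigned ob)  { assert(!offBaseIsRepcode_sketch(ob)); return ob - ZSTD_REP_NUM_SKETCH; }

    int main(void)
    {
        unsigned const obRep = repcodeToOffBase_sketch(2);  /* repcode 2 -> offBase 2  */
        unsigned const obOff = offsetToOffBase_sketch(90);  /* offset 90 -> offBase 93 */
        printf("is repcode? %d %d\n", offBaseIsRepcode_sketch(obRep), offBaseIsRepcode_sketch(obOff));
        printf("offset back: %u\n", offBaseToOffset_sketch(obOff));
        return 0;
    }
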
++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. + */ + HINT_INLINE UNUSED_ATTR void +-ZSTD_storeSeq(seqStore_t* seqStorePtr, ++ZSTD_storeSeq(SeqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +776,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +787,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -619,44 +799,22 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + } + seqStorePtr->lit += litLength; + +- /* literal Length */ +- if (litLength>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_literalLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].litLength = (U16)litLength; +- +- /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); +- +- /* match Length */ +- assert(matchLength >= MINMATCH); +- { size_t const mlBase = matchLength - MINMATCH; +- if (mlBase>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_matchLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].mlBase = (U16)mlBase; +- } +- +- seqStorePtr->sequences++; ++ ZSTD_storeSeqOnly(seqStorePtr, litLength, offBase, matchLength); + } + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 
const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? rep[1] : rep[2]; +@@ -670,14 +828,14 @@ ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) + + typedef struct repcodes_s { + U32 rep[3]; +-} repcodes_t; ++} Repcodes_t; + +-MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++MEM_STATIC Repcodes_t ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- repcodes_t newReps; ++ Repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +843,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -771,8 +876,8 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + size_t const matchLength = ZSTD_count(ip, match, vEnd); + if (match + matchLength != mEnd) return matchLength; + DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength); +- DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match); +- DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip); ++ DEBUGLOG(7, "distance from match beginning to end dictionary = %i", (int)(mEnd - match)); ++ DEBUGLOG(7, "distance from current pos to end buffer = %i", (int)(iEnd - ip)); + DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart); + DEBUGLOG(7, "final match length = %zu", matchLength + 
ZSTD_count(ip+matchLength, iStart, iEnd)); + return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd); +@@ -783,32 +888,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) 
>> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +936,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. + */ +@@ -881,11 +1015,12 @@ MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 + /*-************************************* + * Round buffer management + ***************************************/ +-#if (ZSTD_WINDOWLOG_MAX_64 > 31) +-# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX" +-#endif +-/* Max current allowed */ +-#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) ++/* Max @current value allowed: ++ * In 32-bit mode: we want to avoid crossing the 2 GB limit, ++ * reducing risks of side effects in case of signed operations on indexes. ++ * In 64-bit mode: we want to ensure that adding the maximum job size (512 MB) ++ * doesn't overflow U32 index capacity (4 GB) */ ++#define ZSTD_CURRENT_MAX (MEM_64bits() ? 3500U MB : 2000U MB) + /* Maximum chunk size before overflow correction needs to be called again */ + #define ZSTD_CHUNKSIZE_MAX \ + ( ((U32)-1) /* Maximum ending current index */ \ +@@ -925,7 +1060,7 @@ MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) + * Inspects the provided matchState and figures out what dictMode should be + * passed to the compressor. + */ +-MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) ++MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_MatchState_t *ms) + { + return ZSTD_window_hasExtDict(ms->window) ? + ZSTD_extDict : +@@ -1011,7 +1146,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1112,7 +1249,7 @@ ZSTD_window_enforceMaxDist(ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? 
*loadedDictEndPtr : 0; +@@ -1157,7 +1294,7 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + assert(loadedDictEndPtr != NULL); + assert(dictMatchStatePtr != NULL); +@@ -1167,10 +1304,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,9 +1341,11 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, +- void const* src, size_t srcSize, +- int forceNonContiguous) ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, ++ const void* src, size_t srcSize, ++ int forceNonContiguous) + { + BYTE const* const ip = (BYTE const*)src; + U32 contiguous = 1; +@@ -1228,8 +1372,9 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ( (ip+srcSize > window->dictBase + window->lowLimit) + & (ip < window->dictBase + window->dictLimit)) { +- ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; +- U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ size_t const highInputIdx = (size_t)((ip + srcSize) - window->dictBase); ++ U32 const lowLimitMax = (highInputIdx > (size_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ assert(highInputIdx < UINT_MAX); + window->lowLimit = lowLimitMax; + DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); + } +@@ -1239,7 +1384,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + /* + * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix. + */ +-MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; +@@ -1256,7 +1401,7 @@ MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, u + /* + * Returns the lowest allowed match index in the prefix. 
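
The hash helpers a little earlier in this hunk (ZSTD_hash3 .. ZSTD_hash8 and their *PtrS wrappers) now take a salt that is XORed in before the final shift. The standalone sketch below restates the 4-byte formula with the same prime, just to show the salt steering identical input to a different bucket; the sample value, salt and hashLog are arbitrary and every *_sketch name is invented.

    #include <stdio.h>

    static unsigned const prime4bytes_sketch = 2654435761U;  /* same constant as prime4bytes */

    /* ((value * prime4bytes) ^ salt) >> (32 - hashLog), as in the salted ZSTD_hash4 */
    static unsigned hash4Salted_sketch(unsigned value, unsigned hashLog, unsigned salt)
    {
        return ((value * prime4bytes_sketch) ^ salt) >> (32 - hashLog);
    }

    int main(void)
    {
        unsigned const value   = 0x64636261u;  /* the bytes "abcd" read little-endian */
        unsigned const hashLog = 17;
        printf("salt 0          -> bucket %u\n", hash4Salted_sketch(value, hashLog, 0));
        printf("salt 0xA5A5A5A5 -> bucket %u\n", hash4Salted_sketch(value, hashLog, 0xA5A5A5A5u));
        return 0;
    }
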
+ */ +-MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.dictLimit; +@@ -1269,6 +1414,13 @@ MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, + return matchLowest; + } + ++/* index_safety_check: ++ * intentional underflow : ensure repIndex isn't overlapping dict + prefix ++ * @return 1 if values are not overlapping, ++ * 0 otherwise */ ++MEM_STATIC int ZSTD_index_overlap_check(const U32 prefixLowestIndex, const U32 repIndex) { ++ return ((U32)((prefixLowestIndex-1) - repIndex) >= 3); ++} + + + /* debug functions */ +@@ -1302,7 +1454,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} + ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. 
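
The Short Cache comment above explains how an 8-bit tag is packed next to the table index so a matchfinder probe can skip dereferencing base + index whenever the tags disagree. The standalone sketch below restates that packing, mirroring ZSTD_writeTaggedIndex and the tag comparison; the *_sketch names and the example index/tag values are illustrative only, not taken from the patch.

    #include <assert.h>
    #include <stdio.h>

    #define TAG_BITS_SKETCH 8                             /* mirrors ZSTD_SHORT_CACHE_TAG_BITS */
    #define TAG_MASK_SKETCH ((1u << TAG_BITS_SKETCH) - 1)

    /* pack (index, tag) into one 32-bit table entry, as ZSTD_writeTaggedIndex does */
    static unsigned packTaggedIndex_sketch(unsigned index, unsigned tag)
    {
        assert((index >> (32 - TAG_BITS_SKETCH)) == 0);   /* index must fit in 24 bits */
        return (index << TAG_BITS_SKETCH) | (tag & TAG_MASK_SKETCH);
    }

    int main(void)
    {
        unsigned const entry = packTaggedIndex_sketch(0x123456, 0xAB);
        unsigned const index = entry >> TAG_BITS_SKETCH;
        unsigned const tag   = entry &  TAG_MASK_SKETCH;
        /* the expensive load at base+index only happens when the 8-bit tags agree */
        printf("index=%#x tag=%#x tagsMatch=%d\n", index, tag, tag == 0xABu);
        return 0;
    }
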
*/ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + /* =============================================================== + * Shared internal declarations +@@ -1319,6 +1506,25 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_SequencePosition; ++ ++/* for benchmark */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int const repcodeResolution); ++ ++typedef struct { ++ size_t nbSequences; ++ size_t blockSize; ++ size_t litSize; ++} BlockSummary; ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs); ++ + /* ============================================================== + * Private declarations + * These prototypes shall only be called from within lib/compress +@@ -1330,7 +1536,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + * Note: srcSizeHint == 0 means 0! + */ + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + /*! ZSTD_initCStream_internal() : + * Private use only. Init streaming operation. +@@ -1342,7 +1548,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize); + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr); ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr); + + /*! ZSTD_getCParamsFromCDict() : + * as the name implies */ +@@ -1381,11 +1587,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1601,28 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..ec39b4299b6f 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; +- symbolEncodingType_e hType = set_compressed; ++ SymbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
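
ZSTD_minLiteralsToCompress() above turns the strategy into a byte threshold: shift = MIN(9 - strategy, 3), so the cut-off is 8 << shift (64 bytes for the fast strategies down to 8 for btultra2), or a flat 6 when a previous Huffman table is reusable. The small program below only re-derives those numbers; it assumes the usual ZSTD_strategy numbering (ZSTD_fast = 1 .. ZSTD_btultra2 = 9) and uses invented *_sketch names.

    #include <stdio.h>

    /* same arithmetic as ZSTD_minLiteralsToCompress(): 8 << MIN(9 - strategy, 3), or 6 on table reuse */
    static size_t minLitThreshold_sketch(int strategy, int tableIsReusable)
    {
        int const shift = (9 - strategy) < 3 ? (9 - strategy) : 3;
        return tableIsReusable ? 6 : (size_t)8 << shift;
    }

    int main(void)
    {
        int s;
        for (s = 1; s <= 9; s++)   /* prints 64 for strategies 1..6, then 32, 16, 8 */
            printf("strategy %d -> %zu bytes\n", s, minLitThreshold_sketch(s, 0));
        printf("with reusable table -> %zu bytes\n", minLitThreshold_sketch(9, 1));
        return 0;
    }
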
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..256980c9d85a 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -153,20 +154,20 @@ size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + return cost >> 8; + } + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) + { + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +@@ -241,7 +242,7 @@ typedef struct { + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -293,7 +294,7 @@ ZSTD_encodeSequences_body( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; +@@ -387,7 +388,7 @@ ZSTD_encodeSequences_default( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -405,7 +406,7 @@ ZSTD_encodeSequences_bmi2( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -421,7 +422,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) + { + DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); + #if DYNAMIC_BMI2 +diff --git 
a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..14fdccb6547f 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,26 +12,27 @@ + #ifndef ZSTD_COMPRESS_SEQUENCES_H + #define ZSTD_COMPRESS_SEQUENCES_H + ++#include "zstd_compress_internal.h" /* SeqDef */ + #include "../common/fse.h" /* FSE_repeat, FSE_CTable */ +-#include "../common/zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */ ++#include "../common/zstd_internal.h" /* SymbolEncodingType_e, ZSTD_strategy */ + + typedef enum { + ZSTD_defaultDisallowed = 0, + ZSTD_defaultAllowed = 1 +-} ZSTD_defaultPolicy_e; ++} ZSTD_DefaultPolicy_e; + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy); + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -42,7 +44,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); + + size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..dc12d64e935c 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. 
+ * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -50,11 +52,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart + lhSize; + U32 const singleStream = lhSize == 3; +- symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; ++ SymbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(SeqStore_t const* seqStore, ++ const SeqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 
/*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -258,7 +263,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + * Or 0 if it failed to compress. */ + static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- const seqDef* sequences, size_t nbSeq, ++ const SeqDef* sequences, size_t nbSeq, + const BYTE* literals, size_t litSize, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; +@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, +- op, oend-op, ++ op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ +- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; ++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } +- return op-ostart; ++ return (size_t)(op-ostart); + } + + static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -322,7 +328,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t lit + return 0; + } + +-static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, ++static size_t ZSTD_estimateSubBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U8* additionalBits, +@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + +-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, ++typedef struct { ++ size_t estLitSize; ++ size_t estBlockSize; ++} EstimatedBlockSize; ++static EstimatedBlockSize 
ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, +@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { +- size_t cSizeEstimate = 0; +- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); +- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, ++ int writeLitEntropy, int writeSeqEntropy) ++{ ++ EstimatedBlockSize ebs; ++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); ++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); +- return cSizeEstimate + ZSTD_blockHeaderSize; ++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; ++ return ebs; + } + + static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +@@ -415,14 +427,57 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe + return 0; + } + ++static size_t countLiterals(SeqStore_t const* seqStore, const SeqDef* sp, size_t seqCount) ++{ ++ size_t n, total = 0; ++ assert(sp != NULL); ++ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); ++ return total; ++} ++ ++#define BYTESCALE 256 ++ ++static size_t sizeBlockSequences(const SeqDef* sp, size_t nbSeqs, ++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, ++ int firstSubBlock) ++{ ++ size_t n, budget = 0, inSize=0; ++ /* entropy headers */ ++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ ++ assert(firstSubBlock==0 || firstSubBlock==1); ++ budget += headerSize; ++ ++ /* first sequence => at least one sequence*/ ++ budget += sp[0].litLength * avgLitCost + avgSeqCost; ++ if (budget > targetBudget) return 1; ++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); ++ ++ /* loop over sequences */ ++ for (n=1; n targetBudget) ++ /* though continue to expand until the sub-block is deemed compressible */ ++ && (budget < inSize * BYTESCALE) ) ++ break; ++ } ++ ++ return n; ++} ++ + /* ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. +- * Entropy will be written to the first block. +- * The following blocks will use repeat mode to compress. +- * All sub-blocks are compressed blocks (no raw or rle blocks). +- * @return : compressed size of the super block (which is multiple ZSTD blocks) +- * Or 0 if it failed to compress. */ +-static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, ++ * Entropy will be written into the first block. ++ * The following blocks use repeat_mode to compress. ++ * Sub-blocks are all compressed, except the last one when beneficial. ++ * @return : compressed size of the super block (which features multiple ZSTD blocks) ++ * or 0 if it failed to compress. 
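
sizeBlockSequences() above accounts costs in fixed-point units of 1/256 byte (BYTESCALE): each sequence adds litLength * avgLitCost + avgSeqCost to the running budget, which is then compared against the scaled target. The snippet below just walks through that arithmetic once, with made-up costs of 0.5 byte per literal and 3 bytes per sequence; none of the numbers or *_sketch names come from the patch.

    #include <stdio.h>

    #define BYTESCALE_SKETCH 256  /* same fixed-point scale as BYTESCALE above */

    int main(void)
    {
        unsigned const avgLitCost = 128;  /* 0.5 byte per literal, in 1/256-byte units */
        unsigned const avgSeqCost = 768;  /* 3 bytes per sequence */
        unsigned const litLength  = 20;

        unsigned const scaled = litLength * avgLitCost + avgSeqCost;  /* 3328 scaled units */
        printf("one sequence costs ~%u bytes (%u/256)\n", scaled / BYTESCALE_SKETCH, scaled);
        return 0;
    }
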
*/ ++static size_t ZSTD_compressSubBlock_multi(const SeqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +@@ -432,12 +487,14 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) + { +- const seqDef* const sstart = seqStorePtr->sequencesStart; +- const seqDef* const send = seqStorePtr->sequences; +- const seqDef* sp = sstart; ++ const SeqDef* const sstart = seqStorePtr->sequencesStart; ++ const SeqDef* const send = seqStorePtr->sequences; ++ const SeqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; ++ size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; +@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; +- size_t targetCBlockSize = cctxParams->targetCBlockSize; +- size_t litSize, seqCount; +- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; ++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ ++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); ++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; +- int lastSequence = 0; +- +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", +- (unsigned)(lend-lp), (unsigned)(send-sstart)); +- +- litSize = 0; +- seqCount = 0; +- do { +- size_t cBlockSizeEstimate = 0; +- if (sstart == send) { +- lastSequence = 1; +- } else { +- const seqDef* const sequence = sp + seqCount; +- lastSequence = sequence == send - 1; +- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; +- seqCount++; +- } +- if (lastSequence) { +- assert(lp <= lend); +- assert(litSize <= (size_t)(lend - lp)); +- litSize = (size_t)(lend - lp); ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", ++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); ++ ++ /* let's start by a general estimation for the full block */ ++ if (nbSeqs > 0) { ++ EstimatedBlockSize const ebs = ++ ZSTD_estimateSubBlockSize(lp, nbLiterals, ++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, ++ &nextCBlock->entropy, entropyMetadata, ++ workspace, wkspSize, ++ writeLitEntropy, writeSeqEntropy); ++ /* quick estimation */ ++ size_t const avgLitCost = nbLiterals ? 
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; +- repcodes_t rep; ++ const SeqDef* seq; ++ Repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +@@ -559,7 +675,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, +@@ -569,5 +685,5 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */); + } +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..dce42f653bae 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,8 +15,10 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" +- ++#include "../common/portability_macros.h" ++#include "../common/compiler.h" /* ZS2_isPower2 */ + + /*-************************************* + * Constants +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,14 +184,16 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* + * Align must be a power of 2. + */ +-MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { ++MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t align) { + size_t const mask = align - 1; +- assert((align & mask) == 0); ++ assert(ZSTD_isPower2(align)); + return (size + mask) & ~mask; + } + +@@ -189,7 +207,7 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { + * to figure out how much space you need for the matchState tables. Everything + * else is though. + * +- * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). ++ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned64_alloc_size(). + */ + MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + if (size == 0) +@@ -197,12 +215,16 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + return size; + } + ++MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size, size_t alignment) { ++ return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, alignment)); ++} ++ + /* + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +-MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { +- return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); ++MEM_STATIC size_t ZSTD_cwksp_aligned64_alloc_size(size_t size) { ++ return ZSTD_cwksp_aligned_alloc_size(size, ZSTD_CWKSP_ALIGNMENT_BYTES); + } + + /* +@@ -210,14 +232,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. 
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -229,11 +247,23 @@ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; +- assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(ZSTD_isPower2(alignBytes)); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) ++{ ++ char* endPtr = (char*)ws->workspaceEnd; ++ assert(ZSTD_isPower2(ZSTD_CWKSP_ALIGNMENT_BYTES)); ++ endPtr = endPtr - ((size_t)endPtr % ZSTD_CWKSP_ALIGNMENT_BYTES); ++ return (void*)endPtr; ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -246,7 +276,7 @@ ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) + { + void* const alloc = (BYTE*)ws->allocStart - bytes; + void* const bottom = ws->tableEnd; +- DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", ++ DEBUGLOG(5, "cwksp: reserving [0x%p]:%zd bytes; %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); +@@ -274,27 +304,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +321,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +335,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -345,29 +366,61 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). + */ +-MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) + { +- void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), +- ZSTD_cwksp_alloc_aligned); +- assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. */ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned64(ZSTD_cwksp* ws, size_t bytes) ++{ ++ void* const ptr = ZSTD_cwksp_reserve_internal(ws, ++ ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), ++ ZSTD_cwksp_alloc_aligned); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return ptr; + } + + /* + * Aligned on 64 bytes. 
These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -387,7 +440,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + + + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); +- assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return alloc; + } + +@@ -421,6 +474,20 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) + + return alloc; + } ++/* ++ * with alignment control ++ * Note : should happen only once, at workspace first initialization ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_object_aligned(ZSTD_cwksp* ws, size_t byteSize, size_t alignment) ++{ ++ size_t const mask = alignment - 1; ++ size_t const surplus = (alignment > sizeof(void*)) ? alignment - sizeof(void*) : 0; ++ void* const start = ZSTD_cwksp_reserve_object(ws, byteSize + surplus); ++ if (start == NULL) return NULL; ++ if (surplus == 0) return start; ++ assert(ZSTD_isPower2(alignment)); ++ return (void*)(((size_t)start + surplus) & ~mask); ++} + + MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) + { +@@ -451,7 +518,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -460,7 +527,8 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + * Invalidates table allocations. + * All other allocations remain valid. 
+ */ +-MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { ++MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) ++{ + DEBUGLOG(4, "cwksp: clearing tables!"); + + +@@ -478,14 +546,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +575,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +607,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +619,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
+- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +@@ -591,5 +654,4 @@ MEM_STATIC void ZSTD_cwksp_bump_oversized_duration( + } + } + +- + #endif /* ZSTD_CWKSP_H */ +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..995e83f3a183 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,13 +85,26 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) + { + ZSTD_compressionParameters const* cParams = &ms->cParams; +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -88,9 +143,14 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ ++ const BYTE* matchs0_safe; /* matchs0 or safe address */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ ++ /* Array of ~random data, should have low probability of matching data ++ * we load from here instead of from tables, if matchl0/matchl1 are ++ * invalid indices. Used to avoid unpredictable branches. 
*/ ++ const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + +@@ -100,8 +160,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,30 +191,35 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + +- if (idxl0 > prefixLowestIndex) { ++ /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. ++ * However expression below complies into conditional move. Since ++ * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex ++ * if there is a match, all branches become predictable. */ ++ { const BYTE* const matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); ++ + /* check prefix long match */ +- if (MEM_read64(matchl0) == MEM_read64(ip)) { ++ if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; +- } +- } ++ } } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + +- if (idxs0 > prefixLowestIndex) { +- /* check prefix short match */ +- if (MEM_read32(matchs0) == MEM_read32(ip)) { +- goto _search_next_long; +- } ++ /* Same optimization as matchl0 above */ ++ matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); ++ ++ /* check prefix short match */ ++ if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { ++ goto _search_next_long; + } + + if (ip1 >= nextStep) { +@@ -175,30 +240,36 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + + _search_next_long: + +- /* check prefix long +1 match */ +- if (idxl1 > prefixLowestIndex) { +- if (MEM_read64(matchl1) == MEM_read64(ip1)) { ++ /* short match found: let's check for a longer one */ ++ mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; ++ offset = (U32)(ip - matchs0); ++ ++ /* check long match at +1 position */ ++ if ((idxl1 > prefixLowestIndex) && (MEM_read64(matchl1) == MEM_read64(ip1))) { ++ size_t const l1len = ZSTD_count(ip1+8, matchl1+8, iend) + 8; ++ if (l1len > mLength) { ++ /* use the long match instead */ + ip = ip1; +- mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; ++ mLength = l1len; + offset = (U32)(ip-matchl1); +- while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ +- goto _match_found; ++ matchs0 = matchl1; + } + } + +- /* if no long +1 match, explore the short match we found */ +- mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; +- offset = (U32)(ip - matchs0); +- while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ ++ while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* complete backward */ + + /* fall-through */ + +@@ -217,7 +288,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +314,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -254,8 +325,9 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) + { +@@ -275,9 +347,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; +@@ -286,8 +357,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +366,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +387,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -323,26 +405,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ + + /* check repcode */ +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* 
repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +- if (matchIndexL > prefixLowestIndex) { ++ if ((matchIndexL >= prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + /* check prefix long match */ +- if (MEM_read64(matchLong) == MEM_read64(ip)) { +- mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; +- offset = (U32)(ip-matchLong); +- while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ +- goto _match_found; +- } +- } else { ++ mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; ++ offset = (U32)(ip-matchLong); ++ while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ ++ goto _match_found; ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -354,13 +434,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } } + + if (matchIndexS > prefixLowestIndex) { +- /* check prefix short match */ ++ /* short match candidate */ + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,25 +455,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + + /* check prefix long +1 match */ +- if (matchIndexL3 > prefixLowestIndex) { +- if (MEM_read64(matchL3) == MEM_read64(ip+1)) { +- mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; +- ip++; +- offset = (U32)(ip-matchL3); +- while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ +- goto _match_found; +- } +- } else { ++ if ((matchIndexL3 >= prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1))) { ++ mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; ++ ip++; ++ offset = (U32)(ip-matchL3); ++ while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ ++ goto _match_found; ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + 
dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +498,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -443,12 +522,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +540,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -470,7 +549,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + + #define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ +@@ -488,7 +567,7 @@ ZSTD_GEN_DFAST_FN(dictMatchState, 7) + + + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -508,7 +587,7 @@ size_t ZSTD_compressBlock_doubleFast( + + + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -527,8 +606,10 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) + { +@@ -579,13 +660,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + size_t mLength; + hashSmall[hSmall] = 
hashLong[hLong] = curr; /* update hash table */ + +- if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ ++ if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +677,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +702,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -647,13 +728,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + & (offset_2 <= current2 - dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -677,7 +758,7 @@ ZSTD_GEN_DFAST_FN(extDict, 6) + ZSTD_GEN_DFAST_FN(extDict, 7) + + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -694,3 +775,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..011556ce56f7 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,22 +12,32 @@ + #ifndef ZSTD_DOUBLE_FAST_H + #define ZSTD_DOUBLE_FAST_H + +- + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + #endif /* ZSTD_DOUBLE_FAST_H */ +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..60e07e839e5f 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann 
Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,60 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ ++ ++typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit); ++ ++static int ++ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* Array of ~random data, should have low probability of matching data. ++ * Load from here if the index is invalid. ++ * Used to avoid unpredictable branches. */ ++ static const BYTE dummy[] = {0x12,0x34,0x56,0x78}; ++ ++ /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. ++ * However expression below compiles into conditional move. 
++ */ ++ const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy); ++ /* Note: this used to be written as : return test1 && test2; ++ * Unfortunately, once inlined, these tests become branches, ++ * in which case it becomes critical that they are executed in the right order (test1 then test2). ++ * So we have to write these tests in a specific manner to ensure their ordering. ++ */ ++ if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; ++ /* force ordering of these tests, which matters once the function is inlined, as they become branches */ ++ __asm__(""); ++ return matchIdx >= idxLowLimit; ++} ++ ++static int ++ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* using a branch instead of a cmov, ++ * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true, ++ * aka almost all candidates are within range */ ++ U32 mval; ++ if (matchIdx >= idxLowLimit) { ++ mval = MEM_read32(matchAddress); ++ } else { ++ mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ ++ } ++ ++ return (MEM_read32(currentPtr) == mval); ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,17 +186,17 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + * + * This is also the work we do at the beginning to enter the loop initially. + */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +- U32 const mls, U32 const hasStep) ++ U32 const mls, int useCmov) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; +- /* support stepSize of 0 */ +- size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2; ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */ + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); +@@ -117,12 +214,11 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +- U32 idx; /* match idx for ip0 */ +- U32 mval; /* src value at match idx */ ++ U32 matchIdx; /* match idx for ip0 */ + + U32 offcode; + const BYTE* match0; +@@ -135,14 +231,15 @@ ZSTD_compressBlock_fast_noDict_generic( + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); ++ const ZSTD_match4Found matchFound = useCmov ? 
ZSTD_match4Found_cmov : ZSTD_match4Found_branch; + + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); + ip0 += (ip0 == prefixStart); + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -163,7 +260,7 @@ ZSTD_compressBlock_fast_noDict_generic( + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + +- idx = hashTable[hash0]; ++ matchIdx = hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ +@@ -180,26 +277,28 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* Write next hash table entry: it's already calculated. ++ * This write is known to be safe because ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry (it's already calculated). ++ * This write is known to be safe because the ip1 == ip0 + 1, ++ * so searching will resume after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); + +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ + goto _offset; + } + + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -214,21 +313,19 @@ ZSTD_compressBlock_fast_noDict_generic( + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } +- +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry, since it's already calculated */ ++ if (step <= 4) { ++ /* Avoid writing an index if it's >= position where search will resume. ++ * The minimum possible match has length 4, so search can resume at ip0 + 4. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } + goto _offset; + } + + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -250,13 +347,28 @@ ZSTD_compressBlock_fast_noDict_generic( + } while (ip3 < ilimit); + + _cleanup: +- /* Note that there are probably still a couple positions we could search. ++ /* Note that there are probably still a couple positions one could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. 
We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -264,10 +376,10 @@ ZSTD_compressBlock_fast_noDict_generic( + _offset: /* Requires: ip0, idx */ + + /* Compute the offset code. */ +- match0 = base + idx; ++ match0 = base + matchIdx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +399,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +413,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) 
*/ + } } } +@@ -314,12 +421,12 @@ ZSTD_compressBlock_fast_noDict_generic( + goto _start; + } + +-#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ +- static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \ ++ static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ +- return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ ++ return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \ + } + + ZSTD_GEN_FAST_FN(noDict, 4, 1) +@@ -333,13 +440,15 @@ ZSTD_GEN_FAST_FN(noDict, 6, 0) + ZSTD_GEN_FAST_FN(noDict, 7, 0) + + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- U32 const mls = ms->cParams.minMatch; ++ U32 const mml = ms->cParams.minMatch; ++ /* use cmov when "candidate in range" branch is likely unpredictable */ ++ int const useCmov = ms->cParams.windowLog < 19; + assert(ms->dictMatchState == NULL); +- if (ms->cParams.targetLength > 1) { +- switch(mls) ++ if (useCmov) { ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -352,7 +461,8 @@ size_t ZSTD_compressBlock_fast( + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); + } + } else { +- switch(mls) ++ /* use a branch instead */ ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -364,13 +474,13 @@ size_t ZSTD_compressBlock_fast( + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); + } +- + } + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -380,16 +490,16 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; + const U32* const dictHashTable = dms->hashTable; + const U32 dictStartIndex = dms->window.dictLimit; +@@ -397,13 +507,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - 
prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. */ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +523,154 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? ++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) { ++ /* found a regular match of size >= 4 */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* 
repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; +- if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -525,7 +683,7 @@ ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) + ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -545,19 +703,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +729,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const 
BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +760,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? 
dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. */ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0)) ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -656,10 +964,11 @@ ZSTD_GEN_FAST_FN(extDict, 6, 0) + ZSTD_GEN_FAST_FN(extDict, 7, 0) + + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..04fde0a72a4e 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,21 +12,20 @@ + #ifndef ZSTD_FAST_H + #define ZSTD_FAST_H + +- + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- + #endif /* ZSTD_FAST_H */ +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..88e2501fe3ef 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_MatchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,9 +160,10 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( +- const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( ++ const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + size_t bestLength, +@@ -159,7 +171,7 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +- const ZSTD_matchState_t * const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static 
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,24 +391,25 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* + * Dedicated dict search + ***********************************/ + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); +@@ -514,7 +528,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B + */ + FORCE_INLINE_TEMPLATE + size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, +- const ZSTD_matchState_t* const dms, ++ const ZSTD_MatchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { +@@ -561,7 +575,7 @@ size_t 
ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( +- ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( ++ ZSTD_MatchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,21 +648,25 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; + } + +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +@@ -670,7 +690,7 @@ size_t ZSTD_HcFindBestMatch( + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch + ? 
dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. 
+- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. 
+- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -965,13 +947,41 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) { + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask = (1u << rowLog) - 1; + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. ++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". 
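To make the grouped-mask arithmetic described above concrete, here is a standalone sketch of how the search loop walks such a mask. The rowEntries/groupWidth pair mirrors the 16-entry NEON case from the comment, while the mask itself is a handmade example rather than real vector output.

/* Walking a grouped match mask: each row entry owns `groupWidth` bits, only the
 * lowest bit of a group is kept after the AND with 0x88... / 0x55... above, and
 * the mask has already been rotated so that bit 0 corresponds to "headGrouped". */
#include <stdint.h>
#include <stdio.h>

static unsigned ctz64(uint64_t v) { unsigned n = 0; while (!(v & 1)) { v >>= 1; ++n; } return n; }

int main(void)
{
    uint32_t const rowEntries  = 16;            /* the 16-entry NEON case */
    uint32_t const groupWidth  = 4;             /* 4 mask bits per entry for 16-entry rows */
    uint32_t const rowMask     = rowEntries - 1;
    uint32_t const head        = 5;             /* head position stored in tagRow byte 0 */
    uint32_t const headGrouped = head * groupWidth;

    /* pretend the entries 6 and 9 positions after the head matched the tag */
    uint64_t matches = (1ull << (6 * groupWidth)) | (1ull << (9 * groupWidth));

    for (; matches != 0; matches &= matches - 1) {   /* clear lowest set bit, like the search loop */
        uint32_t const matchPos = ((headGrouped + ctz64(matches)) / groupWidth) & rowMask;
        if (matchPos == 0) continue;                 /* position 0 holds the head byte, never a tag */
        printf("candidate at row position %u\n", matchPos);
    }
    return 0;
}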
So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 
0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,29 +1124,30 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every context reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by introducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. +- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. 
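The row layout summarised in this comment block can be modelled in a few lines of standalone C. The table sizes and the hash value below are arbitrary stand-ins; only the split into an 8-bit tag plus a row index, the "one head byte plus tags" row shape, and the backwards-cycling head follow the description.

/* Toy model of one row-matchfinder row: the hash is split into a row index and
 * an 8-bit tag (ZSTD_ROW_HASH_TAG_BITS == 8); hashTable rows hold 32-bit match
 * indices, tagTable rows hold one "head" byte followed by the 1-byte tags. */
#include <stdint.h>
#include <stdio.h>

#define ROW_LOG      4                      /* 16 slots per row: 1 head byte + 15 tags */
#define ROW_ENTRIES  (1u << ROW_LOG)
#define ROW_MASK     (ROW_ENTRIES - 1)
#define TAG_BITS     8
#define TAG_MASK     ((1u << TAG_BITS) - 1)
#define HASH_LOG     10                     /* 1024 rows in this toy table */

static uint32_t hashTable[(1u << HASH_LOG) << ROW_LOG];   /* match indices */
static uint8_t  tagTable [(1u << HASH_LOG) << ROW_LOG];   /* head byte + tags */

/* like ZSTD_row_nextIndex(): cycle the head backwards, never landing on byte 0 */
static uint32_t nextIndex(uint8_t* tagRow)
{
    uint32_t next = (uint32_t)(*tagRow - 1) & ROW_MASK;
    next += (next == 0) ? ROW_MASK : 0;
    *tagRow = (uint8_t)next;
    return next;
}

static uint32_t insertPosition(uint32_t hash, uint32_t matchIndex)
{
    uint32_t const rowIdx = (hash >> TAG_BITS) & ((1u << HASH_LOG) - 1);
    uint32_t const relRow = rowIdx << ROW_LOG;               /* row start, in entries */
    uint8_t* const tagRow = tagTable + relRow;
    uint32_t const pos = nextIndex(tagRow);
    tagRow[pos] = (uint8_t)(hash & TAG_MASK);                /* 1-byte tag for the SIMD compare */
    hashTable[relRow + pos] = matchIndex;                    /* full position for verification */
    return pos;
}

int main(void)
{
    uint32_t const hash = 0x12345u;                          /* pretend (salted) hash */
    printf("first insert lands at row position %u\n", insertPosition(hash, 42));
    printf("second insert lands at row position %u\n", insertPosition(hash, 43));
    return 0;
}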
+ */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,11 +1165,14 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
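The lazy-skipping mode mentioned here (and set further down via ms->lazySkipping) reduces to a simple step rule. The sketch below reproduces that rule with stand-in constants chosen to match the "8 bytes at a time" and "2KB without a match" figures from the later comment; they are not the actual zstd defines.

/* Step rule of the lazy parsers: the longer we go without a match, the bigger
 * the jump, and once a single jump exceeds 8 bytes we stop inserting every
 * position into the tables (lazySkipping = 1) until the next match is found. */
#include <stddef.h>
#include <stdio.h>

#define SEARCH_STRENGTH    8   /* stand-in for kSearchStrength */
#define LAZY_SKIPPING_STEP 8   /* stand-in for kLazySkippingStep */

int main(void)
{
    size_t const distances[] = { 0, 255, 2047, 2048, 8192 };  /* ip - anchor since last match */
    size_t i;
    for (i = 0; i < sizeof(distances)/sizeof(distances[0]); ++i) {
        size_t const step = (distances[i] >> SEARCH_STRENGTH) + 1;
        int const lazySkipping = step > LAZY_SKIPPING_STEP;
        printf("ip-anchor=%zu -> step=%zu, lazySkipping=%d\n",
               distances[i], step, lazySkipping);
    }
    return 0;   /* lazySkipping flips to 1 once ip-anchor reaches 2KB, as the comment states */
}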
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1301,7 +1341,7 @@ size_t ZSTD_RowFindBestMatch( + * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a +- * bunch of constants from the ZSTD_matchState_t. These computations could be ++ * bunch of constants from the ZSTD_MatchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. +@@ -1325,7 +1365,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ +@@ -1335,7 +1375,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1345,7 +1385,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1446,7 +1486,7 @@ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searc + * If a match is found its offset is stored in @p offsetPtr. 
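Throughout this hunk a single offsetPtr/offBase value carries either a repcode index or a real match offset. The sketch below models that single-namespace encoding; the exact bias is an assumption based on the usual zstd convention (repcodes occupy the low values, real offsets are shifted past them) and is not defined anywhere in this patch.

/* Toy version of the OFFBASE encoding used by the lazy parsers: one integer
 * carries either "repcode r" (small values) or "real offset o" (shifted up),
 * so the gain heuristic can apply ZSTD_highbit32() to it uniformly. */
#include <assert.h>
#include <stdio.h>

#define REP_NUM 3                                   /* number of repcodes kept in history */
#define REPCODE_TO_OFFBASE(r)  (r)                  /* assumed: repcodes 1..3 map to 1..3 */
#define OFFSET_TO_OFFBASE(o)   ((o) + REP_NUM)      /* assumed: real offsets are biased past them */
#define OFFBASE_IS_OFFSET(ob)  ((ob) > REP_NUM)
#define OFFBASE_TO_OFFSET(ob)  ((ob) - REP_NUM)

int main(void)
{
    unsigned const repcode1 = REPCODE_TO_OFFBASE(1);     /* analogous to REPCODE1_TO_OFFBASE */
    unsigned const offbase  = OFFSET_TO_OFFBASE(1000);   /* a 1000-byte back-reference */

    assert(!OFFBASE_IS_OFFSET(repcode1));                /* repcodes stay below the bias */
    assert(OFFBASE_IS_OFFSET(offbase));                  /* real offsets sit above it */
    assert(OFFBASE_TO_OFFSET(offbase) == 1000);          /* and decode back losslessly */
    printf("repcode1 encodes as %u, offset 1000 encodes as %u\n", repcode1, offbase);
    return 0;
}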
+ */ + FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, +@@ -1472,9 +1512,10 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth, +@@ -1491,12 +1532,13 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; + const int isDxS = isDMS || isDDS; +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; + const BYTE* const dictBase = isDxS ? dms->window.base : NULL; + const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1548,7 +1591,7 @@ ZSTD_compressBlock_lazy_generic( + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,34 +1631,34 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,34 +1667,34 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1682,12 +1741,12 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatch = repIndex < prefixLowestIndex ? 
+ dictBase - dictIndexDelta + repIndex : + base + repIndex; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,168 +1760,183 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return 
ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t 
ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || 
!defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1912,7 +1987,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const U32 repIndex = (U32)(curr+1 - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ + if (MEM_read32(ip+1) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,30 +2023,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,50 +2055,57 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2023,14 +2114,14 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const U32 repIndex = repCurrent - offset_2; + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -2045,58 +2136,65 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return 
ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..987a036d8bde 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LAZY_H + #define ZSTD_LAZY_H + +- + #include "zstd_compress_internal.h" + + /* +@@ -22,98 +22,173 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip); ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip); + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). 
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++size_t ZSTD_compressBlock_greedy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void 
const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( ++ ZSTD_MatchState_t* ms, 
SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- + ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..54eefad9cae6 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,7 @@ + #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ + #include "zstd_ldm_geartab.h" + +-#define LDM_BUCKET_SIZE_LOG 3 ++#define LDM_BUCKET_SIZE_LOG 4 + #define LDM_MIN_MATCH_LENGTH 64 + #define LDM_HASH_RLOG 7 + +@@ -133,21 +134,35 @@ static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, + } + + void ZSTD_ldm_adjustParameters(ldmParams_t* params, +- ZSTD_compressionParameters const* cParams) ++ const ZSTD_compressionParameters* cParams) + { + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); +- if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; +- if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (params->hashRateLog == 0) { ++ if (params->hashLog > 0) { ++ /* if params->hashLog is set, derive hashRateLog from it */ ++ assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ if (params->windowLog > params->hashLog) { ++ params->hashRateLog = params->windowLog - params->hashLog; ++ } ++ } else { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ /* mapping from [fast, rate7] to [btultra2, rate4] */ ++ params->hashRateLog = 7 - (cParams->strategy/3); ++ } ++ } + if (params->hashLog == 0) { +- params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); +- assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX); + } +- if (params->hashRateLog == 0) { +- params->hashRateLog = params->windowLog < params->hashLog +- ? 0 +- : params->windowLog - params->hashLog; ++ if (params->minMatchLength == 0) { ++ params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (cParams->strategy >= ZSTD_btultra) ++ params->minMatchLength /= 2; ++ } ++ if (params->bucketSizeLog==0) { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX); + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); + } +@@ -170,22 +185,22 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) + /* ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. 
*/ + static ldmEntry_t* ZSTD_ldm_getBucket( +- ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) ++ const ldmState_t* ldmState, size_t hash, U32 const bucketSizeLog) + { +- return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); ++ return ldmState->hashTable + (hash << bucketSizeLog); + } + + /* ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ + static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, +- ldmParams_t const ldmParams) ++ U32 const bucketSizeLog) + { + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + +- *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry; +- *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1)); ++ *(ZSTD_ldm_getBucket(ldmState, hash, bucketSizeLog) + offset) = entry; ++ *pOffset = (BYTE)((offset + 1) & ((1u << bucketSizeLog) - 1)); + + } + +@@ -234,7 +249,7 @@ static size_t ZSTD_ldm_countBackwardsMatch_2segments( + * + * The tables for the other strategies are filled within their + * block compressors. */ +-static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, ++static size_t ZSTD_ldm_fillFastTables(ZSTD_MatchState_t* ms, + void const* end) + { + const BYTE* const iend = (const BYTE*)end; +@@ -242,11 +257,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -269,7 +288,8 @@ void ZSTD_ldm_fillHashTable( + const BYTE* iend, ldmParams_t const* params) + { + U32 const minMatchLength = params->minMatchLength; +- U32 const hBits = params->hashLog - params->bucketSizeLog; ++ U32 const bucketSizeLog = params->bucketSizeLog; ++ U32 const hBits = params->hashLog - bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; +@@ -284,7 +304,7 @@ void ZSTD_ldm_fillHashTable( + unsigned n; + + numSplits = 0; +- hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); ++ hashed = ZSTD_ldm_gear_feed(&hashState, ip, (size_t)(iend - ip), splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { +@@ -295,7 +315,7 @@ void ZSTD_ldm_fillHashTable( + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); +- ZSTD_ldm_insertEntry(ldmState, hash, entry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, entry, params->bucketSizeLog); + } + } + +@@ -309,7 +329,7 @@ void ZSTD_ldm_fillHashTable( + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). 
*/ +-static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) ++static void ZSTD_ldm_limitTableUpdate(ZSTD_MatchState_t* ms, const BYTE* anchor) + { + U32 const curr = (U32)(anchor - ms->window.base); + if (curr > ms->nextToUpdate + 1024) { +@@ -318,8 +338,10 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( +- ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( ++ ldmState_t* ldmState, RawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { + /* LDM parameters */ +@@ -373,7 +395,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); +- candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params); ++ candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, params->bucketSizeLog); + PREFETCH_L1(candidates[n].bucket); + } + +@@ -396,7 +418,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -443,7 +465,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -464,7 +486,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + + anchor = split + forwardMatchLength; + +@@ -503,7 +525,7 @@ static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + } + + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldmState, rawSeqStore_t* sequences, ++ ldmState_t* ldmState, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) + { + U32 const maxDist = 1U << params->windowLog; +@@ -549,7 +571,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -580,7 +602,7 @@ size_t ZSTD_ldm_generateSequences( + } + + void +-ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) ++ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) + { + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; +@@ -616,7 +638,7 @@ ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const min + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. 
+ */ +-static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, ++static rawSeq maybeSplitSequence(RawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) + { + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; +@@ -640,7 +662,7 @@ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, + return sequence; + } + +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; +@@ -657,14 +679,14 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { + } + } + +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; +- ZSTD_blockCompressor const blockCompressor = ++ ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; +@@ -689,7 +711,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +723,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +733,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..41400a7191b2 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LDM_H + #define ZSTD_LDM_H + +- + #include "zstd_compress_internal.h" /* ldmParams_t, U32 */ + #include /* ZSTD_CCtx, size_t */ + +@@ -40,7 +40,7 @@ void ZSTD_ldm_fillHashTable( + * sequences. + */ + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldms, rawSeqStore_t* sequences, ++ ldmState_t* ldms, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + + /* +@@ -61,9 +61,9 @@ size_t ZSTD_ldm_generateSequences( + * two. 
We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. + */ +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize); + + /* +@@ -73,7 +73,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, ++void ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + + /* ZSTD_ldm_skipRawSeqStoreBytes(): +@@ -81,7 +81,7 @@ void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, + * Not to be used in conjunction with ZSTD_ldm_skipSequences(). + * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes); ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes); + + /* ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables or 0 if LDM is +@@ -107,5 +107,4 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); + void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +- + #endif /* ZSTD_FAST_H */ +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..b62fd1b0d83e 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,11 +13,14 @@ + #include "hist.h" + #include "zstd_opt.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +30,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s> shift); +- sum += table[s]; ++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics 
*/ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_MatchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,8 +438,10 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( +- const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( ++ const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, + U32 const mls, const int extDict) +@@ -527,15 +559,16 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -548,20 +581,23 @@ void ZSTD_updateTree_internal( + ms->nextToUpdate = target; + } + +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_MatchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -590,7 +626,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + +- const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; ++ const ZSTD_MatchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? 
dms->window.base : NULL; +@@ -629,13 +665,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + assert(curr >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */ +- & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */ +- & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -784,7 +820,7 @@ U32 
ZSTD_insertBtAndGetAllMatches ( + + typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, +- ZSTD_matchState_t*, ++ ZSTD_MatchState_t*, + U32*, + const BYTE*, + const BYTE*, +@@ -792,9 +828,11 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, +@@ -817,7 +855,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( + #define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ +@@ -849,7 +887,7 @@ GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + } + + static ZSTD_getAllMatchesFn +-ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode) ++ZSTD_selectBtGetAllMatches(ZSTD_MatchState_t const* ms, ZSTD_dictMode_e const dictMode) + { + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), +@@ -868,7 +906,7 @@ ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const di + + /* Struct containing info needed to make decision about ldm inclusion */ + typedef struct { +- rawSeqStore_t seqStore; /* External match candidates store for this block */ ++ RawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ +@@ -878,7 +916,7 @@ typedef struct { + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. + */ +-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) ++static void ZSTD_optLdm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) + { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { +@@ -935,7 +973,7 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock + return; + } + +- /* Matches may be < MINMATCH by this process. In that case, we will reject them ++ /* Matches may be < minMatch by this process. In that case, we will reject them + when we are deciding whether or not to add the ldm */ + optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining; + optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining; +@@ -957,25 +995,26 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock + * into 'matches'. Maintains the correct ordering of 'matches'. 
+ */ + static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, +- const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) ++ const ZSTD_optLdm_t* optLdm, U32 currPosInBlock, ++ U32 minMatch) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ + if (currPosInBlock < optLdm->startPosInBlock + || currPosInBlock >= optLdm->endPosInBlock +- || candidateMatchLength < MINMATCH) { ++ || candidateMatchLength < minMatch) { + return; + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -986,7 +1025,8 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + static void + ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, +- U32 currPosInBlock, U32 remainingBytes) ++ U32 currPosInBlock, U32 remainingBytes, ++ U32 minMatch) + { + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + return; +@@ -1003,7 +1043,7 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + } + ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); + } +- ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); ++ ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock, minMatch); + } + + +@@ -1011,11 +1051,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,9 +1068,15 @@ listStats(const U32* table, int lastEltID) + + #endif + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ++ZSTD_compressBlock_opt_generic(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, +@@ -1059,9 +1100,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ 
ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1125,140 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip), ++ minMatch); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. */ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = 
pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%i==rPos:%u", (int)(inr-istart), cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, ++ ZSTD_optimal_t const prevMatch = opt[cur]; ++ DEBUGLOG(7, "cPos:%i==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(Repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%i==rPos:%u : literal would cost more (%.2f>%.2f)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
+ */ +- ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); ++ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(Repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); +- ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); ++ ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(Repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,38 +1268,37 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(inr-istart), (U32)(iend-inr)); ++ (U32)(inr-istart), (U32)(iend-inr), ++ minMatch); + + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%i==rPos:%u, found %u matches, of longest ML=%u", ++ (int)(inr-istart), cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1309,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,55 +1335,89 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ Repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(Repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(Repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } else { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; +- DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", +- anchor - istart, (unsigned)llen, (unsigned)mlen); ++ DEBUGLOG(6, "considering seq starting at %i, llen=%u, mlen=%u", ++ (int)(anchor - istart), (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ +@@ -1308,11 +1426,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,42 +1441,51 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + /* Return the last literals size */ + return
(size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1498,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1378,7 +1508,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + } + + size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); +@@ -1386,16 +1516,16 @@ size_t ZSTD_compressBlock_btultra( + } + + size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. 
++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,42 +1534,47 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..fbdc540ec9d1 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,40 +12,62 @@ + #ifndef ZSTD_OPT_H + #define ZSTD_OPT_H + +- + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + #endif /* ZSTD_OPT_H */ +diff --git 
a/lib/zstd/compress/zstd_preSplit.c b/lib/zstd/compress/zstd_preSplit.c +new file mode 100644 +index 000000000000..7d9403c9a3bc +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.c +@@ -0,0 +1,239 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#include "../common/compiler.h" /* ZSTD_ALIGNOF */ ++#include "../common/mem.h" /* S64 */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/zstd_internal.h" /* ZSTD_STATIC_ASSERT */ ++#include "hist.h" /* HIST_add */ ++#include "zstd_preSplit.h" ++ ++ ++#define BLOCKSIZE_MIN 3500 ++#define THRESHOLD_PENALTY_RATE 16 ++#define THRESHOLD_BASE (THRESHOLD_PENALTY_RATE - 2) ++#define THRESHOLD_PENALTY 3 ++ ++#define HASHLENGTH 2 ++#define HASHLOG_MAX 10 ++#define HASHTABLESIZE (1 << HASHLOG_MAX) ++#define HASHMASK (HASHTABLESIZE - 1) ++#define KNUTH 0x9e3779b9 ++ ++/* for hashLog > 8, hash 2 bytes. ++ * for hashLog == 8, just take the byte, no hashing. ++ * The speed of this method relies on compile-time constant propagation */ ++FORCE_INLINE_TEMPLATE unsigned hash2(const void *p, unsigned hashLog) ++{ ++ assert(hashLog >= 8); ++ if (hashLog == 8) return (U32)((const BYTE*)p)[0]; ++ assert(hashLog <= HASHLOG_MAX); ++ return (U32)(MEM_read16(p)) * KNUTH >> (32 - hashLog); ++} ++ ++ ++typedef struct { ++ unsigned events[HASHTABLESIZE]; ++ size_t nbEvents; ++} Fingerprint; ++typedef struct { ++ Fingerprint pastEvents; ++ Fingerprint newEvents; ++} FPStats; ++ ++static void initStats(FPStats* fpstats) ++{ ++ ZSTD_memset(fpstats, 0, sizeof(FPStats)); ++} ++ ++FORCE_INLINE_TEMPLATE void ++addEvents_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ const char* p = (const char*)src; ++ size_t limit = srcSize - HASHLENGTH + 1; ++ size_t n; ++ assert(srcSize >= HASHLENGTH); ++ for (n = 0; n < limit; n+=samplingRate) { ++ fp->events[hash2(p+n, hashLog)]++; ++ } ++ fp->nbEvents += limit/samplingRate; ++} ++ ++FORCE_INLINE_TEMPLATE void ++recordFingerprint_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ ZSTD_memset(fp, 0, sizeof(unsigned) * ((size_t)1 << hashLog)); ++ fp->nbEvents = 0; ++ addEvents_generic(fp, src, srcSize, samplingRate, hashLog); ++} ++ ++typedef void (*RecordEvents_f)(Fingerprint* fp, const void* src, size_t srcSize); ++ ++#define FP_RECORD(_rate) ZSTD_recordFingerprint_##_rate ++ ++#define ZSTD_GEN_RECORD_FINGERPRINT(_rate, _hSize) \ ++ static void FP_RECORD(_rate)(Fingerprint* fp, const void* src, size_t srcSize) \ ++ { \ ++ recordFingerprint_generic(fp, src, srcSize, _rate, _hSize); \ ++ } ++ ++ZSTD_GEN_RECORD_FINGERPRINT(1, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(5, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(11, 9) ++ZSTD_GEN_RECORD_FINGERPRINT(43, 8) ++ ++ ++static U64 abs64(S64 s64) { return (U64)((s64 < 0) ? 
-s64 : s64); } ++ ++static U64 fpDistance(const Fingerprint* fp1, const Fingerprint* fp2, unsigned hashLog) ++{ ++ U64 distance = 0; ++ size_t n; ++ assert(hashLog <= HASHLOG_MAX); ++ for (n = 0; n < ((size_t)1 << hashLog); n++) { ++ distance += ++ abs64((S64)fp1->events[n] * (S64)fp2->nbEvents - (S64)fp2->events[n] * (S64)fp1->nbEvents); ++ } ++ return distance; ++} ++ ++/* Compare newEvents with pastEvents ++ * return 1 when considered "too different" ++ */ ++static int compareFingerprints(const Fingerprint* ref, ++ const Fingerprint* newfp, ++ int penalty, ++ unsigned hashLog) ++{ ++ assert(ref->nbEvents > 0); ++ assert(newfp->nbEvents > 0); ++ { U64 p50 = (U64)ref->nbEvents * (U64)newfp->nbEvents; ++ U64 deviation = fpDistance(ref, newfp, hashLog); ++ U64 threshold = p50 * (U64)(THRESHOLD_BASE + penalty) / THRESHOLD_PENALTY_RATE; ++ return deviation >= threshold; ++ } ++} ++ ++static void mergeEvents(Fingerprint* acc, const Fingerprint* newfp) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ acc->events[n] += newfp->events[n]; ++ } ++ acc->nbEvents += newfp->nbEvents; ++} ++ ++static void flushEvents(FPStats* fpstats) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ fpstats->pastEvents.events[n] = fpstats->newEvents.events[n]; ++ } ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents; ++ ZSTD_memset(&fpstats->newEvents, 0, sizeof(fpstats->newEvents)); ++} ++ ++static void removeEvents(Fingerprint* acc, const Fingerprint* slice) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ assert(acc->events[n] >= slice->events[n]); ++ acc->events[n] -= slice->events[n]; ++ } ++ acc->nbEvents -= slice->nbEvents; ++} ++ ++#define CHUNKSIZE (8 << 10) ++static size_t ZSTD_splitBlock_byChunks(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ static const RecordEvents_f records_fs[] = { ++ FP_RECORD(43), FP_RECORD(11), FP_RECORD(5), FP_RECORD(1) ++ }; ++ static const unsigned hashParams[] = { 8, 9, 10, 10 }; ++ const RecordEvents_f record_f = (assert(0<=level && level<=3), records_fs[level]); ++ FPStats* const fpstats = (FPStats*)workspace; ++ const char* p = (const char*)blockStart; ++ int penalty = THRESHOLD_PENALTY; ++ size_t pos = 0; ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ record_f(&fpstats->pastEvents, p, CHUNKSIZE); ++ for (pos = CHUNKSIZE; pos <= blockSize - CHUNKSIZE; pos += CHUNKSIZE) { ++ record_f(&fpstats->newEvents, p + pos, CHUNKSIZE); ++ if (compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, penalty, hashParams[level])) { ++ return pos; ++ } else { ++ mergeEvents(&fpstats->pastEvents, &fpstats->newEvents); ++ if (penalty > 0) penalty--; ++ } ++ } ++ assert(pos == blockSize); ++ return blockSize; ++ (void)flushEvents; (void)removeEvents; ++} ++ ++/* ZSTD_splitBlock_fromBorders(): very fast strategy : ++ * compare fingerprint from beginning and end of the block, ++ * derive from their difference if it's preferable to split in the middle, ++ * repeat the process a second time, for finer grained decision. ++ * 3 times did not brought improvements, so I stopped at 2. ++ * Benefits are good enough for a cheap heuristic. ++ * More accurate splitting saves more, but speed impact is also more perceptible. 
++ * For better accuracy, use more elaborate variant *_byChunks. ++ */ ++static size_t ZSTD_splitBlock_fromBorders(const void* blockStart, size_t blockSize, ++ void* workspace, size_t wkspSize) ++{ ++#define SEGMENT_SIZE 512 ++ FPStats* const fpstats = (FPStats*)workspace; ++ Fingerprint* middleEvents = (Fingerprint*)(void*)((char*)workspace + 512 * sizeof(unsigned)); ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ HIST_add(fpstats->pastEvents.events, blockStart, SEGMENT_SIZE); ++ HIST_add(fpstats->newEvents.events, (const char*)blockStart + blockSize - SEGMENT_SIZE, SEGMENT_SIZE); ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents = SEGMENT_SIZE; ++ if (!compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, 0, 8)) ++ return blockSize; ++ ++ HIST_add(middleEvents->events, (const char*)blockStart + blockSize/2 - SEGMENT_SIZE/2, SEGMENT_SIZE); ++ middleEvents->nbEvents = SEGMENT_SIZE; ++ { U64 const distFromBegin = fpDistance(&fpstats->pastEvents, middleEvents, 8); ++ U64 const distFromEnd = fpDistance(&fpstats->newEvents, middleEvents, 8); ++ U64 const minDistance = SEGMENT_SIZE * SEGMENT_SIZE / 3; ++ if (abs64((S64)distFromBegin - (S64)distFromEnd) < minDistance) ++ return 64 KB; ++ return (distFromBegin > distFromEnd) ? 32 KB : 96 KB; ++ } ++} ++ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ DEBUGLOG(6, "ZSTD_splitBlock (level=%i)", level); ++ assert(0<=level && level<=4); ++ if (level == 0) ++ return ZSTD_splitBlock_fromBorders(blockStart, blockSize, workspace, wkspSize); ++ /* level >= 1*/ ++ return ZSTD_splitBlock_byChunks(blockStart, blockSize, level-1, workspace, wkspSize); ++} +diff --git a/lib/zstd/compress/zstd_preSplit.h b/lib/zstd/compress/zstd_preSplit.h +new file mode 100644 +index 000000000000..f98f797fe191 +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.h +@@ -0,0 +1,34 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#ifndef ZSTD_PRESPLIT_H ++#define ZSTD_PRESPLIT_H ++ ++#include /* size_t */ ++ ++#define ZSTD_SLIPBLOCK_WORKSPACESIZE 8208 ++ ++/* ZSTD_splitBlock(): ++ * @level must be a value between 0 and 4. ++ * higher levels spend more energy to detect block boundaries. ++ * @workspace must be aligned for size_t. ++ * @wkspSize must be at least >= ZSTD_SLIPBLOCK_WORKSPACESIZE ++ * note: ++ * For the time being, this function only accepts full 128 KB blocks. ++ * Therefore, @blockSize must be == 128 KB. ++ * While this could be extended to smaller sizes in the future, ++ * it is not yet clear if this would be useful. TBD. 
++ */ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize); ++ ++#endif /* ZSTD_PRESPLIT_H */ +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..ac8b87f48f84 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. 
++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. 
+- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. +- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). 
+- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; w<tableLog+1; ++w) { + int const symbolCount = wksp->rankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize)
return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS ++_out: ++ ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. 
+ */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. */ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- 
return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* 
DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..da8b4cf116e3 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) ++** or an error code, which can be tested using ZSTD_isError() */ ++size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, 
"ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -438,8 +468,10 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); +- zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + zfhPtr->frameType = ZSTD_skippableFrame; ++ zfhPtr->dictID = MEM_readLE32(src) - ZSTD_MAGIC_SKIPPABLE_START; ++ zfhPtr->headerSize = ZSTD_SKIPPABLEHEADERSIZE; ++ zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + return 0; + } + RETURN_ERROR(prefix_unknown, ""); +@@ -508,7 +540,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) ++size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize) + { + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); + } +@@ -520,7 +552,7 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t src + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. 
invalid magic number, srcSize too small) */ + unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) + { +- { ZSTD_frameHeader zfh; ++ { ZSTD_FrameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { +@@ -540,61 +572,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. + */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible 
with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +635,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (U64_MAX - totalDstSize < fcs) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +709,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -693,10 +726,10 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,28 +763,31 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : + * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame or a skippeable frame ++ * `src` must point to the start of a ZSTD frame or a skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the maximum decompressed size of the compressed source + */ +@@ -760,7 +796,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +809,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_FrameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -815,7 +893,7 @@ static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, + return regenSize; + } + +-static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming) ++static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, int streaming) + { + (void)dctx; + (void)uncompressedSize; +@@ -856,6 +934,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +970,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ +@@ -901,12 +984,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1015,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %i, consuming %i bytes of input", (int)(op-ostart), (int)(ip - (const BYTE*)*srcPtr)); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1043,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ 
FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1150,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1270,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1340,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1353,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1394,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1494,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, 
repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1557,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1566,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_FrameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1673,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1683,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1694,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1766,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1815,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1854,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case 
ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1873,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1890,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. 
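A minimal caller-side sketch of the two decompression parameters wired up above (ZSTD_d_maxBlockSize and ZSTD_d_disableHuffmanAssembly). It is not part of the patch: the driver function, the 64 KiB target and the omitted error handling are illustrative assumptions; only the ZSTD_* identifiers are taken from the patched lib/zstd sources, which are assumed to be visible to the caller.

    /* Clamp the decoder's per-block budget and force the portable Huffman path. */
    static void demo_tune_dctx(ZSTD_DCtx *dctx)
    {
        ZSTD_bounds const b = ZSTD_dParam_getBounds(ZSTD_d_maxBlockSize);
        int blockLimit = 64 * 1024;                    /* hypothetical 64 KiB target */

        if (!ZSTD_isError(b.error)) {
            if (blockLimit < b.lowerBound) blockLimit = b.lowerBound;
            if (blockLimit > b.upperBound) blockLimit = b.upperBound;
        }
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, blockLimit);          /* 0 would mean "no limit" */
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_disableHuffmanAssembly, 1);
    }

Setting ZSTD_d_maxBlockSize also shrinks the streaming output buffer, since ZSTD_decompressStream() below clamps fParams.blockSizeMax before sizing its buffers.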
++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1908,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1793,7 +1924,7 @@ size_t ZSTD_estimateDStreamSize(size_t windowSize) + size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) + { + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); +@@ -1888,6 +2019,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); ++ assert(zds != NULL); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, +@@ -1918,7 +2050,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2063,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2079,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? 
op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2106,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2123,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2174,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2189,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2198,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2212,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? 
op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2236,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2249,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2287,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..710eb0ffd5a3 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
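A usage sketch for ZSTD_decompressStream_simpleArgs(), rewritten above; not part of the patch. The loop shape, buffer names and the "decompress everything" policy are assumptions, while the function, ZSTD_isError() and the new noForwardProgress_* error codes come from the patched sources.

    /* Feed one input buffer through the decoder; returns bytes written or an error code. */
    static size_t demo_decompress_all(ZSTD_DCtx *dctx,
                                      void *dst, size_t dstCapacity,
                                      const void *src, size_t srcSize)
    {
        size_t dstPos = 0, srcPos = 0;

        while (srcPos < srcSize) {
            size_t const ret = ZSTD_decompressStream_simpleArgs(dctx,
                                    dst, dstCapacity, &dstPos,
                                    src, srcSize, &srcPos);
            if (ZSTD_isError(ret))
                return ret;   /* includes noForwardProgress_destFull / noForwardProgress_inputEmpty */
            /* ret == 0 means a frame just completed; keep looping for concatenated frames */
        }
        return dstPos;
    }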
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. 
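As an illustration of the literals-placement policy described in the comments above, here is a standalone decision sketch; it is not part of the patch. The enum, the function and the numeric stand-ins (64 KiB for ZSTD_LITBUFFEREXTRASIZE, 32 for WILDCOPY_OVERLENGTH) are assumptions used only to show the three branches of ZSTD_allocateLiteralsBuffer().

    enum demo_lit_location { LIT_AFTER_BLOCK, LIT_EXTRA_BUFFER, LIT_SPLIT };

    static enum demo_lit_location
    demo_pick_lit_location(size_t dstCapacity, size_t blockSizeMax, size_t litSize, int streaming)
    {
        size_t const extraSize  = 64 * 1024;   /* stand-in for ZSTD_LITBUFFEREXTRASIZE */
        size_t const overlength = 32;          /* stand-in for WILDCOPY_OVERLENGTH */

        if (!streaming && dstCapacity > blockSizeMax + overlength + litSize + overlength)
            return LIT_AFTER_BLOCK;    /* no extDict to protect: park literals past the block in dst */
        if (litSize <= extraSize)
            return LIT_EXTRA_BUFFER;   /* small enough for the fixed side buffer, no split needed */
        return LIT_SPLIT;              /* tail goes to the side buffer, head to the end of the block */
    }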
++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -124,7 +140,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); + + { const BYTE* const istart = (const BYTE*) src; +- symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -317,7 +359,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * - start from default distributions, present in /lib/common/zstd_internal.h + * - generate tables normally, using ZSTD_buildFSETable() + * - printout the content of tables +- * - pretify output, report below, test with fuzzer to ensure it's correct */ ++ * - prettify output, report below, test with fuzzer to ensure it's correct */ + + /* Default FSE distribution table for Literal Lengths */ + static const ZSTD_seqSymbol LL_defaultDTable[(1<=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. 
+- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,11 +719,19 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ +- { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); +- symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); +- symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ ++ { SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6); ++ SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3); ++ SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
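A simplified sketch of the two-stage table spread described above, covering only the fast path where every normalized count is non-negative and the counts sum to the table size; it is not part of the patch. The function, its byte-sized table cells and the caller-provided scratch buffer are assumptions, while the stride matches FSE_TABLESTEP().

    /* Stage 1 lays each symbol out contiguously in a scratch array; stage 2 scatters that
     * array into the table with a fixed odd stride, so the inner loop has no
     * data-dependent trip count (and therefore far fewer branch misses). */
    static void demo_two_stage_spread(unsigned char *table, unsigned tableLog,
                                      const short *normalizedCounter, unsigned maxSymbol,
                                      unsigned char *scratch /* tableSize bytes */)
    {
        size_t const tableSize = (size_t)1 << tableLog;
        size_t const tableMask = tableSize - 1;
        size_t const step = (tableSize >> 1) + (tableSize >> 3) + 3;  /* FSE_TABLESTEP() */
        size_t pos = 0, position = 0, i;
        unsigned s;

        for (s = 0; s <= maxSymbol; s++)            /* stage 1: one contiguous run per symbol */
            for (i = 0; i < (size_t)normalizedCounter[s]; i++)
                scratch[pos++] = (unsigned char)s;

        for (i = 0; i < tableSize; i++) {           /* stage 2: scatter with a fixed stride */
            table[position] = scratch[i];
            position = (position + step) & tableMask;
        }
    }

Because the stride is odd and the table size is a power of two, stage 2 visits every cell exactly once before returning to position 0, mirroring the assert in the real code.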
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
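A standalone sketch of the aarch64 workaround explained in the ZSTD_decodeSequence() comment below; not part of the patch. The demo type mirrors the ZSTD_seqSymbol layout (U16, BYTE, BYTE, U32), and the claim about the generated code is a typical outcome, not a guarantee.

    #include <stdint.h>
    #include <string.h>

    typedef struct {
        uint16_t nextState;
        uint8_t  nbAdditionalBits;
        uint8_t  nbBits;
        uint32_t baseValue;
    } demo_seqSymbol;                    /* 8 bytes, like ZSTD_seqSymbol */

    static demo_seqSymbol demo_load_entry(const demo_seqSymbol *table, size_t state)
    {
        demo_seqSymbol local;
        /* Copying the whole entry typically lets GCC emit one 64-bit load instead of
         * separate ldrh/ldrb/ldr accesses to the packed fields. */
        memcpy(&local, table + state, sizeof(local));
        return local;
    }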
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; +@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t 
const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- 
const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. ++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + 
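A schematic sketch of the software pipeline used by the "Long" variant above: decode a few sequences ahead of execution so each match can be prefetched before it is copied. It is not part of the patch; the ring size, the advance distance and the callback shape are illustrative assumptions (the real code uses its STORED_SEQS/ADVANCED_SEQS constants and also manages the split literal buffer, which this sketch omits).

    #include <stddef.h>

    #define DEMO_STORED   8                      /* ring size; a power of two */
    #define DEMO_ADVANCE  (DEMO_STORED - 1)      /* how far decode runs ahead of execute */
    #define DEMO_MASK     (DEMO_STORED - 1)

    typedef struct { size_t litLength, matchLength, offset; } demo_seq;
    typedef demo_seq (*demo_decode_fn)(void *state);
    typedef void     (*demo_prefetch_fn)(demo_seq s);
    typedef size_t   (*demo_execute_fn)(demo_seq s);

    static size_t demo_pipelined_decode(void *state, int nbSeq,
                                        demo_decode_fn decode,
                                        demo_prefetch_fn prefetch,
                                        demo_execute_fn execute)
    {
        demo_seq ring[DEMO_STORED];
        int const advance = nbSeq < DEMO_ADVANCE ? nbSeq : DEMO_ADVANCE;
        int seqNb;
        size_t written = 0;

        for (seqNb = 0; seqNb < advance; seqNb++) {      /* prime: decode + prefetch only */
            ring[seqNb] = decode(state);
            prefetch(ring[seqNb]);
        }
        for ( ; seqNb < nbSeq; seqNb++) {                /* steady state: one in, one out */
            demo_seq const next = decode(state);
            prefetch(next);
            written += execute(ring[(seqNb - advance) & DEMO_MASK]);
            ring[seqNb & DEMO_MASK] = next;
        }
        for ( ; seqNb < nbSeq + advance; seqNb++)        /* drain the ring */
            written += execute(ring[(seqNb - advance) & DEMO_MASK]);
        return written;
    }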
+ static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,50 +1928,40 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + #endif /* DYNAMIC_BMI2 */ + +-typedef size_t (*ZSTD_decompressSequences_t)( +- ZSTD_DCtx* dctx, +- void* dst, size_t maxDstSize, +- const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); +- + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const 
ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1976,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. 
++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); +- +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); ++ ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2091,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2115,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2127,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2187,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..becffbd89364 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..2a225d1811c4 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -135,7 +137,7 @@ struct ZSTD_DCtx_s + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ + size_t expected; +- ZSTD_frameHeader fParams; ++ ZSTD_FrameHeader fParams; + U64 processedCSize; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ +@@ -152,7 +154,8 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; +-#if DYNAMIC_BMI2 != 0 ++ int isFrameDecompression; ++#if DYNAMIC_BMI2 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif + +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. 
Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +@@ -199,11 +204,11 @@ struct ZSTD_DCtx_s + }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + + MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +-#if DYNAMIC_BMI2 != 0 +- return dctx->bmi2; ++#if DYNAMIC_BMI2 ++ return dctx->bmi2; + #else + (void)dctx; +- return 0; ++ return 0; + #endif + } + +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index bd8784449b31..7651b53551c8 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,6 +16,7 @@ + + #include "common/zstd_deps.h" + #include "common/zstd_internal.h" ++#include "compress/zstd_compress_internal.h" + + #define ZSTD_FORWARD_IF_ERR(ret) \ + do { \ +@@ -92,12 +93,64 @@ zstd_compression_parameters zstd_get_cparams(int level, + } + EXPORT_SYMBOL(zstd_get_cparams); + ++size_t zstd_cctx_set_param(zstd_cctx *cctx, ZSTD_cParameter param, int value) ++{ ++ return ZSTD_CCtx_setParameter(cctx, param, value); ++} ++EXPORT_SYMBOL(zstd_cctx_set_param); ++ + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams) + { + return ZSTD_estimateCCtxSize_usingCParams(*cparams); + } + EXPORT_SYMBOL(zstd_cctx_workspace_bound); + ++// Used by zstd_cctx_workspace_bound_with_ext_seq_prod() ++static size_t dummy_external_sequence_producer( ++ void *sequenceProducerState, ++ ZSTD_Sequence *outSeqs, size_t outSeqsCapacity, ++ const void *src, size_t srcSize, ++ const void *dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize) ++{ ++ (void)sequenceProducerState; ++ (void)outSeqs; (void)outSeqsCapacity; ++ (void)src; (void)srcSize; ++ (void)dict; (void)dictSize; ++ (void)compressionLevel; ++ (void)windowSize; ++ return ZSTD_SEQUENCE_PRODUCER_ERROR; ++} ++ ++static void init_cctx_params_from_compress_params( ++ ZSTD_CCtx_params *cctx_params, ++ const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_parameters zstd_params; ++ memset(&zstd_params, 0, sizeof(zstd_params)); ++ zstd_params.cParams = *compress_params; ++ ZSTD_CCtxParams_init_advanced(cctx_params, zstd_params); ++} ++ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCCtxSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cctx_workspace_bound_with_ext_seq_prod); ++ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCStreamSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cstream_workspace_bound_with_ext_seq_prod); ++ + zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) + { + if (workspace == NULL) +@@ -209,5 +262,25 @@ size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output) + } + EXPORT_SYMBOL(zstd_end_stream); + ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++) { ++ ZSTD_registerSequenceProducer(cctx, sequence_producer_state, sequence_producer); ++} ++EXPORT_SYMBOL(zstd_register_sequence_producer); ++ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size) ++{ ++ return ZSTD_compressSequencesAndLiterals(cctx, dst, dst_capacity, in_seqs, ++ in_seqs_size, literals, lit_size, ++ lit_capacity, decompressed_size); ++} ++EXPORT_SYMBOL(zstd_compress_sequences_and_literals); ++ + MODULE_LICENSE("Dual BSD/GPL"); + 
MODULE_DESCRIPTION("Zstd Compressor"); +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index 469fc3059be0..0ae819f0c927 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -113,7 +113,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.49.0.634.g8613c2bb6c + diff --git a/sys-kernel/gentoo-sources-6.14/gentoo-sources-6.15 b/sys-kernel/gentoo-sources-6.14/gentoo-sources-6.15 new file mode 120000 index 0000000..9c73995 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/gentoo-sources-6.15 @@ -0,0 +1 @@ +gentoo-sources-6.15 \ No newline at end of file diff --git a/sys-kernel/gentoo-sources-6.15/0001-amd-pstate.patch.skip b/sys-kernel/gentoo-sources-6.15/0001-amd-pstate.patch.skip new file mode 100644 index 0000000..2a4aa7f --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/0001-amd-pstate.patch.skip @@ -0,0 +1,402 @@ +From 93b3c85030525027181d7ae26378331eeea06a29 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 4 Jun 2025 16:40:31 +0200 +Subject: [PATCH 1/8] amd-pstate + +Signed-off-by: Peter Jung +--- + drivers/cpufreq/amd-pstate.c | 111 +++++++++++++++++++++++++-------- + drivers/cpufreq/amd-pstate.h | 2 + + include/linux/sched/topology.h | 6 ++ + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 5 +- + kernel/sched/topology.c | 58 +++++++++++++++++ + 6 files changed, 158 insertions(+), 28 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index b961f3a3b580..12331e127d96 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -389,7 +389,8 @@ static inline int amd_pstate_cppc_enable(struct cpufreq_policy *policy) + static int msr_init_perf(struct amd_cpudata *cpudata) + { + union perf_cached perf = READ_ONCE(cpudata->perf); +- u64 cap1, numerator; ++ u64 cap1, numerator, cppc_req; ++ u8 min_perf; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); +@@ -400,6 +401,22 @@ static int msr_init_perf(struct amd_cpudata *cpudata) + if (ret) + return ret; + ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req); ++ if (ret) ++ return ret; ++ ++ WRITE_ONCE(cpudata->cppc_req_cached, cppc_req); ++ min_perf = FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cppc_req); ++ ++ /* ++ * Clear out the min_perf part to check if the rest of the MSR is 0, if yes, this is an ++ * indication that the min_perf value is the one specified through the BIOS option ++ */ ++ cppc_req &= ~(AMD_CPPC_MIN_PERF_MASK); ++ ++ if (!cppc_req) ++ perf.bios_min_perf = min_perf; ++ + perf.highest_perf = numerator; + perf.max_limit_perf = numerator; + perf.min_limit_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1); +@@ -554,6 +571,10 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, + if (!policy) + return; + ++ /* limit the max perf when core performance boost feature is disabled */ ++ if (!cpudata->boost_supported) ++ max_perf = min_t(u8, perf.nominal_perf, max_perf); ++ + des_perf = clamp_t(u8, des_perf, min_perf, max_perf); + + policy->cur = perf_to_freq(perf, 
cpudata->nominal_freq, des_perf); +@@ -563,10 +584,6 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, + des_perf = 0; + } + +- /* limit the max perf when core performance boost feature is disabled */ +- if (!cpudata->boost_supported) +- max_perf = min_t(u8, perf.nominal_perf, max_perf); +- + if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { + trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, + cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc, +@@ -580,20 +597,26 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) + { + /* + * Initialize lower frequency limit (i.e.policy->min) with +- * lowest_nonlinear_frequency which is the most energy efficient +- * frequency. Override the initial value set by cpufreq core and +- * amd-pstate qos_requests. ++ * lowest_nonlinear_frequency or the min frequency (if) specified in BIOS, ++ * Override the initial value set by cpufreq core and amd-pstate qos_requests. + */ + if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { + struct cpufreq_policy *policy __free(put_cpufreq_policy) = + cpufreq_cpu_get(policy_data->cpu); + struct amd_cpudata *cpudata; ++ union perf_cached perf; + + if (!policy) + return -EINVAL; + + cpudata = policy->driver_data; +- policy_data->min = cpudata->lowest_nonlinear_freq; ++ perf = READ_ONCE(cpudata->perf); ++ ++ if (perf.bios_min_perf) ++ policy_data->min = perf_to_freq(perf, cpudata->nominal_freq, ++ perf.bios_min_perf); ++ else ++ policy_data->min = cpudata->lowest_nonlinear_freq; + } + + cpufreq_verify_within_cpu_limits(policy_data); +@@ -831,8 +854,10 @@ static void amd_pstate_update_limits(unsigned int cpu) + if (highest_perf_changed) { + WRITE_ONCE(cpudata->prefcore_ranking, cur_high); + +- if (cur_high < CPPC_MAX_PERF) ++ if (cur_high < CPPC_MAX_PERF) { + sched_set_itmt_core_prio((int)cur_high, cpu); ++ sched_update_asym_prefer_cpu(cpu, prev_high, cur_high); ++ } + } + } + +@@ -1024,6 +1049,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ ++ /* Reset CPPC_REQ MSR to the BIOS value */ ++ amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + + freq_qos_remove_request(&cpudata->req[1]); + freq_qos_remove_request(&cpudata->req[0]); +@@ -1419,7 +1448,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + struct amd_cpudata *cpudata; + union perf_cached perf; + struct device *dev; +- u64 value; + int ret; + + /* +@@ -1484,12 +1512,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE; + } + +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); +- if (ret) +- return ret; +- WRITE_ONCE(cpudata->cppc_req_cached, value); +- } + ret = amd_pstate_set_epp(policy, cpudata->epp_default); + if (ret) + return ret; +@@ -1509,6 +1531,11 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata) { ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ ++ /* Reset CPPC_REQ MSR to the BIOS value */ ++ amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); ++ + kfree(cpudata); + policy->driver_data = NULL; + } +@@ -1559,21 +1586,38 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy 
*policy) + return 0; + } + +-static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) ++static int amd_pstate_cpu_online(struct cpufreq_policy *policy) + { +- pr_debug("AMD CPU Core %d going online\n", policy->cpu); +- + return amd_pstate_cppc_enable(policy); + } + +-static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) ++static int amd_pstate_cpu_offline(struct cpufreq_policy *policy) + { +- return 0; ++ struct amd_cpudata *cpudata = policy->driver_data; ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ ++ /* ++ * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified ++ * min_perf value across kexec reboots. If this CPU is just onlined normally after this, the ++ * limits, epp and desired perf will get reset to the cached values in cpudata struct ++ */ ++ return amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + } + +-static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) ++static int amd_pstate_suspend(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ int ret; ++ ++ /* ++ * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified ++ * min_perf value across kexec reboots. If this CPU is just resumed back without kexec, ++ * the limits, epp and desired perf will get reset to the cached values in cpudata struct ++ */ ++ ret = amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); ++ if (ret) ++ return ret; + + /* invalidate to ensure it's rewritten during resume */ + cpudata->cppc_req_cached = 0; +@@ -1584,6 +1628,17 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + return 0; + } + ++static int amd_pstate_resume(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ int cur_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->cur); ++ ++ /* Set CPPC_REQ to last sane value until the governor updates it */ ++ return amd_pstate_update_perf(policy, perf.min_limit_perf, cur_perf, perf.max_limit_perf, ++ 0U, false); ++} ++ + static int amd_pstate_epp_resume(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +@@ -1609,6 +1664,10 @@ static struct cpufreq_driver amd_pstate_driver = { + .fast_switch = amd_pstate_fast_switch, + .init = amd_pstate_cpu_init, + .exit = amd_pstate_cpu_exit, ++ .online = amd_pstate_cpu_online, ++ .offline = amd_pstate_cpu_offline, ++ .suspend = amd_pstate_suspend, ++ .resume = amd_pstate_resume, + .set_boost = amd_pstate_set_boost, + .update_limits = amd_pstate_update_limits, + .name = "amd-pstate", +@@ -1621,9 +1680,9 @@ static struct cpufreq_driver amd_pstate_epp_driver = { + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, +- .offline = amd_pstate_epp_cpu_offline, +- .online = amd_pstate_epp_cpu_online, +- .suspend = amd_pstate_epp_suspend, ++ .offline = amd_pstate_cpu_offline, ++ .online = amd_pstate_cpu_online, ++ .suspend = amd_pstate_suspend, + .resume = amd_pstate_epp_resume, + .update_limits = amd_pstate_update_limits, + .set_boost = amd_pstate_set_boost, +diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h +index fbe1c08d3f06..2f7ae364d331 100644 +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -30,6 +30,7 @@ + * @lowest_perf: the absolute lowest performance level of the 
processor + * @min_limit_perf: Cached value of the performance corresponding to policy->min + * @max_limit_perf: Cached value of the performance corresponding to policy->max ++ * @bios_min_perf: Cached perf value corresponding to the "Requested CPU Min Frequency" BIOS option + */ + union perf_cached { + struct { +@@ -39,6 +40,7 @@ union perf_cached { + u8 lowest_perf; + u8 min_limit_perf; + u8 max_limit_perf; ++ u8 bios_min_perf; + }; + u64 val; + }; +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 7b4301b7235f..198bb5cc1774 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -195,6 +195,8 @@ struct sched_domain_topology_level { + }; + + extern void __init set_sched_topology(struct sched_domain_topology_level *tl); ++extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); ++ + + # define SD_INIT_NAME(type) .name = #type + +@@ -223,6 +225,10 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) + return true; + } + ++static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) ++{ ++} ++ + #endif /* !CONFIG_SMP */ + + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 56ae54e0ce6a..557246880a7e 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -588,6 +588,10 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) + debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); + debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); + debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); ++ ++ if (sd->flags & SD_ASYM_PACKING) ++ debugfs_create_u32("group_asym_prefer_cpu", 0444, parent, ++ (u32 *)&sd->groups->asym_prefer_cpu); + } + + void update_sched_domain_debugfs(void) +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0fb9bf995a47..8d0f462e8c8b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10256,7 +10256,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group + (sgs->group_weight - sgs->idle_cpus != 1)) + return false; + +- return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); ++ return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu)); + } + + /* One group has more than one SMT CPU while the other group does not */ +@@ -10493,7 +10493,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, + + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ +- return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); ++ return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu), ++ READ_ONCE(sg->asym_prefer_cpu)); + + case group_misfit_task: + /* +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index f1ebc60d967f..8426de317835 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1333,6 +1333,64 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + update_group_capacity(sd, cpu); + } + ++#ifdef CONFIG_SMP ++ ++/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. 
*/ ++void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) ++{ ++ int asym_prefer_cpu = cpu; ++ struct sched_domain *sd; ++ ++ guard(rcu)(); ++ ++ for_each_domain(cpu, sd) { ++ struct sched_group *sg; ++ int group_cpu; ++ ++ if (!(sd->flags & SD_ASYM_PACKING)) ++ continue; ++ ++ /* ++ * Groups of overlapping domain are replicated per NUMA ++ * node and will require updating "asym_prefer_cpu" on ++ * each local copy. ++ * ++ * If you are hitting this warning, consider moving ++ * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" ++ * which is shared by all the overlapping groups. ++ */ ++ WARN_ON_ONCE(sd->flags & SD_OVERLAP); ++ ++ sg = sd->groups; ++ if (cpu != sg->asym_prefer_cpu) { ++ /* ++ * Since the parent is a superset of the current group, ++ * if the cpu is not the "asym_prefer_cpu" at the ++ * current level, it cannot be the preferred CPU at a ++ * higher levels either. ++ */ ++ if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) ++ return; ++ ++ WRITE_ONCE(sg->asym_prefer_cpu, cpu); ++ continue; ++ } ++ ++ /* Ranking has improved; CPU is still the preferred one. */ ++ if (new_prio >= old_prio) ++ continue; ++ ++ for_each_cpu(group_cpu, sched_group_span(sg)) { ++ if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) ++ asym_prefer_cpu = group_cpu; ++ } ++ ++ WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); ++ } ++} ++ ++#endif /* CONFIG_SMP */ ++ + /* + * Set of available CPUs grouped by their corresponding capacities + * Each list entry contains a CPU mask reflecting CPUs that share the same +-- +2.50.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.15/0004-bbr3.patch b/sys-kernel/gentoo-sources-6.15/0004-bbr3.patch new file mode 100644 index 0000000..4a0e492 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/0004-bbr3.patch @@ -0,0 +1,3404 @@ +From 103efa50b54199447f56196e0b1b2f6d13db2a54 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 4 Jun 2025 16:41:07 +0200 +Subject: [PATCH 4/8] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 73 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2232 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 4 +- + 16 files changed, 1941 insertions(+), 555 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 1669d95bb0f9..951a5ed55a27 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -248,7 +248,8 @@ struct tcp_sock { + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); + #endif + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ +@@ -305,7 +306,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? 
*/ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index 1735db332aab..2c4a94af7093 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -132,8 +132,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 4450c384ef17..61f73ca30be3 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -379,11 +379,14 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + #define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + ++ + static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) + { + return tp->ecn_flags & TCP_ECN_MODE_ANY; +@@ -840,6 +843,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -945,6 +957,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1043,9 +1060,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1158,6 +1180,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1180,7 +1203,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED BIT(0) + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN BIT(1) +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(2) ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1200,10 +1227,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1214,7 +1244,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1238,8 +1270,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1305,6 +1340,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1324,6 +1367,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1336,6 +1380,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2489,7 +2548,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dc8fdc80e16b..6b2003dbae81 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -184,6 +184,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN enabled at conn init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 6d2c97f8e9ef..ddc116ef22cb 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 6edc441b3702..bf52c5744acf 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3411,6 +3411,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4158,6 +4159,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..066da5e5747c 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,122 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return (tcp_ecn_mode_any(tp)) && (tp->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +383,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +410,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +434,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +457,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +474,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +535,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +548,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +580,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +600,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +671,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +682,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +711,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +740,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +796,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +804,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +850,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +859,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +887,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +924,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +947,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +972,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2361,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2398,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index a35018e2d0ba..b849d76b24da 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -381,7 +381,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -392,7 +392,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1139,7 +1139,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1511,6 +1516,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3848,7 +3864,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3865,6 +3882,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3875,6 +3893,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3994,6 +4017,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4059,7 +4083,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4083,6 +4107,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4103,7 +4128,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5782,13 +5807,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index fb9349be36b8..3c53e39f8201 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -472,6 +472,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 13295a59d22e..3effb6e51e96 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1614,7 +1617,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1689,6 +1692,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2045,13 +2072,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2777,6 +2803,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2989,6 +3016,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index e4c616bbd727..e4a7a25d667d 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -565,7 +565,7 @@ void tcp_retransmit_timer(struct sock *sk) + struct inet_sock *inet = inet_sk(sk); + u32 rtx_delta; + +- rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: ++ rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: + tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); + if (tp->tcp_usec_ts) + rtx_delta /= USEC_PER_MSEC; +@@ -702,6 +702,8 @@ void tcp_write_timer_handler(struct sock *sk) + icsk_timeout(icsk)); + return; + } ++ ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.50.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.15/0005-block.patch.skip b/sys-kernel/gentoo-sources-6.15/0005-block.patch.skip new file mode 100644 index 0000000..2b076d6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/0005-block.patch.skip @@ -0,0 +1,288 @@ +From 4ef24b41f1c812f829943ac1b0f2f245cee2eba8 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 4 Jun 2025 16:41:18 +0200 +Subject: [PATCH 5/8] block + +Signed-off-by: Peter Jung +--- + block/bfq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++------ + block/bfq-iosched.h | 12 +++++++++-- + block/mq-deadline.c | 48 +++++++++++++++++++++++++++++++++++------ + 3 files changed, 96 insertions(+), 16 deletions(-) + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index abd80dc13562..cd06c79c4e92 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) + return icq; + } + ++static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) ++{ ++ if (!current->io_context) ++ return NULL; ++ if 
(spin_trylock_irq(&q->queue_lock)) { ++ struct bfq_io_cq *icq; ++ ++ icq = icq_to_bic(ioc_lookup_icq(q)); ++ spin_unlock_irq(&q->queue_lock); ++ return icq; ++ } ++ ++ return NULL; ++} ++ + /* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. +@@ -2465,10 +2480,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + * returned by bfq_bic_lookup does not go away before + * bfqd->lock is taken. + */ +- struct bfq_io_cq *bic = bfq_bic_lookup(q); ++ struct bfq_io_cq *bic = bfq_bic_try_lookup(q); + bool ret; + +- spin_lock_irq(&bfqd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path is a lot slimmer, so skipping an ++ * occassional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock_irq(&bfqd->lock)) ++ return false; + + if (bic) { + /* +@@ -5317,6 +5343,18 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled = false; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. ++ */ ++ if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || ++ test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) ++ return NULL; ++ + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; +@@ -5328,6 +5366,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } + ++ clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); + spin_unlock_irq(&bfqd->lock); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, +@@ -6250,10 +6289,9 @@ static inline void bfq_update_insert_stats(struct request_queue *q, + + static struct bfq_queue *bfq_init_rq(struct request *rq); + +-static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void bfq_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags) + { +- struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + bool idle_timer_disabled = false; +@@ -6315,7 +6353,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- bfq_insert_request(hctx, rq, flags); ++ bfq_insert_request(hctx->queue, rq, flags); + } + } + +@@ -7254,6 +7292,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + q->elevator = eq; + spin_unlock_irq(&q->queue_lock); + ++ spin_lock_init(&bfqd->lock); ++ + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
+ * Grab a permanent reference to it, so that the normal code flow +@@ -7371,8 +7411,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + +- spin_lock_init(&bfqd->lock); +- + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls +diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h +index 687a3a7ba784..8589b58af79f 100644 +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -504,12 +504,22 @@ struct bfq_io_cq { + unsigned int requests; /* Number of requests this process has in flight */ + }; + ++enum { ++ BFQ_DISPATCHING = 0, ++}; ++ + /** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by @lock. + */ + struct bfq_data { ++ struct { ++ spinlock_t lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; ++ + /* device request queue */ + struct request_queue *queue; + /* dispatch queue */ +@@ -795,8 +805,6 @@ struct bfq_data { + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + +- spinlock_t lock; +- + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to +diff --git a/block/mq-deadline.c b/block/mq-deadline.c +index 754f6b7415cd..a5fa8f86178d 100644 +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -79,10 +79,20 @@ struct dd_per_prio { + struct io_stats_per_prio stats; + }; + ++enum { ++ DD_DISPATCHING = 0, ++}; ++ + struct deadline_data { + /* + * run time data + */ ++ struct { ++ spinlock_t lock; ++ spinlock_t zone_lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; + + struct dd_per_prio per_prio[DD_PRIO_COUNT]; + +@@ -100,8 +110,6 @@ struct deadline_data { + int front_merges; + u32 async_depth; + int prio_aging_expire; +- +- spinlock_t lock; + }; + + /* Maps an I/O priority class to a deadline scheduler priority. */ +@@ -466,6 +474,18 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct request *rq; + enum dd_prio prio; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. 
++ */ ++ if (test_bit(DD_DISPATCHING, &dd->run_state) || ++ test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state)) ++ return NULL; ++ + spin_lock(&dd->lock); + rq = dd_dispatch_prio_aged_requests(dd, now); + if (rq) +@@ -482,6 +502,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + } + + unlock: ++ clear_bit_unlock(DD_DISPATCHING, &dd->run_state); + spin_unlock(&dd->lock); + + return rq; +@@ -585,6 +606,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) + + eq->elevator_data = dd; + ++ spin_lock_init(&dd->lock); ++ spin_lock_init(&dd->zone_lock); ++ + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + +@@ -601,7 +625,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) + dd->last_dir = DD_WRITE; + dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; +- spin_lock_init(&dd->lock); + + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); +@@ -657,7 +680,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + struct request *free = NULL; + bool ret; + +- spin_lock(&dd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path is a lot slimmer, so skipping an ++ * occassional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock(&dd->lock)) ++ return false; ++ + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + +@@ -670,10 +705,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + /* + * add rq to rbtree and fifo + */ +-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void dd_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free) + { +- struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); +@@ -731,7 +765,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- dd_insert_request(hctx, rq, flags, &free); ++ dd_insert_request(q, rq, flags, &free); + } + spin_unlock(&dd->lock); + +-- +2.50.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-01-20-sched-Cache-aware-load-balancing.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-01-20-sched-Cache-aware-load-balancing.patch new file mode 100644 index 0000000..4350db3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-01-20-sched-Cache-aware-load-balancing.patch @@ -0,0 +1,803 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0EE3F2F4A0C + for ; Wed, 18 Jun 2025 18:21:36 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + 
t=1750270898; cv=none; b=i6UpioJgMk5GxDDiJNU6ym/ql7fYtIxc3m0laytI789opI8LDjTvyDtwqrIQyQ1c4ZCnekjBz/wO4Aujx1CK9ipZtczqav2p9tw1Hd3Voibb0lwiXLdi8v6PAAo2cAsX9FlCAQdMHkE7TrPq3hrfK8cTUOewsHi11k8otsoUoF8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270898; c=relaxed/simple; + bh=TYBL04yFFQ2m3vahwqxlPWkj0L0Itx8xkCpecBkZyTo=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=NvFXEMhOMy2ly3nACqhZx/B7xBuxxjLzhrCAPQhizdE6auJ2An7O1CCizePDbPUt3x9R8OXvvONlUeFHXfcGpYcXF9IC+2ogKFFrxsqx07VvjHSs8Ud1keSWy0e/3m5mCyNcyiv9/x9sdp1mHOJsM6ZH+h1t1P9HV+a9nVVwfeU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=m8zYzaEs; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="m8zYzaEs" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270896; x=1781806896; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=TYBL04yFFQ2m3vahwqxlPWkj0L0Itx8xkCpecBkZyTo=; + b=m8zYzaEsszmRk8d10p09dWuoM0QHBZbGZnY8ZU7YQ5oc73eP1UDpRePw + Iu/jPTNT8uBPQfAA79j/b4tbxQImw3Vm2EFuhqgbCGzn8WuO4p6CfGONf + nNIOGop6F+y60hE6rmyGD7GqoufIf/Xz+S8d55r5HVm1AioodkMfJsCMq + p01kqQ4AItdgoMkEnocPNDlyafzx3MsZtFHCCqR6F929sYF5LFax3HY4G + ozjUU62bKBmlkBn1eho1JY0ZSLQPetm2LAIQE4QynDS4MQ7bq112tEsWH + BJ/IXkd445+qfY1nWMwW+JO0pyelws29KEOXY7BvPQVHFy5P4DCp3UAI/ + w==; +X-CSE-ConnectionGUID: 97R+yvzpS6K/KMRnntFzgg== +X-CSE-MsgGUID: WI5jQwWESQSCLf6+tddXlg== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931324" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931324" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:35 -0700 +X-CSE-ConnectionGUID: 3Wla1slbSPWqbG0bkSSkVA== +X-CSE-MsgGUID: c3ximr2GR+C5Bp6cvb58uA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959507" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:34 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 01/20] sched: Cache aware load-balancing +Date: Wed, 18 Jun 2025 11:27:49 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Peter Zijlstra + +Hi all, + +One of the many things on the eternal todo list has been finishing the +below hackery. 
+ +It is an attempt at modelling cache affinity -- and while the patch +really only targets LLC, it could very well be extended to also apply to +clusters (L2). Specifically any case of multiple cache domains inside a +node. + +Anyway, I wrote this about a year ago, and I mentioned this at the +recent OSPM conf where Gautham and Prateek expressed interest in playing +with this code. + +So here goes, very rough and largely unproven code ahead :-) + +It applies to current tip/master, but I know it will fail the __percpu +validation that sits in -next, although that shouldn't be terribly hard +to fix up. + +As is, it only computes a CPU inside the LLC that has the highest recent +runtime, this CPU is then used in the wake-up path to steer towards this +LLC and in task_hot() to limit migrations away from it. + +More elaborate things could be done, notably there is an XXX in there +somewhere about finding the best LLC inside a NODE (interaction with +NUMA_BALANCING). + +Signed-off-by: Peter Zijlstra (Intel) +--- + include/linux/mm_types.h | 44 ++++++ + include/linux/sched.h | 4 + + init/Kconfig | 4 + + kernel/fork.c | 5 + + kernel/sched/core.c | 13 +- + kernel/sched/fair.c | 330 +++++++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 8 + + 7 files changed, 388 insertions(+), 20 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 56d07edd01f9..013291c6aaa2 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -893,6 +893,12 @@ struct mm_cid { + }; + #endif + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++ unsigned long occ; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -983,6 +989,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1393,6 +1410,33 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f96ac1982893..d0e4cda2b3cd 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1399,6 +1399,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +diff --git a/init/Kconfig b/init/Kconfig +index bf3a920064be..e2509127b6f9 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -953,6 +953,10 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware scheduler" ++ default y ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index 168681fc4b25..da1387823b9e 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1332,6 +1332,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1341,6 +1344,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c81cf642dba0..d9c3e75f79d1 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4524,6 +4524,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->migration_pending = NULL; + #endif + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8526,6 +8527,7 @@ static struct kmem_cache *task_group_cache __ro_after_init; + + void __init sched_init(void) + { ++ unsigned long now = jiffies; + unsigned long ptr = 0; + int i; + +@@ -8600,7 +8602,7 @@ void __init sched_init(void) + raw_spin_lock_init(&rq->__lock); + rq->nr_running = 0; + rq->calc_load_active = 0; +- rq->calc_load_update = jiffies + LOAD_FREQ; ++ rq->calc_load_update = now + LOAD_FREQ; + init_cfs_rq(&rq->cfs); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); +@@ -8644,7 +8646,7 @@ void __init sched_init(void) + rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->balance_callback = &balance_push_callback; + rq->active_balance = 0; +- rq->next_balance = jiffies; ++ rq->next_balance = now; + rq->push_cpu = 0; + rq->cpu = i; + rq->online = 0; +@@ -8656,7 +8658,7 @@ void __init sched_init(void) + + rq_attach_root(rq, &def_root_domain); + #ifdef CONFIG_NO_HZ_COMMON +- rq->last_blocked_load_update_tick = jiffies; ++ rq->last_blocked_load_update_tick = now; + atomic_set(&rq->nohz_flags, 0); + + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); +@@ -8681,6 +8683,11 @@ void __init sched_init(void) + 
+ rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = now; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0fb9bf995a47..df7d4a324fbe 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1166,10 +1166,229 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + return delta_exec; + } + +-static inline void update_curr_task(struct task_struct *p, s64 delta_exec) ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. ++ */ ++#define EPOCH_PERIOD (HZ/100) /* 10 ms */ ++#define EPOCH_OLD 5 /* 50 ms */ ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = epoch = rq->cpu_epoch; ++ pcpu_sched->occ = -1; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. ++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ /* ++ * init_task and kthreads don't be having no mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, invalidate ++ * it's preferred state. 
++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) { ++ mm->mm_sched_cpu = -1; ++ pcpu_sched->occ = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ int cpu, m_a_cpu = -1; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!alloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ scoped_guard (cpus_read_lock) { ++ cpumask_copy(cpus, cpu_online_mask); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, nr = 0, i; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ nr++; ++ trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", ++ per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); ++ } ++ ++ a_occ /= nr; ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", ++ per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ /* XXX threshold ? */ ++ per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ; ++ } ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ /* ++ * If the max average cache occupancy is 'small' we don't care. 
++ */ ++ if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD)) ++ m_a_cpu = -1; ++ ++ mm->mm_sched_cpu = m_a_cpu; ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ ++static inline ++void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec) + { + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); ++ account_mm_sched(rq, p, delta_exec); + cgroup_account_cputime(p, delta_exec); + } + +@@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq) + + delta_exec = update_curr_se(rq, &donor->se); + if (likely(delta_exec > 0)) +- update_curr_task(donor, delta_exec); ++ update_curr_task(rq, donor, delta_exec); + + return delta_exec; + } +@@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq) + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + +- update_curr_task(p, delta_exec); ++ update_curr_task(rq, p, delta_exec); + + /* + * If the fair_server is active, we need to account for the +@@ -7848,7 +8067,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + * per-cpu select_rq_mask usage + */ + lockdep_assert_irqs_disabled(); +- ++again: + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_cpu(task_util, util_min, util_max, target)) + return target; +@@ -7886,7 +8105,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + /* Check a recently used CPU as a potential idle candidate: */ + recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; +- if (recent_used_cpu != prev && ++ if (prev == p->wake_cpu && ++ recent_used_cpu != prev && + recent_used_cpu != target && + cpus_share_cache(recent_used_cpu, target) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +@@ -7939,6 +8159,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if ((unsigned)i < nr_cpumask_bits) + return i; + ++ if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) { ++ /* ++ * Most likely select_cache_cpu() will have re-directed ++ * the wakeup, but getting here means the preferred cache is ++ * too busy, so re-try with the actual previous. ++ * ++ * XXX wake_affine is lost for this pass. 
++ */ ++ prev = target = p->wake_cpu; ++ goto again; ++ } ++ + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster +@@ -8561,6 +8793,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + return target; + } + ++#ifdef CONFIG_SCHED_CACHE ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); ++ ++static int select_cache_cpu(struct task_struct *p, int prev_cpu) ++{ ++ struct mm_struct *mm = p->mm; ++ int cpu; ++ ++ if (!mm || p->nr_cpus_allowed == 1) ++ return prev_cpu; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0) ++ return prev_cpu; ++ ++ ++ if (static_branch_likely(&sched_numa_balancing) && ++ __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { ++ /* ++ * XXX look for max occupancy inside prev_cpu's node ++ */ ++ return prev_cpu; ++ } ++ ++ return cpu; ++} ++#else ++static int select_cache_cpu(struct task_struct *p, int prev_cpu) ++{ ++ return prev_cpu; ++} ++#endif ++ ++ + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, +@@ -8586,6 +8852,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); ++ guard(rcu)(); ++ + if (wake_flags & WF_TTWU) { + record_wakee(p); + +@@ -8593,6 +8861,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + ++ new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu); ++ + if (!is_rd_overutilized(this_rq()->rd)) { + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + if (new_cpu >= 0) +@@ -8603,7 +8873,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); + } + +- rcu_read_lock(); + for_each_domain(cpu, tmp) { + /* + * If both 'cpu' and 'prev_cpu' are part of this domain, +@@ -8636,7 +8905,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + /* Fast path */ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + } +- rcu_read_unlock(); + + return new_cpu; + } +@@ -9286,6 +9554,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (sysctl_sched_migration_cost == 0) + return 0; + ++#ifdef CONFIG_SCHED_CACHE ++ if (p->mm && p->mm->pcpu_sched) { ++ /* ++ * XXX things like Skylake have non-inclusive L3 and might not ++ * like this L3 centric view. What to do about L2 stickyness ? ++ */ ++ return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ > ++ per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ; ++ } ++#endif ++ + delta = rq_clock_task(env->src_rq) - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +@@ -9297,27 +9576,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. 
+ */ +-static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) + { + struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; + +- if (!static_branch_likely(&sched_numa_balancing)) +- return 0; +- +- if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) ++ if (!p->numa_faults) + return 0; + +- src_nid = cpu_to_node(env->src_cpu); +- dst_nid = cpu_to_node(env->dst_cpu); ++ src_nid = cpu_to_node(src_cpu); ++ dst_nid = cpu_to_node(dst_cpu); + + if (src_nid == dst_nid) + return 0; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { +- if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) ++ struct rq *src_rq = cpu_rq(src_cpu); ++ if (src_rq->nr_running > src_rq->nr_preferred_running) + return 1; + else + return 0; +@@ -9328,7 +9605,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + return -1; + + /* Leaving a core idle is often worse than degrading locality. */ +- if (env->idle == CPU_IDLE) ++ if (idle) + return 0; + + dist = node_distance(src_nid, dst_nid); +@@ -9343,7 +9620,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + return src_weight - dst_weight; + } + ++static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) ++{ ++ if (!static_branch_likely(&sched_numa_balancing)) ++ return 0; ++ ++ if (!(env->sd->flags & SD_NUMA)) ++ return 0; ++ ++ return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu, ++ env->idle == CPU_IDLE); ++} ++ + #else ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) ++{ ++ return 0; ++} ++ + static inline long migrate_degrades_locality(struct task_struct *p, + struct lb_env *env) + { +@@ -13102,8 +13396,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} + */ + static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + { +- struct cfs_rq *cfs_rq; + struct sched_entity *se = &curr->se; ++ struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +@@ -13113,6 +13407,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 47972f34ea70..d16ccd66ca07 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1171,6 +1171,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3861,6 +3867,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_SMP +-- +2.32.0 + + diff --git 
a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-02-20-sched-Several-fixes-for-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-02-20-sched-Several-fixes-for-cache-aware-scheduling.patch new file mode 100644 index 0000000..2527cbc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-02-20-sched-Several-fixes-for-cache-aware-scheduling.patch @@ -0,0 +1,230 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C4E492F4A06 + for ; Wed, 18 Jun 2025 18:21:47 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270909; cv=none; b=OfHuG3LBktIYQ17A0wezDcBygFYIqQGnVMGi+J74hfqxXToXJOkcfe/QAshk8VQr3iHhepGalcue2+Gh9lXUo6YIap3bPlMoXEKyEF/uKj/HOqBRTfPfSFVzKLCzuG1BPrKVWm/9VqF9CtRE/PxcAkoHlqkKJK38pOqYBIlTkpU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270909; c=relaxed/simple; + bh=bDfBIxEdnv+hYygaV+u3o+TV5wT/EFTyHlaYTI7nFpw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=UKMgGxYucBL+0GlD7p3d/Zom0U72DS+gR1yhIB67WE5LMqkpy3l5lREKGfo/WMbkvVplPyT3O4LIWAcMuzVNNgwy1U2yRPrfXUYrbe55jB5Ido4zUO7riYoUV38Tur4ZomgT4/03W4QWQnIXvK43x1VsVDq4rpLILrJkc0MLRz4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ZPYgHNmn; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ZPYgHNmn" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270907; x=1781806907; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=bDfBIxEdnv+hYygaV+u3o+TV5wT/EFTyHlaYTI7nFpw=; + b=ZPYgHNmnGkft4GpxmGCyeKoUkQ215gIizexSy0kMiXa/NTiZo/gGlfyT + kUMF1ZHUyeWicU44K6z5ga9ude/u1b7dOInRSMexJBl7xg2wWzt43htgN + 0SZD8bKm1Psl9VbJQzK0J75KsgRFBxuGgnNUxw1QIktunEAn4cwXF97df + RDtco1RU/rA4YaqRY//20xf5f/vHjT5XptItMKZ8mzjiE8ikBvrqvp2HQ + 6dxxSgUzxm0LyOJqknyMhoG24RtRQixTDOHtv0zR/a3Etu3Yfy4fbLcgR + 4fJb3VSDSPIj3CLVFTTh/id7nCY7gUsIPDC9MnN4GNRnUeVr8jGQaTFWv + Q==; +X-CSE-ConnectionGUID: YR1pxp3JSTSWW1r6sY2fCw== +X-CSE-MsgGUID: N3vXqxYJSWukJNGgS8J6EQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931471" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931471" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:47 -0700 +X-CSE-ConnectionGUID: 4Lza8fDiSzyAbCqKOh54VQ== +X-CSE-MsgGUID: sumQVaeBRRG5ZeV1Vg8XHA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959777" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:46 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham 
R . Shenoy" +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org +Subject: [RFC patch v3 02/20] sched: Several fixes for cache aware scheduling +Date: Wed, 18 Jun 2025 11:27:50 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +1. Fix compile error on percpu allocation. +2. Enqueue to the target CPU rather than the current CPU. +3. NULL LLC sched domain check(Libo Chen). +4. Introduce sched feature SCHED_CACHE to control cache aware scheduling +5. Fix unsigned occupancy initialization to -1. +6. If there is only 1 thread in the process, no need to enable cache + awareness +7. Add __maybe_unused to __migrate_degrades_locality() to + avoid compile warnings. + +Signed-off-by: Chen Yu +--- + include/linux/mm_types.h | 4 ++-- + kernel/sched/fair.c | 27 ++++++++++++++++----------- + kernel/sched/features.h | 1 + + 3 files changed, 19 insertions(+), 13 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 013291c6aaa2..9de4a0a13c4d 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1411,11 +1411,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas + #endif /* CONFIG_SCHED_MM_CID */ + + #ifdef CONFIG_SCHED_CACHE +-extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched); ++extern void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); + + static inline int mm_alloc_sched_noprof(struct mm_struct *mm) + { +- struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); + if (!pcpu_sched) + return -ENOMEM; + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index df7d4a324fbe..89db97f8ef02 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + +-void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; + int i; +@@ -1186,7 +1186,7 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) + + pcpu_sched->runtime = 0; + pcpu_sched->epoch = epoch = rq->cpu_epoch; +- pcpu_sched->occ = -1; ++ pcpu_sched->occ = 0; + } + + raw_spin_lock_init(&mm->mm_sched_lock); +@@ -1254,7 +1254,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (!mm || !mm->pcpu_sched) + return; + +- pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched); ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); +@@ -1264,12 +1264,14 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + } + + /* +- * If this task hasn't hit task_cache_work() for a while, invalidate ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate + * it's preferred state. 
+ */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) { ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD || ++ get_nr_threads(p) <= 1) { + mm->mm_sched_cpu = -1; +- pcpu_sched->occ = -1; ++ pcpu_sched->occ = 0; + } + } + +@@ -1286,9 +1288,6 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + + guard(raw_spinlock)(&mm->mm_sched_lock); + +- if (mm->mm_sched_epoch == rq->cpu_epoch) +- return; +- + if (work->next == work) { + task_work_add(p, work, TWA_RESUME); + WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); +@@ -1322,6 +1321,9 @@ static void task_cache_work(struct callback_head *work) + unsigned long occ, m_occ = 0, a_occ = 0; + int m_cpu = -1, nr = 0, i; + ++ if (!sd) ++ continue; ++ + for_each_cpu(i, sched_domain_span(sd)) { + occ = fraction_mm_sched(cpu_rq(i), + per_cpu_ptr(mm->pcpu_sched, i)); +@@ -8801,6 +8803,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + ++ if (!sched_feat(SCHED_CACHE)) ++ return prev_cpu; ++ + if (!mm || p->nr_cpus_allowed == 1) + return prev_cpu; + +@@ -9555,7 +9560,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + return 0; + + #ifdef CONFIG_SCHED_CACHE +- if (p->mm && p->mm->pcpu_sched) { ++ if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) { + /* + * XXX things like Skylake have non-inclusive L3 and might not + * like this L3 centric view. What to do about L2 stickyness ? +@@ -9633,7 +9638,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + } + + #else +-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) ++static __maybe_unused long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) + { + return 0; + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 3c12d9f93331..d2af7bfd36bf 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_UTIL, true) + ++SCHED_FEAT(SCHED_CACHE, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-03-20-sched-Avoid-task-migration-within-its-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-03-20-sched-Avoid-task-migration-within-its-preferred-LLC.patch new file mode 100644 index 0000000..152e3a3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-03-20-sched-Avoid-task-migration-within-its-preferred-LLC.patch @@ -0,0 +1,112 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B98A42F4A13 + for ; Wed, 18 Jun 2025 18:21:49 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270911; cv=none; b=H37a6GaMCpKVbBfru0xhkv/YQMjuzakfh40XV8mJ06HkLTiVswK7M40TUc0iJ2+QdHbjvIsa3fkD0Ch9hrzqgWR417U/tS7He62fpoRnc/RWieBtEAO7KEIcS4LI+2bm+YmBVIN6m7jaZ7yUlmNHWqu6HcD8VDmZ1CHMeSMizgk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270911; c=relaxed/simple; + bh=56rh2PM2yAL35gap+jzrhdtnDXnsh7kQnStk2sSwL5M=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=G2idI3SEXApx4GgXZ/P+aOwo15Jk0qYkNXdo1GgZkBwTVB4/wZnkC8GaXlcdpiVIBcXRH+vvno/YvO528eUsbhW6TDkWnRait/B5YQRy1pg5uGy44IKpVEiwxTH7cssQJgE1Tsmt4x5g8AlJKz8IC1CaADUdr9RqjRgXQ9GaDLs= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=nWZpvDE7; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="nWZpvDE7" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270909; x=1781806909; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=56rh2PM2yAL35gap+jzrhdtnDXnsh7kQnStk2sSwL5M=; + b=nWZpvDE7SLXGGfo3jHHGEomAw3ClY8hUPhWrfFErNHzBVjyIHSQntZd2 + 1nZXjrvyQxuRk9ZxQYH3QHfm14LYHc70BTSraxKI+8chQcuVj6tsAgrD1 + RWdBKfVBbXjt+LaiRwCbUYMll6u+jjqnmEMHSrUpZujg3klH237md9SXa + 37yFQxyarddD1nF8E+ny40AEdtC3cTGt5Ar19Wsp+W6417mEmx6ktZkZd + 2s4JDPZkFpV3gOOZumfVaiUV2iM+gjXUhTEiJqzIpaoYdgOiBcYYH7tlm + I/na3T9fEDfmsq6JxtGp9O8CDC/E09i7K1m27+a7hskfivj2Uj+kXW6uY + w==; +X-CSE-ConnectionGUID: iTxb4W8TSr+JVDmjhHc1nA== +X-CSE-MsgGUID: 3TLdAYAnTC6j+0rzbFr1IQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931484" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931484" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:47 -0700 +X-CSE-ConnectionGUID: VnM+trhjTJSnHQCXxzcG2w== +X-CSE-MsgGUID: xdjg5U3US8yEOUARte8LqQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959793" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:47 -0700 +From: Tim 
Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org +Subject: [RFC patch v3 03/20] sched: Avoid task migration within its preferred LLC +Date: Wed, 18 Jun 2025 11:27:51 -0700 +Message-Id: <284223928844c9ae64de1fa142f8db89775de27b.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +It was found that when running schbench, there is a +significant amount of in-LLC task migrations, even if +the wakee is woken up on its preferred LLC. This +leads to core-to-core latency and impairs performance. + +Inhibit task migration if the wakee is already in its +preferred LLC. + +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 89db97f8ef02..567ad2a0cfa2 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8813,6 +8813,8 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + if (cpu < 0) + return prev_cpu; + ++ if (cpus_share_cache(cpu, prev_cpu)) ++ return prev_cpu; + + if (static_branch_likely(&sched_numa_balancing) && + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-04-20-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-04-20-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch new file mode 100644 index 0000000..313ef8d --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-04-20-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch @@ -0,0 +1,122 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8C23B2F4A0D + for ; Wed, 18 Jun 2025 18:21:49 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270911; cv=none; b=qgOqtbwj/b1lT+pk59OtjXjeOpPuI2zhfn0D21JXULqPLorw/ZgDKmNNS7Urzt/SfBFCUz801jIRaBO9Cslv2B7LxeJe//HjIB3+4P845payLN3vcYxAOxAbfLaUARgyfK6W4UUcOiOk0TZHcE9SbDMxaEDC/rvvNnXsMH1W0ug= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270911; c=relaxed/simple; + bh=nwpdkAoJpW4EOqPZQh+uhJ+qmZILgurvv6g4rDaWrYo=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=HAZIoXDlwzbldKWFUzY/Q4Jaxhhc2Mkt0bb9WOBfRPLAMS3DspRW02WEau/R32ErYRWHNC9ZlNYoWyuZiO4M7Awr204pB/+urb4Knb8pmbJ2BIdcVmJE5vaTPh/a1tQlov0Ea2J7+pXzMfBW5Wl8AAuJzrs80wakQeKZDNOWJJ0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ivStBZKK; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ivStBZKK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270909; x=1781806909; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=nwpdkAoJpW4EOqPZQh+uhJ+qmZILgurvv6g4rDaWrYo=; + b=ivStBZKKr1kLat+jKvOOQxbf9d4bC1I/SJy7LDib/zgN3n589asFL4+r + PYsbbyBX6DjZAdbQ1Ik/G6Sc6usq0dD9Ziu/7QfHFyk6vz2whi6PSRkGc + qh0Xwo9lT9BAKy1pR/Oo42AYGiTgM5CVtwmFP8HFWa4TdxGk4w9dRkMX3 + 4DxdIZo1ar93mu1DEN1+6WJ0elyBKkJVxfHeC50jqR9/1tsGUnexjnQ7X + 3xWpf6BNsr2eTOb+JyArdlGpErCa4hDLn0ptngc6kRC2FeOvceeJAY/Rw + iKQTxZNYHsLyWzeH16AJJGbNG66qTaf60D44RqGpvBPrHgL3ZhdDghMpz + A==; +X-CSE-ConnectionGUID: x7oHEr0dTrOTH6OFzpdkdg== +X-CSE-MsgGUID: UfwNMRYcRZKjKAWmE406aQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931496" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931496" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:48 -0700 +X-CSE-ConnectionGUID: FdoRwVvNTIiAzpt945cGzw== +X-CSE-MsgGUID: TkAAX+2KQ6qAC2qWqI6SgA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959815" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:48 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 04/20] sched: Avoid calculating the cpumask if the system is overloaded +Date: Wed, 18 Jun 2025 11:27:52 -0700 +Message-Id: <2901db739c1792eb9ece08a2b8d3b8de32efd0eb.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: K Prateek Nayak + +If the SIS_UTIL cuts off idle cpu search, result of the cpumask_and() is +of no use. Since select_idle_cpu() can now be called twice per wake up +in the select_idle_sibling() due to cache aware wake up, this overhead +can be visible in benchmarks like hackbench. + +To save some additional cycles, especially in cases where we target +the LLC frequently and the search bails out because the LLC is busy, +only calculate the cpumask if the system is not overloaded. 
+ +Signed-off-by: K Prateek Nayak +--- + kernel/sched/fair.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 567ad2a0cfa2..6a2678f9d44a 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -7918,8 +7918,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; + +- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +- + if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { +@@ -7931,6 +7929,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + } + } + ++ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); ++ + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-05-20-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-05-20-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch new file mode 100644 index 0000000..3bcee46 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-05-20-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch @@ -0,0 +1,157 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 546862F948D + for ; Wed, 18 Jun 2025 18:21:51 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270915; cv=none; b=gCit7OdxmL/z+sHqvOYlmAu0jpeZwqhZeORAcAbKfGEmC7Cut1d8DS6/6wZGZvCrl4vEp3HCh8qjH6ozHZZ6tcNiYj+z1Y0m+CEOlg676Q1clxzwBbJ2P+CbCmGGNeEg3rwixeD6+R0v81UsIKJXJfMsf8UU4IJzxCienwRrWeA= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270915; c=relaxed/simple; + bh=GRt1KcpJc3uX3j1lW75IjTwwtuVzCFPIjuwEQmJ7o0c=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=FBJTd0ukxWRW+/MIo7zsf0SmGsGIBox8sqZYavmOM5fv6LOEqVyOr4wp2ndfb2WlEQFkw2Pp36oF0gqWn9mhyDwfdEsnpnqdcv1XeDKQshvz9ZWQHVjdtgZQp4BkNGTp0CFAVo9mAHC/VnVDZvOjoYt4QE/rqzevU8YFJ+7bEko= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MSgKa/Xs; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MSgKa/Xs" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270911; x=1781806911; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=GRt1KcpJc3uX3j1lW75IjTwwtuVzCFPIjuwEQmJ7o0c=; + b=MSgKa/Xs8zOJ4H2lqLt5AoI5bQ2Z5hgnpUup28q98ByliGPYLMtTuVoT + +lSE1UHq2qbmWe+CGFnXXNN0O11daPjgIRfzwTIAXeYYAwWhWm/SJOst4 + 2yClxpLCWgokA1/yxRHLW2J/20uBhmoIokqluLvohhQNoEZU8oeZgagEC + 
Urji0g1zMpdionTkeyTJrvrZh+ExyPjKEjVPQFLk6s+JnHq/wiwVKWRjm + iKAY3vicJCdgEZaqexyIOwSVKYAdj5Ds+qaro+e1pYLQIVMXZfJCil89W + 2TwNI95OlQaBFc3aLSCKuvjf4TUtpWBzOnftomqkcPANu/uLnxs7/ZZJ5 + Q==; +X-CSE-ConnectionGUID: OsOzpi1fQVmQCEIlEJnnaA== +X-CSE-MsgGUID: b5N1VeS7R72ao2jICSSBCQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931510" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931510" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:49 -0700 +X-CSE-ConnectionGUID: 03mrFPsdTSSv8opMC44BDA== +X-CSE-MsgGUID: vBS5/z1NSra3RrHanVjnzw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959835" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:49 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 05/20] sched: Add hysteresis to switch a task's preferred LLC +Date: Wed, 18 Jun 2025 11:27:53 -0700 +Message-Id: <7371f30196b317c0c5a0ae3fa463ec76a4dc69ef.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Switching a process's preferred LLC generates lots of task +migrations across LLCs. To avoid frequent switches +of home LLC, implement the following policy: + +1. Require a 2x occ change threshold to switch preferred LLC +2. Don't discard preferred LLC for a task + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6a2678f9d44a..7fb2322c5d9e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,6 +1175,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1299,6 +1307,7 @@ static void task_cache_work(struct callback_head *work) + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; ++ unsigned long last_m_a_occ = 0; + int cpu, m_a_cpu = -1; + cpumask_var_t cpus; + +@@ -1337,11 +1346,13 @@ static void task_cache_work(struct callback_head *work) + per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); + } + +- a_occ /= nr; ++ // a_occ /= nr; + if (a_occ > m_a_occ) { + m_a_occ = a_occ; + m_a_cpu = m_cpu; + } ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ last_m_a_occ = a_occ; + + trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", + per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); +@@ -1355,13 +1366,10 @@ static void task_cache_work(struct callback_head *work) + } + } + +- /* +- * If the max average cache occupancy is 'small' we don't care. 
+- */ +- if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD)) +- m_a_cpu = -1; +- +- mm->mm_sched_cpu = m_a_cpu; ++ if (m_a_occ > (2 * last_m_a_occ)) { ++ /* avoid the bouncing of mm_sched_cpu */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } + + free_cpumask_var(cpus); + } +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-06-20-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-06-20-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch new file mode 100644 index 0000000..c2d0adb --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-06-20-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch @@ -0,0 +1,195 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 229A22F9488 + for ; Wed, 18 Jun 2025 18:21:51 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270912; cv=none; b=trjx3OlHRA8G6/xha0gDitGyeQDoeJ3CWSLlk3i5tVcRLQ172YBtygleCjw//E9Ox3BBaWN+ph357z6VKUSjNpOMgeWNiH6GkUqOMtdSlowllHpGMFXca9dnbLicNEyUsvDUBI3SRpuyiOyhA3wQi6hex0PK3QdUEepICMIjXZg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270912; c=relaxed/simple; + bh=o9ssIKZoupCUY+RNkWM8+C2a5S5kPuUisQEegnB07u4=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=iGg9GEruTAvKeZoWaAvOBUnqouWOODirEVRQZXGYRmNvKmUBtFTbO+hnJ2kyJkTtn96ZqswISYcW/8MaFKP99lSrk/CuZH9xxItsJABocbfde14vreP3VS50k7ELI9JacoChlVu0tLEaIQCSl73iwrgLFU0W1jkrM5FMDR2/bjQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=gB32GMi+; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="gB32GMi+" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270911; x=1781806911; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=o9ssIKZoupCUY+RNkWM8+C2a5S5kPuUisQEegnB07u4=; + b=gB32GMi+ZsCEz3HcthojXq5IZgpULXknP3fum+zJzV6sVIWDquFR+WL7 + p4aRxUvmzJTtGtb6DbOhdfWcTMwfYPeBoThNWTLkO+kN/Gx/5mzI4RDN3 + 3JDAP4eXcOHwI+Xgzs+L46NGuc3oyWxTcDIB8oNEL0esdpvR2zH9nzgTk + s/AtWUC3ubNeM+NWKgu756KSw8M2pErJkISkQA7CeZMciVqZKfTbgApS0 + EUcI9uAwecjVzzaaA+BEUO8jqFgqqlw3NL3G9rPT6t50c2BEIqPjRs/ed + abvo42eYS3OUZKzeI3iJnnkUKHv/OSzErtpgNUBNHgkLcvq3ke4HuEw9u + A==; +X-CSE-ConnectionGUID: PSo7rU+bQcqeonMNyHQ+LA== +X-CSE-MsgGUID: ps4xpG3nQJSMcwDcH4WDhg== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931523" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931523" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:50 -0700 +X-CSE-ConnectionGUID: SALh/A1xQRqjZut3f9eZCA== 
+X-CSE-MsgGUID: 8pi5QrQxSdKOJKN1adsLzw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959857" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:49 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org +Subject: [RFC patch v3 06/20] sched: Save the per LLC utilization for better cache aware scheduling +Date: Wed, 18 Jun 2025 11:27:54 -0700 +Message-Id: <22f5c52b3e904bd782c43bc4bfc6fcd4b447ea54.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +When a system gets busy and a process's preferred LLC +is saturated by too many threads within this process, there are significant +in-LLC task migrations within its preferred LLC. This leads to migration +latency and degrades performance. Ideally, task aggregation should be +inhibited if the task's preferred LLC is overloaded. This implies that a +metric is needed to indicate whether the LLC is busy. + +Store the per-LLC utilization calculated via periodic load +balancing. These statistics will be used in subsequent patches to +determine whether tasks should be aggregated to their preferred LLC. + +Signed-off-by: Chen Yu +--- + include/linux/sched/topology.h | 3 ++ + kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++ + 2 files changed, 56 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 7b4301b7235f..b3115bc1cbc0 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -78,6 +78,9 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 7fb2322c5d9e..02f104414b9a 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8806,6 +8806,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + #ifdef CONFIG_SCHED_CACHE + static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); + ++/* expected to be protected by rcu_read_lock() */ ++static bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = per_cpu(sd_llc_size, cpu) * SCHED_CAPACITY_SCALE; ++ ++ return true; ++} ++ + static int select_cache_cpu(struct task_struct *p, int prev_cpu) + { + struct mm_struct *mm = p->mm; +@@ -10646,6 +10662,42 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Save this sched group's statistic for later use: ++ * The task wakeup and load balance can make better ++ * decision based on these statistics. 
++ */ ++static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ /* Find the sched domain that spans this group. */ ++ struct sched_domain *sd = env->sd->child; ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* only care the sched domain that spans 1 LLC */ ++ if (!sd || !(sd->flags & SD_SHARE_LLC) || ++ !sd->parent || (sd->parent->flags & SD_SHARE_LLC)) ++ return; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (likely(READ_ONCE(sd_share->util_avg) != sgs->group_util)) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++} ++#else ++static inline void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. +@@ -10735,6 +10787,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ update_sg_if_llc(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-07-20-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-07-20-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch new file mode 100644 index 0000000..10724fe --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-07-20-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch @@ -0,0 +1,279 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id CFC872F949E + for ; Wed, 18 Jun 2025 18:21:51 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270915; cv=none; b=Z36ep254EkAHaKvpr8i7KwV4mxwpymtfd2E0A6r1XAf82xXaqo+m0qmyyZX4NW84q+tYHhFu/VEpulhnEvbYzsslAlOsEdbj/cgL3d/z0RoO88Yz4a6r2b06VMNmTH912fsGsTfN/YW+VYbD10CiJVQVXES+s/I6OYnWE4BIsqk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270915; c=relaxed/simple; + bh=HEb10UrMOCgohlL9Nxhp873UqhXKYVZZ5sEwo42BWsw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=V5Z/EUYBBE/1u61niCwmDRwy0KcujoUnnkZEXjAt3pPVOfg6lPDVWI8oDLucO2nivKtjepMD9B9l2n0gliPOmSpL08JDjYQMNdIIyu7FMX+A1RKWk/VwrZpb/ie9Q5tCUnYK9oOIFmI9VCF2TdqXlfQyE/9gjbxg71N+r5vzmpE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=iTHUAET4; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass 
(2048-bit key) header.d=intel.com header.i=@intel.com header.b="iTHUAET4" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270911; x=1781806911; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=HEb10UrMOCgohlL9Nxhp873UqhXKYVZZ5sEwo42BWsw=; + b=iTHUAET4QNzU15KipInKDKa553rGd8p4EGv5ueHYEHtVMDJBGVWb9jyb + 0f+AUwD60jIkZnWy1Hh0X/GY6o6skz2X0cFXL5PYANaViVdrNCYYcBI9+ + 93Yy0pV59RjM4ec4buLe0pykhrmHjPvgPH4t2P2rCGCOr9UAxesex95B2 + ljR8tWCmfhg2uyQELaySWiA4N1O7lUWXcjia1sXNj2D47V0T1Gu0IqDrm + dg1Y/Am2QjVh/PycKohb5TLEWxUNqGp1dzcMX1OUkpYS92qdY5o4yxWJp + mPLxjSRGG+jTFdBLrVXXU3kBfrZAbV2I+WaHLGmP5yRhH+/iv/CBputDH + g==; +X-CSE-ConnectionGUID: nGL6WccUTluIsoPploYN4A== +X-CSE-MsgGUID: vdU2TrNrS76S5tDix/iXow== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931537" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931537" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:51 -0700 +X-CSE-ConnectionGUID: 2qoc+SXgS1igWBhKnpTp2g== +X-CSE-MsgGUID: +fdgV0C1R2CwRkSohE+TnA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959878" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:50 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 07/20] sched: Add helper function to decide whether to allow cache aware scheduling +Date: Wed, 18 Jun 2025 11:27:55 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Cache-aware scheduling is designed to aggregate threads into their +preferred LLC, either via the task wake up path or the load balancing +path. One side effect is that when the preferred LLC is saturated, +more threads will continue to be stacked on it, degrading the workload's +latency. A strategy is needed to prevent this aggregation from going too +far such that the preferred LLC is too overloaded. + +Introduce helper function _get_migrate_hint() to implement the LLC +migration policy: + +1) A task is aggregated to its preferred LLC if both source/dest LLC + are not too busy (<50% utilization, tunable), or the preferred + LLC will not be too out of balanced from the non preferred LLC + (>20% utilization, tunable, close to imbalance_pct of the LLC + domain). +2) Allow a task to be moved from the preferred LLC to the + non-preferred one if the non-preferred LLC will not be too out + of balanced from the preferred prompting an aggregation task + migration later. We are still experimenting with the aggregation + and migration policy. Some other possibilities are policy based + on LLC's load or average number of tasks running. Those could + be tried out by tweaking _get_migrate_hint(). + +The function _get_migrate_hint() returns migration suggestions for the upper-level +functions. 
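As a rough illustration of the policy described above, the following standalone C sketch (not the kernel code added later in this patch) shows how the two tunables - a 50% capacity cut-off and a 20% imbalance margin, mirroring sysctl_llc_aggr_cap and sysctl_llc_aggr_imb - combine into an allow/ignore/forbid hint for a task moving toward its preferred LLC. All names and numbers here are illustrative stand-ins, not the in-tree implementation:

/*
 * Illustrative user-space sketch of the migration-hint policy above.
 * Not the kernel implementation; the 50/20 defaults and helper names
 * merely mirror sysctl_llc_aggr_cap / sysctl_llc_aggr_imb.
 */
#include <stdbool.h>
#include <stdio.h>

#define LLC_AGGR_CAP 50	/* "not too busy" threshold, percent of capacity */
#define LLC_AGGR_IMB 20	/* allowed imbalance between two LLCs, percent */

static bool fits_llc_capacity(unsigned long util, unsigned long cap)
{
	return util * 100 < cap * LLC_AGGR_CAP;
}

static bool util_greater(unsigned long util1, unsigned long util2)
{
	return util1 * 100 > util2 * (100 + LLC_AGGR_IMB);
}

/* Hint for moving a task of tsk_util toward its preferred LLC. */
static const char *migrate_hint(unsigned long src_util, unsigned long dst_util,
				unsigned long cap, unsigned long tsk_util)
{
	/* both LLCs already saturated: aggregation will not help */
	if (!fits_llc_capacity(dst_util, cap) && !fits_llc_capacity(src_util, cap))
		return "ignore";

	src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
	dst_util += tsk_util;

	/* refuse if the preferred LLC gets too loaded and too imbalanced */
	if (!fits_llc_capacity(dst_util, cap) && util_greater(dst_util, src_util))
		return "forbid";

	return "allow";
}

int main(void)
{
	unsigned long cap = 8 * 1024;	/* say, 8 CPUs x SCHED_CAPACITY_SCALE */

	printf("lightly loaded dest: %s\n", migrate_hint(3000, 2000, cap, 512));
	printf("saturated dest:      %s\n", migrate_hint(1000, 7800, cap, 512));
	return 0;
}

With these defaults the first call reports "allow" (the preferred LLC still fits under half its capacity after taking the task), while the second reports "forbid" because the destination would be both over the cut-off and more than 20% busier than the source.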
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 110 ++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 5 ++ + 3 files changed, 118 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 56ae54e0ce6a..7271ad1152af 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -532,6 +532,10 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap); ++ debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb); ++#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 02f104414b9a..10ea408d0e40 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8804,7 +8804,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + } + + #ifdef CONFIG_SCHED_CACHE +-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); ++static long __migrate_degrades_locality(struct task_struct *p, ++ int src_cpu, int dst_cpu, ++ bool idle); ++__read_mostly unsigned int sysctl_llc_aggr_cap = 50; ++__read_mostly unsigned int sysctl_llc_aggr_imb = 20; ++ ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter sysctl_llc_aggr_cap determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * sysctl_llc_aggr_cap) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + sysctl_llc_aggr_imb)) ++ ++enum llc_mig_hint { ++ mig_allow = 0, ++ mig_ignore, ++ mig_forbid ++}; ++ + + /* expected to be protected by rcu_read_lock() */ + static bool get_llc_stats(int cpu, unsigned long *util, +@@ -8822,6 +8854,82 @@ static bool get_llc_stats(int cpu, unsigned long *util, + return true; + } + ++static enum llc_mig_hint _get_migrate_hint(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_allow; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_allow; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_ignore; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * sysctl_llc_aggr_imb is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. 
++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_allow; ++} ++ ++/* ++ * Give suggestion when task p is migrated from src_cpu to dst_cpu. ++ */ ++static __maybe_unused enum llc_mig_hint get_migrate_hint(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ int cpu; ++ ++ if (cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_allow; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_allow; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0) ++ return mig_allow; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ return _get_migrate_hint(src_cpu, dst_cpu, ++ task_util(p), true); ++ else if (cpus_share_cache(src_cpu, cpu)) ++ return _get_migrate_hint(src_cpu, dst_cpu, ++ task_util(p), false); ++ else ++ return mig_allow; ++} ++ + static int select_cache_cpu(struct task_struct *p, int prev_cpu) + { + struct mm_struct *mm = p->mm; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index d16ccd66ca07..1c6fd45c7f62 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2818,6 +2818,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int sysctl_llc_aggr_cap; ++extern unsigned int sysctl_llc_aggr_imb; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-08-20-sched-Set-up-LLC-indexing.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-08-20-sched-Set-up-LLC-indexing.patch new file mode 100644 index 0000000..5998f9e --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-08-20-sched-Set-up-LLC-indexing.patch @@ -0,0 +1,224 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id D922E2F94BC + for ; Wed, 18 Jun 2025 18:21:52 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270914; cv=none; b=qo1W6boom/FPcJM6aUbqBPYFsJsbF683DNq+T76orD1BhUuPT/cDgxLm/IdIt7lsAwwvhls6rRgrRp3wVI2a2orhxiRxH4pzTcUnStzKQ94lhDbiQkmwAnBP+Oe6i31HfDZbyBWWJXZl9duCrd/52c4F5rx8/huBgPpKES9g+o0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270914; c=relaxed/simple; + bh=cQUCpe4LZfLrzrqrRvhJ4zn32opdkosRxPW3YpSdQpw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=k8VqwMymU4J6e9GkRGLqFZapA+h/KmaiCqV6pWjSVsWwmgnsiVhiYCMQUYaIYCuRy8xdvkKxXLmpnnj81wxXCZFpX6tgVD+1igdqcNGnsw+8Dd7OREe7hmOl9DpLac08ZCVRrBBzXbfElsFBTfBsioIq0k6loXV4uYpQZrYBrMU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=FUltCdk8; arc=none 
smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="FUltCdk8" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270912; x=1781806912; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=cQUCpe4LZfLrzrqrRvhJ4zn32opdkosRxPW3YpSdQpw=; + b=FUltCdk86P1n8b/xtaRLXyRNaWFyT3up5/DOjijJa7AptKtTq7F17sT9 + +qgOnAsLS4hrBcE0M3b/NgTR2SCnVrsRwXJKozVz4N9t01io3n/dvVQKq + 38gbrgGuDv4YYXh0s0Tdj8hQgPW825VDrCKW2iASUc/Zz+VmPLgQmKiPp + FyR41eBRrDbzEAAwNxvUiMbjT740rIgIieuCoK8C/tv7tcqrUHNVi1T/k + b/vnTpMgt+sYhmf2tlLBsLIZRkLEKBWUqEj3rUfk1D31j6gtYWu/kqjBU + asxz7novbH7ygWHnG2F/F9OvSjwzEc0+3fkEynQRMLwKacbHoHNVa0CrB + w==; +X-CSE-ConnectionGUID: pkqYpBEfS3upf7WK/GoLIw== +X-CSE-MsgGUID: paR/450tR6eOwDodmnSOag== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931550" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931550" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:52 -0700 +X-CSE-ConnectionGUID: uNM5F0vKRkCxQO9WvZjBoQ== +X-CSE-MsgGUID: qccG1toMTWyEIVjk+nl3kg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959901" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:51 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 08/20] sched: Set up LLC indexing +Date: Wed, 18 Jun 2025 11:27:56 -0700 +Message-Id: <71e251a086be786fb2a0480bbab69142d14bd22d.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Prepare for indexing arrays that track in each run queue: the number +of tasks preferring current LLC and each of the other LLC. + +The reason to introduce LLC index is because the per LLC-scope data +is needed to do cache aware load balancing. However, the native lld_id +is usually the first CPU of that LLC domain, which is not continuous, +which might waste the space if the per LLC-scope data is stored +in an array (in current implementation). + +In the future, this LLC index could be removed after +the native llc_id is used as the key to search into xarray based +array. 
+ +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 3 +++ + kernel/sched/fair.c | 12 ++++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 29 +++++++++++++++++++++++++++++ + 4 files changed, 46 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d0e4cda2b3cd..7ce95a32e9ff 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -810,6 +810,9 @@ struct kmap_ctrl { + #endif + }; + ++/* XXX need fix to not use magic number */ ++#define MAX_LLC 64 ++ + struct task_struct { + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 10ea408d0e40..5549710d95cf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1183,6 +1183,18 @@ static int llc_id(int cpu) + return per_cpu(sd_llc_id, cpu); + } + ++/* ++ * continous index. ++ * TBD: replace by xarray with key llc_id() ++ */ ++static inline int llc_idx(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_idx, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1c6fd45c7f62..74eb2f3615aa 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2037,6 +2037,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_llc_idx); + DECLARE_PER_CPU(int, sd_share_id); + DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -2045,6 +2046,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index f1ebc60d967f..b7bb13045dd8 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -672,6 +672,7 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_llc_idx); + DEFINE_PER_CPU(int, sd_share_id); + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -681,6 +682,25 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++int max_llcs = -1; ++ ++static void update_llc_idx(int cpu) ++{ ++#ifdef CONFIG_SCHED_CACHE ++ int idx = -1, llc_id = -1; ++ ++ llc_id = per_cpu(sd_llc_id, cpu); ++ idx = per_cpu(sd_llc_idx, llc_id); ++ ++ if (idx < 0) { ++ idx = max_llcs++; ++ BUG_ON(idx > MAX_LLC); ++ per_cpu(sd_llc_idx, llc_id) = idx; ++ } ++ per_cpu(sd_llc_idx, cpu) = idx; ++#endif ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -699,6 +719,7 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_size, cpu) = size; + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); ++ update_llc_idx(cpu); + + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) +@@ -2394,6 +2415,14 @@ build_sched_domains(const struct cpumask 
*cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++#ifdef CONFIG_SCHED_CACHE ++ if (max_llcs < 0) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_idx, i) = -1; ++ max_llcs = 0; ++ } ++#endif ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-09-20-sched-Introduce-task-preferred-LLC-field.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-09-20-sched-Introduce-task-preferred-LLC-field.patch new file mode 100644 index 0000000..043d343 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-09-20-sched-Introduce-task-preferred-LLC-field.patch @@ -0,0 +1,148 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B94F12FA65E + for ; Wed, 18 Jun 2025 18:21:54 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270916; cv=none; b=UeyCyGRjZM/3la5K7W5DZr7Fqbx/pXCPlXhvjgFOgYIXCDUEtBpJd57eBNDqgWvGJv8lBL+mUxf5kMWOHyA8RedaqjM+j02Jn9B78T1lChIZ2n/HcQ4ovyIdvMDIjh6GJZ0rAG3+mVxq530ReordkAU/8zkHRYpmSCOgeBTL+I4= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270916; c=relaxed/simple; + bh=qCkIhL6rGvZ+dzcJ6E4keINBL3DRXFHBoE2OhjjXt84=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=HdR2Xf4V3RIfo4sN94qeqMUmpg9AvXnNLA7hey4fd0WeAkTF0T4KDeRwQToF9rBlqaacSEzIC3ldfo3gUF0PSPBuQsHrIDtfOE0rb1e2syBB9Uy0m0Pyuh26N9i8RFDaGsxHG8i9HPtFTwvfAZ4PLiBu3T6sgQXJ8rTnQ7LikE0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ctebiIFc; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ctebiIFc" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270914; x=1781806914; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qCkIhL6rGvZ+dzcJ6E4keINBL3DRXFHBoE2OhjjXt84=; + b=ctebiIFceLs/6XgbLjwR55dJzRZbzapx502Dxi0l/NvCbPchL4pYfpTv + mX4fL7dCkAt2uRGyN8gw9ioXWlpiuXifnjj+0MNxYIis5NlGMWrVuELt9 + p2k9M3g2gYolFGidjsV123j/xwGYMbxHzvGAIu7gZe6H/GBXSmGkX0BuE + BbuXEWcCw0iqTOGZJwjotpQh0+0BjJedRDEyx/wJT4zQv28fNmgWwOtv1 + f4suB3nLhc82MQOzvFx7z7nB0rbHQlioxhlaZW+cZpn776eX5rSkia9jE + XjCTqRmxuGrPP0O3C/HkP/FgXNG323aYUO7tahFuWXcmZCHjHmALri07M + Q==; +X-CSE-ConnectionGUID: NPicx326QWy/zNraeo2kXQ== +X-CSE-MsgGUID: V9D14qHKQa6053W/dqsxug== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931566" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931566" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:53 -0700 +X-CSE-ConnectionGUID: 7bYdnBGCRGq67Ukuexzzfg== +X-CSE-MsgGUID: bMqkNepHSg24kSctAmlfHw== 
+X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959924" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:52 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 09/20] sched: Introduce task preferred LLC field +Date: Wed, 18 Jun 2025 11:27:57 -0700 +Message-Id: <7b9df4433d73ce415a27925ce42cec53774debaf.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +With cache aware scheduling enabled, each process is assigned +a preferred LLC id, which will be used to quickly identify +the LLC domain this thread prefers to run. This is similar to +numa_preferred_nid for NUMA balance. + +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 7 +++++++ + 3 files changed, 11 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 7ce95a32e9ff..2f1cb7445733 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1404,6 +1404,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index e557f622bd90..5fffbe766f57 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -188,6 +188,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 5549710d95cf..cc804a8c7061 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1267,6 +1267,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + /* + * init_task and kthreads don't be having no mm +@@ -1293,6 +1294,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; + } ++ ++ if (mm->mm_sched_cpu != -1) ++ mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-10-20-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-10-20-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch new file mode 100644 index 0000000..cd8cf95 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-10-20-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch @@ -0,0 +1,238 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from 
mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 378672FBFE4 + for ; Wed, 18 Jun 2025 18:21:55 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270916; cv=none; b=e05IFwV/MLqo2ClgaOGWLNSlY10NaanqCNFgdhzJHiSJKN+sh3Zlln5UdTmioRlRhRjp/nMlzDaeMob8JkL6Vnprcb7T4E++++CTqUtUCV4CFP4PSK0vw5A7hPHtk/OEkDT1g3ZfjDaU9iC9y5xW8mEtVA1goM0jAs9Hl7McPfo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270916; c=relaxed/simple; + bh=WuCB5SIKo2iM8nX5ebWteeRjxdPAXbYteS+EwA2pE0w=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=oAGm3igrNrOpxMrHQQoLXScf+6Qhzs+QT0b8NDUn/Z1Lg6wRKdWBLm3Z8cBcgmvttwNrNTS1WRshJkfEt9buiwj9p9r7b5+8Pgfu5tiqdhoABS40DvzWpr6d+nOBznhgOieUV5aoD2LXxBNoP4rNF0a9Ez39F7B7HtKZvpqaKlk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=NHLCwYLq; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="NHLCwYLq" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270915; x=1781806915; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=WuCB5SIKo2iM8nX5ebWteeRjxdPAXbYteS+EwA2pE0w=; + b=NHLCwYLqVTRJAuEFNcV41R7MnvuS9z2QmW5K059zPETnEpDlEodn/VG8 + IZzAuJmXE4uhiAbjaKpwaneVSv8DpipBx12ro57h14bYGeGVOnAEqtRC0 + MYOOelRwA6HB65s8wwQlNHjKsFH8Px5CvzkVOr9zfB2+Cf3ZCiBWNjFvJ + ia6JfMeXbhgywU5X/aCFqFVuO0i7U1S4e/3PZ/4lISImQE5ptcYQvqsVE + 7frWf0qbM3P8Z3xratwf4AuiFQOa5n18Y0HGvqXbmBvbVIX5w1NHG2f4g + 7u4XdLdk1q19T6udH+vKwbKmJFB9NiWgUD4pYJZpJbmpavs+mwAM+IGJz + Q==; +X-CSE-ConnectionGUID: ABRohcKYRLuwXUvFn04ETg== +X-CSE-MsgGUID: ajqbr60WT6mWJDBrVQIQ4w== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931579" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931579" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:54 -0700 +X-CSE-ConnectionGUID: wKSilpu8SRKhyBsxbDtmNA== +X-CSE-MsgGUID: LTGYQY+lTBaZzmctTTPKPQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959948" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:53 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 10/20] sched: Calculate the number of tasks that have LLC preference on a runqueue +Date: Wed, 18 Jun 2025 11:27:58 -0700 +Message-Id: <0664be8a3e805ed93eb930131951b1a84cebed66.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Track for each run queue, the number of tasks that have a LLC preference +and how many of those tasks are running in its preferred LLC. This is +similar to nr_numa_running and nr_preferred_running for NUMA balance, +and will be used by the cache-aware load balancing in subsequent patches. + +Signed-off-by: Tim Chen +--- + kernel/sched/core.c | 12 ++++++++++++ + kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 +++++++ + 3 files changed, 60 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index d9c3e75f79d1..34056eb79ef2 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -498,6 +498,18 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++#ifdef CONFIG_SMP ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++#else ++int task_llc(const struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cc804a8c7061..88ff47194faa 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1195,6 +1195,18 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ rq->nr_llc_running += (p->preferred_llc != -1); ++ rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ rq->nr_llc_running -= (p->preferred_llc != -1); ++ rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1298,8 +1310,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) ++ if (p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1400,6 +1415,14 @@ void init_sched_mm(struct task_struct *p) + work->next = work; + } + ++void reset_llc_stats(struct rq *rq) ++{ ++ if (rq->nr_llc_running) ++ rq->nr_llc_running = 0; ++ ++ rq->nr_pref_llc_running = 0; ++} ++ + #else + + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, +@@ -1410,6 +1433,17 @@ void init_sched_mm(struct task_struct *p) { } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++static void 
account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++void reset_llc_stats(struct rq *rq) ++{ ++} + #endif + + static inline +@@ -3939,6 +3973,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); ++ account_llc_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } + #endif +@@ -3952,10 +3987,15 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) + #ifdef CONFIG_SMP + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ account_llc_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + #endif + cfs_rq->nr_queued--; ++ ++ /* safeguard? */ ++ if (!parent_entity(se) && !cfs_rq->nr_queued) ++ reset_llc_stats(rq_of(cfs_rq)); + } + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 74eb2f3615aa..6c83a71ac8ca 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1104,6 +1104,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + #ifdef CONFIG_SMP + unsigned long last_blocked_load_update_tick; +@@ -1948,6 +1952,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++extern void reset_llc_stats(struct rq *rq); ++extern int task_llc(const struct task_struct *p); ++ + #ifdef CONFIG_SMP + + static inline void +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-11-20-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-11-20-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch new file mode 100644 index 0000000..101e114 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-11-20-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch @@ -0,0 +1,180 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 88EE22FBFED + for ; Wed, 18 Jun 2025 18:21:55 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270917; cv=none; b=P5gHhCOHlQM9p9ku+pEKNmIWE5uT5+S4kV5oxYtsqpEFVlr9fc0D6NpdlPtZ/gPYaVzFIEml4c8bRLDf/rApQ0P+4X3sXKjvOocZZdUlhKVWOp4g7Z3DkjKRfUK4EbZevwf1AguUUNQhOhr+jz43UGTNA9B35kqwXwKY13QXM6M= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270917; c=relaxed/simple; + bh=RUIbYeV38UehaAavCqtUVWreQbqIvMzQPunFd7dVPyE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Nzm7Ku6skORzgLcSoIMVnTsqDhfLNjvSWDaPtFERhQETbGDLm9wp9WChfMfmJr8ewsuksm4tdeOptPipC31yfOalmbU4lZM/tCb9mFe/8h4/7fz9qSGQznPwU1NdTCxhel20eNqlBKW2RqU7JuzYJfo7KMA4C1hPgt5AyDArxK4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Mvd9Vfzt; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com 
+Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Mvd9Vfzt" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270915; x=1781806915; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=RUIbYeV38UehaAavCqtUVWreQbqIvMzQPunFd7dVPyE=; + b=Mvd9VfztrhHTmbjDNIxNZ/GGBzT7vnFVGVF55+uq3hkxTLk1MtmoMPMI + U1wXH7aIGF8CNiN6VHqa6PsrvDZd1CfkDD23bWW38C2q0vuFWUdOR1rsg + nQS1Vx/AFI6+tsMY9N0jzPGLqIf//4y/teLgExUZvlOCdWkv+ZRBOa19l + Q/hMdcFdmtGM8n1dub+WeL8RYjxLFhZ3ifnf7sPjEA0wCKGpnuAk0VE1o + xK9Vp73JH2YBrGV5TiS4D6hJAPIirmfsd5xm4xtPojrdCuhbtq4bZsQTB + QF0NjbAxGEPgdEHlXrMy/bsHGHXWbrQ1UYQwmOgyrnmyyqbinzuyILx3v + g==; +X-CSE-ConnectionGUID: s21awipcQP25JSLcWRQsFw== +X-CSE-MsgGUID: 8CcDA+u7TbaNoc5uJcqMDw== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931593" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931593" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:55 -0700 +X-CSE-ConnectionGUID: SW2wbjbRTeKDU7dLzQyhpA== +X-CSE-MsgGUID: /CMNaZ0xTte1NASZbF1kDA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959969" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:54 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 11/20] sched: Introduce per runqueue task LLC preference counter +Date: Wed, 18 Jun 2025 11:27:59 -0700 +Message-Id: <5334cbd97788ba58938444f6e6f07e6c433a9e1c.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Each runqueue is assigned a static array, where each element indicates +the number of tasks preferring a particular LLC mapped to the +array index. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3 (indexed from 0 to MAX_LLC +across the entire system). With this information, the load balancer can +make better decisions to select the busiest runqueue and migrate tasks +to their preferred LLC domains. + +Note: The static array could be converted to an xarray in the future. 
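The bookkeeping can be pictured with a small user-space sketch; it is illustrative only - MAX_LLC and the index handling are stand-ins, and the real hooks are the account_llc_enqueue()/account_llc_dequeue() changes added below:

/*
 * Toy user-space illustration of the per-runqueue preference counters
 * described above; the real hooks are the account_llc_enqueue() /
 * account_llc_dequeue() changes in this patch.  MAX_LLC and the LLC
 * index are stand-ins here.
 */
#include <stdio.h>

#define MAX_LLC 64

struct toy_rq {
	unsigned int nr_pref_llc[MAX_LLC];	/* tasks preferring LLC i */
};

static void enqueue_pref(struct toy_rq *rq, int llc)
{
	if (llc >= 0 && llc < MAX_LLC)
		rq->nr_pref_llc[llc]++;
}

static void dequeue_pref(struct toy_rq *rq, int llc)
{
	/* never let the counter go negative */
	if (llc >= 0 && llc < MAX_LLC && rq->nr_pref_llc[llc] > 0)
		rq->nr_pref_llc[llc]--;
}

int main(void)
{
	struct toy_rq rq = { { 0 } };

	enqueue_pref(&rq, 3);	/* two tasks preferring LLC3 arrive */
	enqueue_pref(&rq, 3);
	dequeue_pref(&rq, 3);	/* one of them migrates away */

	printf("tasks preferring LLC3 on this rq: %u\n", rq.nr_pref_llc[3]);
	return 0;
}

After two enqueues and one dequeue for LLC3 the counter reads 1, and the dequeue path refuses to drop below zero, matching the "avoid negative counter" safeguard in the patch.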
+ +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 1 + + 2 files changed, 36 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 88ff47194faa..ba62b445bbbb 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1195,16 +1195,45 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static inline int pref_llc_idx(struct task_struct *p) ++{ ++ return llc_idx(p->preferred_llc); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + rq->nr_llc_running += (p->preferred_llc != -1); + rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ ++rq->nr_pref_llc[pref_llc]; + } + + static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + rq->nr_llc_running -= (p->preferred_llc != -1); + rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ /* avoid negative counter */ ++ if (rq->nr_pref_llc[pref_llc] > 0) ++ --rq->nr_pref_llc[pref_llc]; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +@@ -1417,8 +1446,13 @@ void init_sched_mm(struct task_struct *p) + + void reset_llc_stats(struct rq *rq) + { +- if (rq->nr_llc_running) ++ int i; ++ ++ if (rq->nr_llc_running) { ++ for (i = 0; i < MAX_LLC; ++i) ++ rq->nr_pref_llc[i] = 0; + rq->nr_llc_running = 0; ++ } + + rq->nr_pref_llc_running = 0; + } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 6c83a71ac8ca..391ddc0195f8 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1107,6 +1107,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int nr_pref_llc[MAX_LLC]; + #endif + #ifdef CONFIG_NO_HZ_COMMON + #ifdef CONFIG_SMP +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-12-20-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-12-20-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch new file mode 100644 index 0000000..959d150 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-12-20-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch @@ -0,0 +1,139 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id A23042FC004 + for ; Wed, 18 Jun 2025 18:21:56 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270918; cv=none; b=prYV6kiZ/g5CU/Hn41eTTdcf+nsgsejRMF9YlOFqiewBWHbnrWCTP9kxBZckxiRQ1VvQpER8tjN7QgbQ4c0zij9LcckrJVkX+Cpu6SZazEmgx+hiz7gehO5ul8BA5MMZlqJwJ29H/mrdXyUEt1ZTi0aUDrhm0/ch8vBT2HSUIiY= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270918; c=relaxed/simple; + bh=iM3ABT5TB/b+NuSHpzRR+MTBRchvN6UYmnNwwhN0XIw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=UorTPRN8/VKWenu+tPzkgHwZNthp7FChWvQMxBlnU9ZXHkqWwmKJBieJwmssJMgRaYf3QdYXGefsj9yI6+t1biKPLv8Rtoe+CX8vRIiRQisArNkktnElOHhLTlNzPEMBul5M5VTszzGE4dDKi0vulBDpxRNY+j7NieHcUzTr6NQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=dvIH6C13; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="dvIH6C13" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270916; x=1781806916; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=iM3ABT5TB/b+NuSHpzRR+MTBRchvN6UYmnNwwhN0XIw=; + b=dvIH6C13m8eg38MbrhvutO7tNOXJjoru+H/dxlcGqGgIEqt/3lnui+ls + Ax0AdHjwUeIvAv0vKW02IFqBb62BDr2GZeTL5v+KcPecWocJqc8AwLJPW + p6Re4BEEp9c3O5ht3z8Rh9lsWPW/V46p2aLbDPxAIC/89O9nQObGsK7fd + S23TsGqyhc3rr4+MaCrD+MN/GwL1Up9gi5S59wfKXiFZTw2VyXU6i/ieb + p3W93cwc0GbptCXluULNXuQNFNSSINbzdZ13xvmBr8sQkUjlHJttZa6ng + jxyrfFMwPCHG2cw2U4W0FjchU7U1sQuFxJb51T+CMtt3NmPwYWM3XaTsO + Q==; +X-CSE-ConnectionGUID: WGpTAeg9TUi2Z/6fkCAarg== +X-CSE-MsgGUID: 7dA/FD/PQxGrwYmk5u2dRA== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931607" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931607" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:56 -0700 +X-CSE-ConnectionGUID: bXFqvuY3S6+9Zd3SiPYU/Q== +X-CSE-MsgGUID: /yrcI1KcQO2+tdFpydQB5w== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959988" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:55 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 12/20] sched: Calculate the total number of preferred LLC tasks during load balance +Date: Wed, 18 Jun 2025 11:28:00 -0700 +Message-Id: <4a37811c12bbca8cb669904ad67dad3b7e99a552.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During load balancing between LLCs, gather the number of tasks +on each runqueue of a source LLC. + +For example, consider a system with 4 sched groups LLC0, LLC1, +..., LLC3. We are balancing towards LLC3 and LLC0 has 3 tasks +preferring LLC3, LLC1 has 2 tasks preferring LLC3 and LLC2 has +1 task preferring LLC3. LLC0 with most tasks preferring LLC3 +will be chosen as the busiest LLC to pick the tasks from. + +The number of tasks preferring the destination LLC are gathered +from each run queue for a source LLC. 
+ +For example, consider the sched_group LLC0 with two CPUs, CPU0 +and CPU1. On CPU0, 2 tasks prefer to run on LLC3, and on CPU1, +one task prefers LLC3. The total number of tasks preferring +LLC3 in LLC0 is 2 + 1 = 3. + +These statistics enable the load balancer to select tasks from +a sched_group that best aligns tasks with their preferred LLCs. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ba62b445bbbb..99f3cee7b276 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10459,6 +10459,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc[MAX_LLC]; ++#endif + }; + + /* +@@ -10937,6 +10940,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_feat(SCHED_CACHE)) { ++ int j; ++ ++ for (j = 0; j < max_llcs; ++j) ++ sgs->nr_pref_llc[j] += rq->nr_pref_llc[j]; ++ } ++#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-13-20-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-13-20-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch new file mode 100644 index 0000000..06504e2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-13-20-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch @@ -0,0 +1,169 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 514DE2FC01C + for ; Wed, 18 Jun 2025 18:21:57 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270919; cv=none; b=nmqdaWDatMrhBkfjuY3zJis51UO9eAa5aRb1rJdWVySUjW3tfYRxyj1Xkvi+fNpajS95RQl1kNM/Uc2yZ/0qy4Yr0n5zWNCB62WmrDP+LPoiGxGjwroeiGueYQuwTtAOG6KXPOSjIfKn4GM4dEwjzo3+VttU3Mxq2/vSBP1gjkQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270919; c=relaxed/simple; + bh=qbjVDBu0+RR9cVBkMVV/EEaCntO2T94kbrD7rZkO8yA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=XnMghoHFlyLEgu+Wh96kFEDjFxOvIdG1kYivt+ooFQzL7JHqy7Y2tRCCgBmmjgXcODTYXXNN9TLYbc2t4TSsUmKPzAY7GwTWviiMJPDQpqTfLl+bgoY1YdlK7e1ynWuUJ9NxwRUCfO0asQkBgDzntM+cRVZ3lV7tz/MiGA6JuHE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=coUsxsM/; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="coUsxsM/" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270917; x=1781806917; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qbjVDBu0+RR9cVBkMVV/EEaCntO2T94kbrD7rZkO8yA=; + b=coUsxsM/ixxY43XGQ61Df5pL4/CwC9wk7zMajcpFP2eKll8eBCTuGhU1 + TDmuCcPGg5tMI5ZhS8hwToyQBxfmHALCjIHPMRTTN7NWZkIjloQEW5hzf + 8OM/inZ27wXqGy9oddWdVppotNblwyx73zjRCiYiilRwXBDqWBWSJby2f + mn56QOTvTT4uucpyocRsNzlz0tvki+S25xv2mNIZJ1GFIXdpAREJ2ZZvQ + 7hlrzMUkv6jPGBx21WWsulHPgdDzFpzgrgy7hSF/p1HI793hc8L9jfEZv + KcS4ylrKsFNBqYOFqL6hfs7PvPzeeEHhVD6z0cM0apx9kBQCg3dCDjTKK + Q==; +X-CSE-ConnectionGUID: tZPMCIUPT5iVlY3/j2c6gg== +X-CSE-MsgGUID: qbFq77/RT1KHBqt/bJ6sEA== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931622" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931622" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:57 -0700 +X-CSE-ConnectionGUID: btZ9nZi5QCuWcK+hX1K3bQ== +X-CSE-MsgGUID: 68bF5Nl8R3yXe/gNzr0YHw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960011" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:56 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 13/20] sched: Tag the sched group as llc_balance if it has tasks prefer other LLC +Date: Wed, 18 Jun 2025 11:28:01 -0700 +Message-Id: <936c261e6283b8fa8c2d7e60493721f6594ce176.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During load balancing between LLCs, check whether there are tasks +preferring the destination LLC. If so, balance those tasks to the +destination LLC first. + +Tag the sched_group that has tasks preferring to run on other LLCs +(non-local) with the group_llc_balance flag. This way, the load +balancer will later attempt to pull/push these tasks to their +preferred LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 99f3cee7b276..48a090c6e885 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10454,6 +10454,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10818,6 +10819,43 @@ static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, + return false; + } + ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. 
++ */ ++#ifdef CONFIG_SCHED_CACHE ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain *child = env->sd->child; ++ int llc; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ /* only care about task migration among LLCs */ ++ if (child && !(child->flags & SD_SHARE_LLC)) ++ return false; ++ ++ llc = llc_idx(env->dst_cpu); ++ if (sgs->nr_pref_llc[llc] > 0 && ++ _get_migrate_hint(env->src_cpu, env->dst_cpu, ++ 0, true) == mig_allow) ++ return true; ++ ++ return false; ++} ++#else ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} ++#endif ++ + static inline long sibling_imbalance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sg_lb_stats *busiest, +@@ -11000,6 +11038,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + update_sg_if_llc(env, sgs, group); ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (!local_group && llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-14-20-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-14-20-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch new file mode 100644 index 0000000..b51a45b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-14-20-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch @@ -0,0 +1,173 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 378E62FCE1A + for ; Wed, 18 Jun 2025 18:21:58 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270920; cv=none; b=ASxthUrBX6Z81qbsQocazhyRsc3w4KjSaibX0r0fmO/uPp3e/rDgPPSjzptpfRM65fVEmwToh+9nY+/mmo0DpYzyL1hx2NIpj7GZfFXAuMz7beZVpYQkrh2HNY8gyzOoVYXLKhwer420hvK4In5+4ah/Az0BdRL3g8Qqt51fikc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270920; c=relaxed/simple; + bh=cMgIIHv7v3o6hwLDWo7qzlIUuIJ0MkqCBDVD8FqJjxk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=EWQtFnwscFIHJgsfTtDl3lp0BZZED+6rJ4mkP26EG71H0KTH1swfv+jnlTEZByRE4fdbCqlRsxWJKQE1P2n1+rbfG/iNowQx5qRuzogKgl+wAixpKa+2O1Es/si7+y0czWh1Gp4kfwIn7pT6wQ0T9XCYr9+UWmHFzVEOypFRnzs= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BpQBSzXY; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com 
+Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BpQBSzXY" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270918; x=1781806918; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=cMgIIHv7v3o6hwLDWo7qzlIUuIJ0MkqCBDVD8FqJjxk=; + b=BpQBSzXYfQCuuNQrY07i0oxTUklWpFxgVOI3T2jR9DS0iIIrA0R9YJ83 + A9emVOApgFn/Vtg45BEuMxyBgA2TiW4xddTrQIm5gKRorVrmWRGguFZxO + nCW/eG3N/h/KeRxeDQDhVLByESmAqIOMi1VfU1gEw2Y77ZQX7MjFWlXNH + OUxB74DFQr31EirYxBp+QPY8d/5S5jyj2WR0Nq+yVEz01jtl24VXePQsv + wBg0aK2thwbQ070vTU2iI+McTBs29ChLZRqwba7zv7kzEGNCrqmDUK6Zg + bhc7UBL3FUSUKcR5z/7hq6ahpD4cObaTjU9buWfjUFBWJ1k3FZrl+UJpI + g==; +X-CSE-ConnectionGUID: 3Abw6Bt+Qx+VsWy2ZRBi4Q== +X-CSE-MsgGUID: HFq6k7fqT72cxCbUZ+ATOA== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931636" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931636" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:58 -0700 +X-CSE-ConnectionGUID: ul8Mak7HRui0K/AZNo//iw== +X-CSE-MsgGUID: NFSnEWS8RbuYiY0YXOwNnQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960034" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:57 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 14/20] sched: Introduce update_llc_busiest() to deal with groups having preferred LLC tasks +Date: Wed, 18 Jun 2025 11:28:02 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +The load balancer attempts to identify the busiest sched_group with +the highest load and migrates some tasks to a less busy sched_group +to distribute the load across different CPUs. + +When cache-aware scheduling is enabled, the busiest sched_group is +defined as the one with the highest number of tasks preferring to run +on the destination LLC. If the busiest group has llc_balance tag, +the cache aware load balance will be launched. + +Introduce the helper function update_llc_busiest() to identify +such sched group with most tasks preferring the destination LLC. 
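+
+As a minimal sketch, the selection rule boils down to comparing the
+nr_pref_llc[] statistics gathered earlier in this series: a group tagged
+for llc_balance replaces the current busiest candidate only if more of
+its tasks prefer the destination CPU's LLC. Roughly, inside
+update_llc_busiest():
+
+	idx = llc_idx(env->dst_cpu);
+	if (!sgs->group_llc_balance)
+		return false;
+	return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx];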
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 48a090c6e885..ab3d1239d6e4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10848,12 +10848,36 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ int idx; ++ ++ /* Only the candidate with llc_balance need to be taken care of */ ++ if (!sgs->group_llc_balance) ++ return false; ++ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. ++ */ ++ idx = llc_idx(env->dst_cpu); ++ return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx]; ++} + #else + static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + static inline long sibling_imbalance(struct lb_env *env, +@@ -11085,6 +11109,14 @@ static bool update_sd_pick_busiest(struct lb_env *env, + sds->local_stat.group_type != group_has_spare)) + return false; + ++ /* deal with prefer LLC load balance, if failed, fall into normal load balance */ ++ if (update_llc_busiest(env, busiest, sgs)) ++ return true; ++ ++ /* if there is already a busy group, skip the normal load balance */ ++ if (busiest->group_llc_balance) ++ return false; ++ + if (sgs->group_type > busiest->group_type) + return true; + +@@ -11991,9 +12023,11 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. ++ * Also do so if we can move some tasks that prefer the local LLC. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-15-20-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-15-20-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch new file mode 100644 index 0000000..8a0371a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-15-20-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch @@ -0,0 +1,183 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 571022FCFC2 + for ; Wed, 18 Jun 2025 18:21:59 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270921; cv=none; b=Au/8Jdq57vaG6zpgwbWDMgAuubIJAHPnTlsoAXwGoHognpeK/aWGydhvQxM3536916CeCjNp7EH7OJ1j+rscZhPywV3siybixACVKTWmKknqhXSmK9iQja3rE6sE7M29Xk/pKSsaah9dw+I+23TM1f6VNcw/zxHYJJuvbu42ScY= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270921; c=relaxed/simple; + bh=qHzPg7pOAdSMp76icLDVAZqOGBB1+iXyIxtSLlESXR0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=UvCpUAB1cF+/DlV2OPMao3wito5w7p/P7XCMH0zVQpdX7ISAPe7+UYDSTlR5CXGTWmTgG7MhjDnYZB0VvoII8J7ZwG7QGcKzF1ITC8sBcvoSR2nl05LkQA/9d/FIPodpuCurin5CPmjX8yQEcG/PuH0gr8OoT0oFfJQ9PTGL73c= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=FmFBFr6Y; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="FmFBFr6Y" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270919; x=1781806919; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qHzPg7pOAdSMp76icLDVAZqOGBB1+iXyIxtSLlESXR0=; + b=FmFBFr6Yj94ghXvPX0OCmpCOy98F1E44OpxMfBpTuE01Up5uaW3BC4dp + LKM2y1rnTUzZVvsXBUk+n0OQLTLEDTa762KmotgATQyk408JVWd7CeTmx + a5qvM/9qZL3kEomZaLdyET8OE/W/+gBaxg35o/VfV60g6iC8kUriAFUIK + FnOkrknbKEmGtpNieAKL4Z11kucxta5+z0O7A4asBMEslen5BktgpvTBS + OaNU8TXkSuVwDP/FVVia7CCMK0h99Xst5sxVHgrZZz/hLD2iZNRH8LdJs + at3EQiEbK/gun5R/uTtPhw5w9l5xV9iGjFYl1aRfV6FTSQSAQS5govoCZ + w==; +X-CSE-ConnectionGUID: l1gKsZfFSOGB2PlCgz10uA== +X-CSE-MsgGUID: vayaBofJRAyAxRdcDQ65+A== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931650" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931650" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 
11:21:58 -0700 +X-CSE-ConnectionGUID: K6PGgga3SImXHYmzMR4M/w== +X-CSE-MsgGUID: l0kLMoLZQamEcnOsk8UfMg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960060" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:58 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 15/20] sched: Introduce a new migration_type to track the preferred LLC load balance +Date: Wed, 18 Jun 2025 11:28:03 -0700 +Message-Id: <5b9c5a9ddb5b8b16ad20fbba9d41288de95741bc.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce a new migration type named migrate_llc_task to facilitate +cache-aware load balancing. + +After the busiest sched_group is identified as the one that needs +migration due to having most tasks preferring destination LLC, tag the +migration type as the newly introduced migrate_llc_task. During load +balancing, each runqueue within the busiest preferred-LLC sched_group +is checked, and the runqueue with the highest number of tasks preferring +to run on the destination CPU is chosen as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 37 ++++++++++++++++++++++++++++++++++++- + 1 file changed, 36 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ab3d1239d6e4..42222364ad9c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9711,7 +9711,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10143,6 +10144,15 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ /* ++ * Since can_migrate_task() succeed, when we reach here, it means that p ++ * can be migrated even if dst_cpu is not p's preferred_llc, because there ++ * are no idle cores for p to do in-llc load balance. 
++ */ ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11779,6 +11789,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12087,6 +12106,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12195,6 +12218,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_idx(env->dst_cpu); ++ if (!cpus_share_cache(env->dst_cpu, rq->cpu) && ++ busiest_pref_llc < rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; +@@ -12377,6 +12410,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-16-20-sched-Consider-LLC-locality-for-active-balance.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-16-20-sched-Consider-LLC-locality-for-active-balance.patch new file mode 100644 index 0000000..7a821da --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-16-20-sched-Consider-LLC-locality-for-active-balance.patch @@ -0,0 +1,182 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 10A9D2FCFE0 + for ; Wed, 18 Jun 2025 18:22:00 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270921; cv=none; b=P+N3O5LY+75YpxuXAxCziwrhSLux4hrlWTDJ+f8IcG0rzPNOVsWLmvsBedk/2+jdPdqDw1wzG7atrrNckzm5Yyg74mjwX53XlvX5jdoIe7rAPpy4h3viBerEuO6WUgh96xv+h8Lwf0GEIyOdryyHXJYAAnIgHDvJPZSQuXjuTaw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270921; c=relaxed/simple; + bh=v21t2Zwrb1Nh/wAuSkK34PJS3ovpgrP9JoFKO8zGmaY=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ajQTLgiN8PnOKWjIVod1oKlrsTKQVI7OTOmzFXtG+73wlaVQvWGhqk0WMJJXjNaJRuZH21mpCLXZaDLmAm/qdrOjIx4FzgPZZPymYhkjL6psVeBZGckQhtba+IMm3kDkbtawrmcsQMOZ+Zwe6kb6VuOntBmYomWGuxm13ODEJMI= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=eNKkiPXx; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) 
header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="eNKkiPXx" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270920; x=1781806920; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=v21t2Zwrb1Nh/wAuSkK34PJS3ovpgrP9JoFKO8zGmaY=; + b=eNKkiPXxlLeUp9MrxUVzTWIyvBs9ufToarQslgq0bYk5s/czMppb7IvI + h6f2ZKRMsVlGKRrHwIGJjWC2qE8qaTMXUEdQj3r8D+h0SN43VWG+hMsrT + RYlk/KgtxAMp9QrgVWboKznJ7vUI8egFwzA9KbPGigmWN87qDwRCRj+PI + 1DVrQqjN70cltKuzihCwLLBClt53KvfL9NiCkywt0JuRcLKIP2iJEokrX + ajvz3jtkrzeg38383zwJHxSMbB8WBku1/QPExsvxhxX+x84ckOTN+YWR0 + r0W/M6eCut34E/W0ufbrbXos01UbUqSL73VaxS852oWlwl340CVxjghCK + g==; +X-CSE-ConnectionGUID: WN1cDgRBRjePAqtNlvvThA== +X-CSE-MsgGUID: GG9U87oTRii/aNSvr7x6sQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931663" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931663" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:59 -0700 +X-CSE-ConnectionGUID: RBeMMmkDTB+RrdOWQJsvcA== +X-CSE-MsgGUID: 9zB/bWW2R3GKVE6WyIDfGA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960082" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:59 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 16/20] sched: Consider LLC locality for active balance +Date: Wed, 18 Jun 2025 11:28:04 -0700 +Message-Id: <1ce821178bf178ce841ea94bb8139fd9a197b86b.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If busiest run queue has only one task, active balance is enlisted +to actually move the task. However, before moving the task, +we should consider whether we are moving the task from its preferred +LLC. + +Don't move the single running task in a run queue to another LLC, if +we are moving it from its desired LLC, or moving it will cause too much +imbalance between the LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 48 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 42222364ad9c..3a8f6fc52055 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -12294,10 +12294,43 @@ imbalanced_active_balance(struct lb_env *env) + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return 0; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return 0; ++ /* ++ * All tasks want to stay put. 
Move only if LLC is ++ * heavily loaded or don't pull a task from its ++ * preferred CPU if it is the only one running. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable && ++ (env->src_rq->nr_running <= 1 || ++ _get_migrate_hint(env->src_cpu, env->dst_cpu, ++ 0, false) == mig_forbid)) ++ return 1; ++ ++ return 0; ++} ++#else ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return 0; ++} ++#endif ++ + static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12317,7 +12350,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +@@ -12762,9 +12796,20 @@ static int active_load_balance_cpu_stop(void *data) + goto out_unlock; + + /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; ++ if (busiest_rq->nr_running <= 1) { ++#ifdef CONFIG_SCHED_CACHE ++ int llc = llc_idx(target_cpu); + ++ if (!sched_feat(SCHED_CACHE)) ++ goto out_unlock; ++ ++ if (llc < 0) ++ goto out_unlock; ++ /* don't migrate if task does not prefer target */ ++ if (busiest_rq->nr_pref_llc[llc] < 1) ++#endif ++ goto out_unlock; ++ } + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-17-20-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-17-20-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch new file mode 100644 index 0000000..abd082e --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-17-20-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch @@ -0,0 +1,193 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1B6EE2F5476 + for ; Wed, 18 Jun 2025 18:22:00 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270922; cv=none; b=MFTVX6qRlvnmcnypqJAnNUvjVFnCj+BYWehMpFkjTBU+YkJWvxgKrAJGcPlOnWFlULsIE0HJF5adxlSs+4NcBPZqPwLUEpp3DyzPS31YqqskBVjcvxtKVfWLg48hqUzzgp9v2j0fKtLs13VTywRh7Dh2csNg/XDFtX5FiqAZvbY= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270922; c=relaxed/simple; + bh=UQoBqN95xnQudsJ44o6C5oD7PSQHCIXgA6EcclP4fcc=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=nDrGI2/xX7+VWoIaZPAxSQ62lLIbd2FIobNRajXdY8S5xE+UkDaqRZV9oSWRZyNefE1ch1lBYfvNcBa+4ghO/kDKZP04UYkGh8gv4TzDurIYTenC4Ns0bIWzJq0lXhvHvCBGCuNffM2eKs5JKCc2O0pb2ptRCWrh8hgx0OzEDJU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=gwPPS4Py; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none 
smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="gwPPS4Py" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270921; x=1781806921; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=UQoBqN95xnQudsJ44o6C5oD7PSQHCIXgA6EcclP4fcc=; + b=gwPPS4PyJxuWZjpqppzzQNy6oNFS2apouvtFBoztM3FRMsIZNoXOCYZS + 4ZGsvXZ1GBKYWiosJJLy3Afvyz6rPjZGp6kTCMd3SEk6QElyc++ZHbpeH + U+87HjtVKO3MPeHlo5eycdT091abyiOHsWsk02bh++KLCXtrZChonH3SN + EXN9QhBQhTsKkKvGvzRjZJXx+5ylM+EmAu0SlP86VdBwSp8bjkVa10OXt + tZ/lEtGqbQUS8nYQOIluXmFXapZZs3teRfaTMOdaD+49KrQPZjXrq2Ex4 + mkaL045bqiOr9hiagrNO4Meh5T5RKF9itXcTr61PJGxchTt1XfKGFw5XC + g==; +X-CSE-ConnectionGUID: Yl7CRBbAQ5iJFKjFcNyJkw== +X-CSE-MsgGUID: JmWjpB8BQ8SyduLVhShd6A== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931677" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931677" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:22:00 -0700 +X-CSE-ConnectionGUID: PriWcHtzQ+CMXjb98vjSvg== +X-CSE-MsgGUID: 4FJAHvYoQNmNJWCnl/pw1g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960102" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:22:00 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 17/20] sched: Consider LLC preference when picking tasks from busiest queue +Date: Wed, 18 Jun 2025 11:28:05 -0700 +Message-Id: <9d28a5a892f0413a96498bbf711eaa9b354ca895.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +When picking tasks from busiest queue for load balance, we currently +do not consider LLC preference. + +Order the task in the busiest queue such that we picked the tasks in the +following order: + 1. tasks that prefer dst cpu's LLC + 2. tasks that have no preference in LLC + 3. tasks that prefer LLC other than the ones they are on + 4. tasks that prefer the LLC that they are currently on + +This will allow tasks better chances to wind up in its preferred LLC. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3a8f6fc52055..c9db32c2df63 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10056,6 +10056,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. 
++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". +@@ -10064,7 +10126,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10083,6 +10145,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-18-20-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-18-20-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch new file mode 100644 index 0000000..e8d24ba --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-18-20-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch @@ -0,0 +1,155 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9EB802F5473 + for ; Wed, 18 Jun 2025 18:22:02 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270924; cv=none; b=UcUwkmFrXm1QZiHhRd9nLRLaJcXdFq15Quaiz8ZBN1nnL9SrnbVlLxUTqIyE9whgxAiEKu2+OgsC5VVcnjsA8wMU0p6jVlFPPQ7qmeBTzB6VM8FM85LAnq7ENrafpJvlDPCDIM9KyyIse0EZlGPKURu465AkRFJXtSxWwqh6Huo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270924; c=relaxed/simple; + bh=LvkJ+My0UyW3xwEVx+qylSLEWrcmxbixkOEWZ10FTps=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=OB9jn7VvZ/rPmdAtTnDXoDgCyo72RKBXKCzeOdkrfLO6e85bK8hUMUUUOJEZalXFcdLxjZn/HiIycPeaDEtK0UPQwahP6NKXPIqG2XHiIkz2fC4a83E4onHZ8UGT50ZPOtl9Nzhvr5GlVAWllkK6TCvxOkmzUawTYoseeve+8+0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=GlYX1oli; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="GlYX1oli" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270922; x=1781806922; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=LvkJ+My0UyW3xwEVx+qylSLEWrcmxbixkOEWZ10FTps=; + b=GlYX1olin1BGUE11MHHJj9SNVdaVYXouh/gB+4N21ppPm9OJp1+kXEIW + jYKsD82DRpSUkmfbSWF6qlHl4i9BpZM+/sHr7a8DHXRrYaxO/Rj5jbXOw + +J6epKBAznqSQbDha14UPGm8Z7tWVAxbi3VVbxuqnoizc+7JuMcPHjd8u + wYx2yGauKvj0wQL3aVlaHP9Wp4NxgHk3BFgHplWMZc9XEc1wUSkwxQzqV + T9whL5z31EzCyXebtiORr7A5MGjv8KhiLdLGBw82yyfUiT62ZER6bvm4y + FqiKWIa9GZLlg4Z0hqOBSzlk3RseQZUvCgYBNujCvQGV07FE+Rdzvhqlv + Q==; +X-CSE-ConnectionGUID: 4oIUGMDbTxGXwK/Erd3pww== +X-CSE-MsgGUID: tbl/Cv7HTzy00jIWW42WkQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931694" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931694" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:22:02 -0700 +X-CSE-ConnectionGUID: dreUdLK/Snq9Tth5AMDo9w== +X-CSE-MsgGUID: FI0VhmRdTFe0V4qTkbZ9qg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960127" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:22:00 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 18/20] sched: Do not migrate task if it is moving out of its preferred LLC +Date: Wed, 18 Jun 2025 11:28:06 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +In the final step of task migration during load balancing, +can_migrate_task() is used to determine whether a task can +be moved to the destination. If the task has an LLC preference, +consider this preference when moving it out of its preferred LLC. +With this check in place, there is no need to retain the task's +cache-hot CPU check in task_hot(); remove it accordingly. + +Besides, add more checks in detach_tasks() to avoid choosing +tasks that prefer their current LLC. 
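+
+A minimal sketch of the two added checks (get_migrate_hint(), llc_id()
+and p->preferred_llc come from earlier patches in this series):
+
+	/* can_migrate_task(): do not pull p out of its preferred LLC */
+	if (get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid)
+		return 0;
+
+	/* detach_tasks(): the source list was ordered by order_tasks_by_llc(),
+	 * so once a task preferring the source LLC is reached, stop detaching */
+	if (p->preferred_llc != -1 && llc_id(env->src_cpu) == p->preferred_llc)
+		break;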
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 28 +++++++++++++++++----------- + 1 file changed, 17 insertions(+), 11 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index c9db32c2df63..e342524481ed 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9787,17 +9787,6 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (sysctl_sched_migration_cost == 0) + return 0; + +-#ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) { +- /* +- * XXX things like Skylake have non-inclusive L3 and might not +- * like this L3 centric view. What to do about L2 stickyness ? +- */ +- return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ > +- per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ; +- } +-#endif +- + delta = rq_clock_task(env->src_rq) - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +@@ -9992,6 +9981,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_feat(SCHED_CACHE) && ++ get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10252,6 +10247,17 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + break; + ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Don't detach more tasks if remaining tasks want to stay: ++ * The tasks have already been sorted by order_tasks_by_llc(), ++ * they are tasks that prefer the current LLC. ++ */ ++ if (sched_feat(SCHED_CACHE) && p->preferred_llc != -1 && ++ llc_id(env->src_cpu) == p->preferred_llc) ++ break; ++#endif ++ + continue; + next: + if (p->sched_task_hot) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-19-20-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-19-20-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch new file mode 100644 index 0000000..d0ac1f3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-19-20-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch @@ -0,0 +1,185 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 36AF22FD899 + for ; Wed, 18 Jun 2025 18:22:03 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270924; cv=none; b=rqcnz4lr519vfb2X2M9DeLf5O+erdaRAJSoZ2E9S2odeoi76dMp/OZU1NB58Qjs+uncaH3qLdMqonjZ3kQl6htfGCrXwxMWgW2YZT8y6e/FYDEkDc76bmoSAGQbtAHi6zdd/a0QMbqOAiPFDTuQ/Av7Zd2Z3POZHWg4NK1E6gso= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270924; c=relaxed/simple; + bh=RBEoEDu2A+FwLyh++5jokg1I7TUhNUbkFf08/S4koCY=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=u++l74ct5wI88qe+ZhTxahgVxibbraZWIJUXAiGrfYKDyKJFt9lgn/tcRZtplKvjvmasXpdfWkYumF3dDOdoABJmyCFInhfV661Idkc/VE7bFkTegUBPg18Oyk856hhDCaV4uOx+JU6Wj+pkcN21wugnWmWN1myHTXggm5UEej4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none 
smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=OxZqIyLs; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="OxZqIyLs" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270923; x=1781806923; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=RBEoEDu2A+FwLyh++5jokg1I7TUhNUbkFf08/S4koCY=; + b=OxZqIyLsJSroJ2i9MdsrROxH70E3NsG6kvAKifMPTRvyyrGMmXsUYxD3 + CPzVxg8vQa7ptoVtXf4Q8V1g+8odAq77fXL+wB1Yz3cOwa5oIFmdnB5YW + BortfRVvhpa+xJaYKcO1/iYGTjoGzZlBvQ4DqinF+ijFvIH3FXFHUY7Yw + dnqNv+RspKaZf5GkEERusnRttKQTb+Ybdex2YDNmVMcMaLi3YqDVwQEd+ + zvko7J7nf4iHqzRFD8LqvQWYwg1aAy+yQ4qBaHEh90PM1XJHSY8jbNW6c + NQxsij/EBLJiRtqClKTlBCTYmaEChOOO3OgR1tIMqHZLc+QmYryVKIQJ7 + Q==; +X-CSE-ConnectionGUID: DbGqwajpS0SQ1kimRCqNOw== +X-CSE-MsgGUID: IxguJ0uxQf2FmLjSrT5dxA== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931707" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931707" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:22:02 -0700 +X-CSE-ConnectionGUID: 55fSL6zAQZGOckEnYdnVsw== +X-CSE-MsgGUID: LUEEbBOqRk6qNZaG5v1xSg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960148" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:22:02 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 19/20] sched: Introduce SCHED_CACHE_LB to control cache aware load balance +Date: Wed, 18 Jun 2025 11:28:07 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce the SCHED_CACHE_LB sched feature to enable or disable +cache aware load balance in the schduler. 
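+
+Every cache-aware load-balance path is gated on both feature flags, so
+the gated paths bail out along the lines of:
+
+	if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB))
+		return false;	/* fall back to plain load balancing */
+
+This lets cache-aware load balancing be switched off at run time by
+clearing SCHED_CACHE_LB in the scheduler features debugfs file
+(typically /sys/kernel/debug/sched/features on kernels that expose it),
+without rebuilding the kernel.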
+ +Co-developed-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 18 ++++++++++-------- + kernel/sched/features.h | 1 + + 2 files changed, 11 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e342524481ed..af742601f2d7 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9982,7 +9982,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + return 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && + get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) + return 0; + #endif +@@ -10068,7 +10068,7 @@ static struct list_head + LIST_HEAD(no_pref_llc); + LIST_HEAD(pref_other_llc); + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return tasks; + + if (cpus_share_cache(env->dst_cpu, env->src_cpu)) +@@ -10253,7 +10253,8 @@ static int detach_tasks(struct lb_env *env) + * The tasks have already been sorted by order_tasks_by_llc(), + * they are tasks that prefer the current LLC. + */ +- if (sched_feat(SCHED_CACHE) && p->preferred_llc != -1 && ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ p->preferred_llc != -1 && + llc_id(env->src_cpu) == p->preferred_llc) + break; + #endif +@@ -10910,7 +10911,7 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *child = env->sd->child; + int llc; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return false; + + if (env->sd->flags & SD_SHARE_LLC) +@@ -11021,7 +11022,8 @@ static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *sd = env->sd->child; + struct sched_domain_shared *sd_share; + +- if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE || ++ !sched_feat(SCHED_CACHE_LB)) + return; + + /* only care the sched domain that spans 1 LLC */ +@@ -11083,7 +11085,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + *sg_overutilized = 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE)) { ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB)) { + int j; + + for (j = 0; j < max_llcs; ++j) +@@ -12368,7 +12370,7 @@ imbalanced_active_balance(struct lb_env *env) + static inline bool + break_llc_locality(struct lb_env *env) + { +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return 0; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) +@@ -12870,7 +12872,7 @@ static int active_load_balance_cpu_stop(void *data) + #ifdef CONFIG_SCHED_CACHE + int llc = llc_idx(target_cpu); + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + goto out_unlock; + + if (llc < 0) +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index d2af7bfd36bf..11dbd74cd365 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -88,6 +88,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + SCHED_FEAT(SIS_UTIL, true) + + SCHED_FEAT(SCHED_CACHE, true) ++SCHED_FEAT(SCHED_CACHE_LB, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-20-20-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-20-20-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch new file mode 100644 index 0000000..1ae2586 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-20-20-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch @@ -0,0 +1,136 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 37FAE2FE315 + for ; Wed, 18 Jun 2025 18:22:04 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270929; cv=none; b=onyBzk/JV7TsJ4rqzmPQxAynG5u8Uiv7NZBFqjIXX/vDeaVGTR6XM7u2t1DQFh6/8C8E442NQANkusHEp/W0G7MKp4l8bRLhTJDwy/WN6tGk0cfY5IF9GwVw8LyU0L2HDfqYL9FKb8t0ShAVnCE5wIeOC+RJNYKLspjyv345oV0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270929; c=relaxed/simple; + bh=GCa+xOiJ3NtDwAxf3UTuM7FdSBlF1t2VJbPM88NipeI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=lTB8OKjeVhk51AV9Z+yat4UlgOV4jPlwKTO3U5BqELX9+KB8jLro3KX85VFITsktx5Jba308vcVyotxvDMbwXzp5+qGVCYSvBcIyb/4B4Tot6SKFpcXHB6THYMdfQWeqbaG5Ds7ceyhFMgv2UbsDTiF0uZ6QidbwEUWatyQKrK0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=IXZaT3cj; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="IXZaT3cj" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270924; x=1781806924; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=GCa+xOiJ3NtDwAxf3UTuM7FdSBlF1t2VJbPM88NipeI=; + b=IXZaT3cj2WJHpQsa0eOY4RKLxD0XHxWtW3DBUY3jLShIsyHbPv6kV6PS + wBTZARncyqd81MhW2Dh6tAi77Kk2I1a86TYlMhKSh30I/NZi9Ohg6RQEG + B2e6bpm5YRM81JbZP0vAzdhRwJTJ6z+fezdmgGlo8EIBWlV8PKGUd4V1y + Q1K/xPtqmRaKI9stHeDWuocbpuMmO319jhINNuhdgtWOIH748A4vI8EIM + vIBaj9+wydAwFrFxiz/O6rePd8/Uv/i5oca2c3tnOmtRZT0khUyTei51V + l/RHQAM9KvyJRqc9LiIGKWsg+Dg76/187VJoJGXKdR6viVZo8MkXQOzWR + g==; +X-CSE-ConnectionGUID: bOCOGOeLQ5mxgMbCQRgg+A== +X-CSE-MsgGUID: LC2a0LmnQ2qip+retIyBQw== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931720" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931720" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:22:03 -0700 +X-CSE-ConnectionGUID: rby7BZr4SRm2YmXdsRLRtQ== +X-CSE-MsgGUID: N6pda5x3S0GkUhyJrY38ZA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960173" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by 
fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:22:03 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 20/20] sched: Introduce SCHED_CACHE_WAKE to control LLC aggregation on wake up +Date: Wed, 18 Jun 2025 11:28:08 -0700 +Message-Id: <1f8e7ec2d84a94ac0a31ca6182218ffaf7e166df.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce SCHED_CACHE_WAKE feature to enable or disable cache-aware +wake up. Disable this feature by default because cache-aware wakeup +is overly aggressive in stacking wakees of the same process on the same LLC, +if they are frequently woken up. + +The wake ups can be much more frequent than load balances, adding +much overhead when load balance alone for LLC aggregation is sufficient. + +Co-developed-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 6 +++++- + kernel/sched/features.h | 1 + + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index af742601f2d7..32c90fab0d63 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9028,7 +9028,7 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_WAKE)) + return prev_cpu; + + if (!mm || p->nr_cpus_allowed == 1) +@@ -9041,6 +9041,10 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + if (cpus_share_cache(cpu, prev_cpu)) + return prev_cpu; + ++ if (_get_migrate_hint(prev_cpu, cpu, ++ task_util(p), true) == mig_forbid) ++ return prev_cpu; ++ + if (static_branch_likely(&sched_numa_balancing) && + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { + /* +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 11dbd74cd365..44b408cf0dd4 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -89,6 +89,7 @@ SCHED_FEAT(SIS_UTIL, true) + + SCHED_FEAT(SCHED_CACHE, true) + SCHED_FEAT(SCHED_CACHE_LB, true) ++SCHED_FEAT(SCHED_CACHE_WAKE, false) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.16/0001-bore.patch.skip b/sys-kernel/gentoo-sources-6.16/0001-bore.patch.skip new file mode 100644 index 0000000..d4243e9 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0001-bore.patch.skip @@ -0,0 +1,1032 @@ +From 22d4c29e7e688b17f8c7b25324c6b4bbfb07d52e Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Mon, 21 Jul 2025 21:13:03 +0200 +Subject: [PATCH] bore + +Signed-off-by: Piotr Gorski +--- + include/linux/sched.h | 18 ++ + include/linux/sched/bore.h | 42 ++++ + init/Kconfig | 17 ++ + kernel/Kconfig.hz | 17 ++ + kernel/fork.c | 8 + + kernel/sched/Makefile | 1 + + kernel/sched/bore.c | 425 +++++++++++++++++++++++++++++++++++++ + kernel/sched/core.c | 8 + + kernel/sched/debug.c | 61 +++++- + kernel/sched/fair.c | 88 +++++++- + kernel/sched/sched.h | 9 + + 11 files changed, 690 insertions(+), 4 deletions(-) + create mode 100644 include/linux/sched/bore.h + create mode 100644 kernel/sched/bore.c + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index aa9c5be7a..197a58414 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -565,6 +565,14 @@ struct sched_statistics { + #endif /* CONFIG_SCHEDSTATS */ + } ____cacheline_aligned; + ++#ifdef CONFIG_SCHED_BORE ++struct sched_burst_cache { ++ u32 value; ++ u32 count; ++ u64 timestamp; ++}; ++#endif // CONFIG_SCHED_BORE ++ + struct sched_entity { + /* For load-balancing: */ + struct load_weight load; +@@ -584,6 +592,16 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; + u64 vruntime; ++#ifdef CONFIG_SCHED_BORE ++ u64 burst_time; ++ u32 prev_burst_penalty; ++ u32 curr_burst_penalty; ++ u32 burst_penalty; ++ u8 burst_score; ++ u8 burst_count; ++ struct sched_burst_cache child_burst; ++ struct sched_burst_cache group_burst; ++#endif // CONFIG_SCHED_BORE + s64 vlag; + u64 slice; + +diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h +new file mode 100644 +index 000000000..55c19da46 +--- /dev/null ++++ b/include/linux/sched/bore.h +@@ -0,0 +1,42 @@ ++ ++#include ++#include ++ ++#ifndef _LINUX_SCHED_BORE_H ++#define _LINUX_SCHED_BORE_H ++#define SCHED_BORE_AUTHOR "Masahito Suzuki" ++#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" ++ ++#define SCHED_BORE_VERSION "6.1.0" ++ ++#ifdef CONFIG_SCHED_BORE ++extern u8 __read_mostly sched_bore; ++extern u8 __read_mostly sched_burst_exclude_kthreads; ++extern u8 __read_mostly sched_burst_smoothness; ++extern u8 __read_mostly sched_burst_fork_atavistic; ++extern u8 __read_mostly sched_burst_parity_threshold; ++extern u8 __read_mostly sched_burst_penalty_offset; ++extern uint __read_mostly sched_burst_penalty_scale; ++extern uint __read_mostly sched_burst_cache_stop_count; ++extern uint __read_mostly sched_burst_cache_lifetime; ++extern uint __read_mostly sched_deadline_boost_mask; ++ ++extern void update_burst_score(struct sched_entity *se); ++extern void update_curr_bore(u64 delta_exec, struct sched_entity *se); ++ ++extern void restart_burst(struct sched_entity *se); ++extern void restart_burst_rescale_deadline(struct sched_entity *se); ++ ++extern int sched_bore_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ ++extern void sched_clone_bore( ++ struct task_struct *p, struct task_struct *parent, u64 clone_flags, u64 now); ++ ++extern void reset_task_bore(struct task_struct *p); ++extern void sched_bore_init(void); ++ ++extern void reweight_entity(struct cfs_rq *cfs_rq, ++ 
struct sched_entity *se, unsigned long weight, bool no_update_curr); ++#endif // CONFIG_SCHED_BORE ++#endif // _LINUX_SCHED_BORE_H +diff --git a/init/Kconfig b/init/Kconfig +index 666783eb5..9f32a8c27 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1381,6 +1381,23 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ If unsure, say Y here. ++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index ce1435cb0..b93d1f657 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -55,5 +55,22 @@ config HZ + default 300 if HZ_300 + default 1000 if HZ_1000 + ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = ++ 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. ++ + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/fork.c b/kernel/fork.c +index 1ee8eb11f..2eaaaf9e8 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -115,6 +115,10 @@ + /* For dup_mmap(). */ + #include "../mm/internal.h" + ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif // CONFIG_SCHED_BORE ++ + #include + + #define CREATE_TRACE_POINTS +@@ -2313,6 +2317,10 @@ __latent_entropy struct task_struct *copy_process( + * Need tasklist lock for parent etc handling! 
+ */ + write_lock_irq(&tasklist_lock); ++#ifdef CONFIG_SCHED_BORE ++ if (likely(p->pid)) ++ sched_clone_bore(p, current, clone_flags, p->start_time); ++#endif // CONFIG_SCHED_BORE + + /* CLONE_PARENT re-uses the old parent */ + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 8ae86371d..b688084bc 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -37,3 +37,4 @@ obj-y += core.o + obj-y += fair.o + obj-y += build_policy.o + obj-y += build_utility.o ++obj-$(CONFIG_SCHED_BORE) += bore.o +diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c +new file mode 100644 +index 000000000..e7f80d91c +--- /dev/null ++++ b/kernel/sched/bore.c +@@ -0,0 +1,425 @@ ++/* ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2025 Masahito Suzuki ++ */ ++#include ++#include ++#include ++#include "sched.h" ++ ++#ifdef CONFIG_SCHED_BORE ++u8 __read_mostly sched_bore = 1; ++u8 __read_mostly sched_burst_exclude_kthreads = 1; ++u8 __read_mostly sched_burst_smoothness = 40; ++u8 __read_mostly sched_burst_fork_atavistic = 2; ++u8 __read_mostly sched_burst_parity_threshold = 2; ++u8 __read_mostly sched_burst_penalty_offset = 24; ++uint __read_mostly sched_burst_penalty_scale = 3180; ++uint __read_mostly sched_burst_cache_stop_count = 64; ++uint __read_mostly sched_burst_cache_lifetime = 75000000; ++uint __read_mostly sched_deadline_boost_mask = ENQUEUE_INITIAL ++ | ENQUEUE_WAKEUP; ++static int __maybe_unused maxval_6_bits = 63; ++static int __maybe_unused maxval_8_bits = 255; ++static int __maybe_unused maxval_12_bits = 4095; ++ ++#define BURST_PENALTY_SHIFT 12 ++#define MAX_BURST_PENALTY ((40U << BURST_PENALTY_SHIFT) - 1) ++ ++static u32 log2p1_u64_u32fp(u64 v, u8 fp) { ++ if (!v) return 0; ++ u32 exponent = fls64(v); ++ u32 mantissa = (u32)(v << (64 - exponent) << 1 >> (64 - fp)); ++ return exponent << fp | mantissa; ++} ++ ++static inline u32 calc_burst_penalty(u64 burst_time) { ++ u32 greed, tolerance, penalty, scaled_penalty; ++ ++ greed = log2p1_u64_u32fp(burst_time, BURST_PENALTY_SHIFT); ++ tolerance = sched_burst_penalty_offset << BURST_PENALTY_SHIFT; ++ penalty = max(0, (s32)(greed - tolerance)); ++ scaled_penalty = penalty * sched_burst_penalty_scale >> 10; ++ ++ return min(MAX_BURST_PENALTY, scaled_penalty); ++} ++ ++static inline u64 __scale_slice(u64 delta, u8 score) ++{return mul_u64_u32_shr(delta, sched_prio_to_wmult[score], 22);} ++ ++static inline u64 __unscale_slice(u64 delta, u8 score) ++{return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10);} ++ ++static void reweight_task_by_prio(struct task_struct *p, int prio) { ++ struct sched_entity *se = &p->se; ++ unsigned long weight = scale_load(sched_prio_to_weight[prio]); ++ ++ reweight_entity(cfs_rq_of(se), se, weight, true); ++ se->load.inv_weight = sched_prio_to_wmult[prio]; ++} ++ ++static inline u8 effective_prio(struct task_struct *p) { ++ u8 prio = p->static_prio - MAX_RT_PRIO; ++ if (likely(sched_bore)) ++ prio += p->se.burst_score; ++ return min(39, prio); ++} ++ ++void update_burst_score(struct sched_entity *se) { ++ if (!entity_is_task(se)) return; ++ struct task_struct *p = task_of(se); ++ u8 prev_prio = effective_prio(p); ++ ++ u8 burst_score = 0; ++ if (!((p->flags & PF_KTHREAD) && likely(sched_burst_exclude_kthreads))) ++ burst_score = se->burst_penalty >> BURST_PENALTY_SHIFT; ++ se->burst_score = burst_score; ++ ++ u8 new_prio = effective_prio(p); ++ if (new_prio != prev_prio) ++ reweight_task_by_prio(p, new_prio); ++} 
++ ++void update_curr_bore(u64 delta_exec, struct sched_entity *se) { ++ if (!entity_is_task(se)) return; ++ ++ se->burst_time += delta_exec; ++ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); ++ if (se->curr_burst_penalty > se->prev_burst_penalty) ++ se->burst_penalty = se->prev_burst_penalty + ++ (se->curr_burst_penalty - se->prev_burst_penalty) / se->burst_count; ++ update_burst_score(se); ++} ++ ++static inline u32 binary_smooth(u32 new, u32 old, u8 dumper) { ++ u32 abs_diff = (new > old)? (new - old): (old - new); ++ u32 adj_diff = (abs_diff / dumper) + ((abs_diff % dumper) != 0); ++ return (new > old)? (old + adj_diff): (old - adj_diff); ++} ++ ++static void __restart_burst(struct sched_entity *se) { ++ se->prev_burst_penalty = binary_smooth( ++ se->curr_burst_penalty, se->prev_burst_penalty, se->burst_count); ++ se->burst_time = 0; ++ se->curr_burst_penalty = 0; ++ ++ u8 smoothness = sched_burst_smoothness; ++ if (se->burst_count < smoothness) ++ se->burst_count++; ++ else if (unlikely(se->burst_count > smoothness)) ++ se->burst_count = smoothness; ++} ++ ++inline void restart_burst(struct sched_entity *se) { ++ __restart_burst(se); ++ se->burst_penalty = se->prev_burst_penalty; ++ update_burst_score(se); ++} ++ ++void restart_burst_rescale_deadline(struct sched_entity *se) { ++ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; ++ struct task_struct *p = task_of(se); ++ u8 prev_prio = effective_prio(p); ++ restart_burst(se); ++ u8 new_prio = effective_prio(p); ++ if (prev_prio > new_prio) { ++ wremain = __unscale_slice(abs(vremain), prev_prio); ++ vscaled = __scale_slice(wremain, new_prio); ++ if (unlikely(vremain < 0)) ++ vscaled = -vscaled; ++ se->deadline = se->vruntime + vscaled; ++ } ++} ++ ++static inline bool task_is_bore_eligible(struct task_struct *p) ++{return p && p->sched_class == &fair_sched_class && !p->exit_state;} ++ ++static inline void reset_task_weights_bore(void) { ++ struct task_struct *task; ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ write_lock_irq(&tasklist_lock); ++ for_each_process(task) { ++ if (!task_is_bore_eligible(task)) continue; ++ rq = task_rq(task); ++ rq_pin_lock(rq, &rf); ++ update_rq_clock(rq); ++ reweight_task_by_prio(task, effective_prio(task)); ++ rq_unpin_lock(rq, &rf); ++ } ++ write_unlock_irq(&tasklist_lock); ++} ++ ++int sched_bore_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) { ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ reset_task_weights_bore(); ++ ++ return 0; ++} ++ ++#define for_each_child(p, t) \ ++ list_for_each_entry(t, &(p)->children, sibling) ++ ++static inline u32 count_entries_upto2(struct list_head *head) { ++ struct list_head *next = head->next; ++ return (next != head) + (next->next != head); ++} ++ ++static inline bool burst_cache_expired(struct sched_burst_cache *bc, u64 now) ++{return (s64)(bc->timestamp + sched_burst_cache_lifetime - now) < 0;} ++ ++static void update_burst_cache(struct sched_burst_cache *bc, ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ u32 avg = cnt ? 
sum / cnt : 0; ++ bc->value = max(avg, p->se.burst_penalty); ++ bc->count = cnt; ++ bc->timestamp = now; ++} ++ ++static inline void update_child_burst_direct(struct task_struct *p, u64 now) { ++ u32 cnt = 0, sum = 0; ++ struct task_struct *child; ++ ++ for_each_child(p, child) { ++ if (!task_is_bore_eligible(child)) continue; ++ cnt++; ++ sum += child->se.burst_penalty; ++ } ++ ++ update_burst_cache(&p->se.child_burst, p, cnt, sum, now); ++} ++ ++static inline u32 inherit_burst_direct( ++ struct task_struct *p, u64 now, u64 clone_flags) { ++ struct task_struct *parent = p; ++ struct sched_burst_cache *bc; ++ ++ if (clone_flags & CLONE_PARENT) ++ parent = parent->real_parent; ++ ++ bc = &parent->se.child_burst; ++ if (burst_cache_expired(bc, now)) ++ update_child_burst_direct(parent, now); ++ ++ return bc->value; ++} ++ ++static void update_child_burst_topological( ++ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { ++ u32 cnt = 0, dcnt = 0, sum = 0; ++ struct task_struct *child, *dec; ++ struct sched_burst_cache *bc __maybe_unused; ++ ++ for_each_child(p, child) { ++ dec = child; ++ while ((dcnt = count_entries_upto2(&dec->children)) == 1) ++ dec = list_first_entry(&dec->children, struct task_struct, sibling); ++ ++ if (!dcnt || !depth) { ++ if (!task_is_bore_eligible(dec)) continue; ++ cnt++; ++ sum += dec->se.burst_penalty; ++ continue; ++ } ++ bc = &dec->se.child_burst; ++ if (!burst_cache_expired(bc, now)) { ++ cnt += bc->count; ++ sum += bc->value * bc->count; ++ if (sched_burst_cache_stop_count <= cnt) break; ++ continue; ++ } ++ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); ++ } ++ ++ update_burst_cache(&p->se.child_burst, p, cnt, sum, now); ++ *acnt += cnt; ++ *asum += sum; ++} ++ ++static inline u32 inherit_burst_topological( ++ struct task_struct *p, u64 now, u64 clone_flags) { ++ struct task_struct *anc = p; ++ struct sched_burst_cache *bc; ++ u32 cnt = 0, sum = 0; ++ u32 base_child_cnt = 0; ++ ++ if (clone_flags & CLONE_PARENT) { ++ anc = anc->real_parent; ++ base_child_cnt = 1; ++ } ++ ++ for (struct task_struct *next; ++ anc != (next = anc->real_parent) && ++ count_entries_upto2(&anc->children) <= base_child_cnt;) { ++ anc = next; ++ base_child_cnt = 1; ++ } ++ ++ bc = &anc->se.child_burst; ++ if (burst_cache_expired(bc, now)) ++ update_child_burst_topological( ++ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); ++ ++ return bc->value; ++} ++ ++static inline void update_tg_burst(struct task_struct *p, u64 now) { ++ struct task_struct *task; ++ u32 cnt = 0, sum = 0; ++ ++ for_each_thread(p, task) { ++ if (!task_is_bore_eligible(task)) continue; ++ cnt++; ++ sum += task->se.burst_penalty; ++ } ++ ++ update_burst_cache(&p->se.group_burst, p, cnt, sum, now); ++} ++ ++static inline u32 inherit_burst_tg(struct task_struct *p, u64 now) { ++ struct task_struct *parent = p->group_leader; ++ struct sched_burst_cache *bc = &parent->se.group_burst; ++ if (burst_cache_expired(bc, now)) ++ update_tg_burst(parent, now); ++ ++ return bc->value; ++} ++ ++void sched_clone_bore(struct task_struct *p, ++ struct task_struct *parent, u64 clone_flags, u64 now) { ++ struct sched_entity *se = &p->se; ++ u32 penalty; ++ ++ if (!task_is_bore_eligible(p)) return; ++ ++ penalty = (clone_flags & CLONE_THREAD)? ++ inherit_burst_tg(parent, now): ++ (likely(sched_burst_fork_atavistic)? 
++ inherit_burst_topological(parent, now, clone_flags): ++ inherit_burst_direct(parent, now, clone_flags)); ++ ++ __restart_burst(se); ++ se->burst_penalty = se->prev_burst_penalty = ++ max(se->prev_burst_penalty, penalty); ++ se->burst_count = 1; ++ se->child_burst.timestamp = 0; ++ se->group_burst.timestamp = 0; ++} ++ ++void reset_task_bore(struct task_struct *p) { ++ p->se.burst_time = 0; ++ p->se.prev_burst_penalty = 0; ++ p->se.curr_burst_penalty = 0; ++ p->se.burst_penalty = 0; ++ p->se.burst_score = 0; ++ p->se.burst_count = 1; ++ memset(&p->se.child_burst, 0, sizeof(struct sched_burst_cache)); ++ memset(&p->se.group_burst, 0, sizeof(struct sched_burst_cache)); ++} ++ ++void __init sched_bore_init(void) { ++ printk(KERN_INFO "%s %s by %s\n", ++ SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); ++ reset_task_bore(&init_task); ++} ++ ++#ifdef CONFIG_SYSCTL ++static struct ctl_table sched_bore_sysctls[] = { ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = sched_bore_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_exclude_kthreads", ++ .data = &sched_burst_exclude_kthreads, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness", ++ .data = &sched_burst_smoothness, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = &maxval_8_bits, ++ }, ++ { ++ .procname = "sched_burst_fork_atavistic", ++ .data = &sched_burst_fork_atavistic, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_THREE, ++ }, ++ { ++ .procname = "sched_burst_parity_threshold", ++ .data = &sched_burst_parity_threshold, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_8_bits, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_6_bits, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_cache_stop_count", ++ .data = &sched_burst_cache_stop_count, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_boost_mask", ++ .data = &sched_deadline_boost_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++}; ++ ++static int __init sched_bore_sysctl_init(void) { ++ register_sysctl_init("kernel", sched_bore_sysctls); ++ return 0; ++} ++late_initcall(sched_bore_sysctl_init); ++#endif // CONFIG_SYSCTL ++#endif // CONFIG_SCHED_BORE +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 81c6df746..45832d151 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -97,6 +97,10 @@ + #include "../../io_uring/io-wq.h" + #include "../smpboot.h" + ++#ifdef CONFIG_SCHED_BORE 
++#include <linux/sched/bore.h> ++#endif // CONFIG_SCHED_BORE ++ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); + +@@ -8523,6 +8527,10 @@ void __init sched_init(void) + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_bore_init(); ++#endif // CONFIG_SCHED_BORE ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 557246880..c1f6219f2 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -167,7 +167,53 @@ static const struct file_operations sched_feat_fops = { + }; + + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ ++static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int value; \ ++\ ++ if (cnt > 15) \ ++ cnt = 15; \ ++\ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++\ ++ if (kstrtouint(buf, 10, &value)) \ ++ return -EINVAL; \ ++\ ++ sysctl_sched_##name = value; \ ++ sched_update_##update_func(); \ ++\ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++\ ++static int sched_##name##_show(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", sysctl_sched_##name); \ ++ return 0; \ ++} \ ++\ ++static int sched_##name##_open(struct inode *inode, struct file *filp) \ ++{ \ ++ return single_open(filp, sched_##name##_show, NULL); \ ++} \ ++\ ++static const struct file_operations sched_##name##_fops = { \ ++ .open = sched_##name##_open, \ ++ .write = sched_##name##_write, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++}; + ++DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) ++ ++#undef DEFINE_SYSCTL_SCHED_FUNC ++#else // !CONFIG_SCHED_BORE + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -213,7 +259,7 @@ static const struct file_operations sched_scaling_fops = { + .llseek = seq_lseek, + .release = single_release, + }; +- ++#endif // CONFIG_SCHED_BORE + #endif /* SMP */ + + #ifdef CONFIG_PREEMPT_DYNAMIC +@@ -507,13 +553,20 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); ++ debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); ++#else // !CONFIG_SCHED_BORE + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++#endif // CONFIG_SCHED_BORE + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); ++#endif // CONFIG_SCHED_BORE + debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + +@@ -762,6 +815,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.burst_score);
++#endif // CONFIG_SCHED_BORE + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +@@ -1248,6 +1304,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + + P(se.load.weight); + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++ P(se.burst_score); ++#endif // CONFIG_SCHED_BORE + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 7a14da539..5f44bd194 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -58,6 +58,10 @@ + #include "stats.h" + #include "autogroup.h" + ++#ifdef CONFIG_SCHED_BORE ++#include <linux/sched/bore.h> ++#endif // CONFIG_SCHED_BORE ++ + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -67,17 +71,30 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant ++ * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++#endif // CONFIG_SCHED_BORE + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice ++ * (default min_base_slice = 2000000 constant, units: nanoseconds) ++ * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds + */ ++#ifdef CONFIG_SCHED_BORE ++static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_base_slice = 700000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; ++#endif // CONFIG_SCHED_BORE + + __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; + +@@ -191,6 +208,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) + * + * This idea comes from the SD scheduler of Con Kolivas: + */ ++#ifdef CONFIG_SCHED_BORE ++static void update_sysctl(void) { ++ sysctl_sched_base_slice = nsecs_per_tick * ++ max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); ++} ++void sched_update_min_base_slice(void) { update_sysctl(); } ++#else // !CONFIG_SCHED_BORE + static unsigned int get_update_sysctl_factor(void) + { + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); +@@ -221,6 +245,7 @@ static void update_sysctl(void) + SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } ++#endif // CONFIG_SCHED_BORE + + void __init sched_init_granularity(void) + { +@@ -700,6 +725,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) + + vlag = avg_vruntime(cfs_rq) - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++#ifdef CONFIG_SCHED_BORE ++ limit >>= !!sched_bore; ++#endif // CONFIG_SCHED_BORE + + se->vlag = clamp(vlag, -limit, limit); + } +@@ -940,6 +968,10 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) + curr = NULL; + + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) ++#ifdef CONFIG_SCHED_BORE ++ if (!(likely(sched_bore) && likely(sched_burst_parity_threshold) && ++
sched_burst_parity_threshold < cfs_rq->nr_queued)) ++#endif // CONFIG_SCHED_BORE + return curr; + + /* Pick the leftmost entity if it's eligible */ +@@ -997,6 +1029,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + * Scheduling class statistics methods: + */ + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); +@@ -1008,6 +1041,7 @@ int sched_update_scaling(void) + + return 0; + } ++#endif // CONFIG_SCHED_BORE + #endif + + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); +@@ -1237,6 +1271,9 @@ static void update_curr(struct cfs_rq *cfs_rq) + if (unlikely(delta_exec <= 0)) + return; + ++#ifdef CONFIG_SCHED_BORE ++ update_curr_bore(delta_exec, curr); ++#endif // CONFIG_SCHED_BORE + curr->vruntime += calc_delta_fair(delta_exec, curr); + resched = update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); +@@ -3794,13 +3831,22 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } + + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); + ++ ++#ifdef CONFIG_SCHED_BORE ++void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, ++ unsigned long weight, bool no_update_curr) ++#else // !CONFIG_SCHED_BORE + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) ++#endif // CONFIG_SCHED_BORE + { + bool curr = cfs_rq->curr == se; + + if (se->on_rq) { + /* commit outstanding execution time */ ++#ifdef CONFIG_SCHED_BORE ++ if (!no_update_curr) ++#endif // CONFIG_SCHED_BORE + update_curr(cfs_rq); + update_entity_lag(cfs_rq, se); + se->deadline -= se->vruntime; +@@ -3856,7 +3902,11 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p, + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct load_weight *load = &se->load; + ++#ifdef CONFIG_SCHED_BORE ++ reweight_entity(cfs_rq, se, lw->weight, false); ++#else // !CONFIG_SCHED_BORE + reweight_entity(cfs_rq, se, lw->weight); ++#endif // CONFIG_SCHED_BORE + load->inv_weight = lw->inv_weight; + } + +@@ -3997,7 +4047,11 @@ static void update_cfs_group(struct sched_entity *se) + shares = calc_group_shares(gcfs_rq); + #endif + if (unlikely(se->load.weight != shares)) ++#ifdef CONFIG_SCHED_BORE ++ reweight_entity(cfs_rq_of(se), se, shares, false); ++#else // !CONFIG_SCHED_BORE + reweight_entity(cfs_rq_of(se), se, shares); ++#endif // CONFIG_SCHED_BORE + } + + #else /* CONFIG_FAIR_GROUP_SCHED */ +@@ -5295,7 +5349,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + se->rel_deadline = 0; + return; + } +- ++#ifdef CONFIG_SCHED_BORE ++ else if (likely(sched_bore)) ++ vslice >>= !!(flags & sched_deadline_boost_mask); ++ else ++#endif // CONFIG_SCHED_BORE + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -7190,6 +7248,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); ++#ifdef CONFIG_SCHED_BORE ++ struct cfs_rq *cfs_rq = &rq->cfs; ++ struct sched_entity *se = &p->se; ++ if (flags & DEQUEUE_SLEEP && entity_is_task(se)) { ++ if (cfs_rq->curr == se) ++ update_curr(cfs_rq); ++ restart_burst(se); ++ } ++#endif // CONFIG_SCHED_BORE + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + +@@ -9019,16 +9086,25 @@ static void yield_task_fair(struct rq *rq) + /* + * Are we the only task 
in the tree? + */ ++#if !defined(CONFIG_SCHED_BORE) + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ restart_burst_rescale_deadline(se); ++ if (unlikely(rq->nr_running == 1)) ++ return; ++ ++ clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() +@@ -13142,6 +13218,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + static void task_fork_fair(struct task_struct *p) + { + set_task_max_allowed_capacity(p); ++#ifdef CONFIG_SCHED_BORE ++ update_burst_score(&p->se); ++#endif // CONFIG_SCHED_BORE + } + + /* +@@ -13259,6 +13338,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + { + WARN_ON_ONCE(p->se.sched_delayed); + ++#ifdef CONFIG_SCHED_BORE ++ reset_task_bore(p); ++#endif // CONFIG_SCHED_BORE + attach_task_cfs_rq(p); + + set_task_max_allowed_capacity(p); +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 83e3aa917..ef5d684df 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2119,7 +2119,11 @@ extern int group_balance_cpu(struct sched_group *sg); + extern void update_sched_domain_debugfs(void); + extern void dirty_sched_domain_sysctl(int cpu); + ++#ifdef CONFIG_SCHED_BORE ++extern void sched_update_min_base_slice(void); ++#else // !CONFIG_SCHED_BORE + extern int sched_update_scaling(void); ++#endif // CONFIG_SCHED_BORE + + static inline const struct cpumask *task_user_cpus(struct task_struct *p) + { +@@ -2825,7 +2829,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + extern __read_mostly unsigned int sysctl_sched_nr_migrate; + extern __read_mostly unsigned int sysctl_sched_migration_cost; + ++#ifdef CONFIG_SCHED_BORE ++extern unsigned int sysctl_sched_min_base_slice; ++extern __read_mostly uint sysctl_sched_base_slice; ++#else // !CONFIG_SCHED_BORE + extern unsigned int sysctl_sched_base_slice; ++#endif // CONFIG_SCHED_BORE + + extern int sysctl_resched_latency_warn_ms; + extern int sysctl_resched_latency_warn_once; +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/0002-bbr3.patch b/sys-kernel/gentoo-sources-6.16/0002-bbr3.patch new file mode 100644 index 0000000..63816a2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0002-bbr3.patch @@ -0,0 +1,3404 @@ +From 66b42eef90f200265e2fc1695808c6626d50c6c5 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 28 Jul 2025 11:50:37 +0700 +Subject: [PATCH 2/7] bbr3 + +Signed-off-by: Eric Naim +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 73 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2232 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 4 +- + 16 files changed, 1941 insertions(+), 555 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 29f59d50dc73..811850c240cc 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -248,7 +248,8 @@ struct tcp_sock { + void (*tcp_clean_acked)(struct sock *sk, u32 
acked_seq); + #endif + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ +@@ -305,7 +306,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? */ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index 1735db332aab..2c4a94af7093 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -132,8 +132,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 5078ad868fee..de404e4370d4 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -379,11 +379,14 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + #define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + ++ + static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) + { + return tp->ecn_flags & TCP_ECN_MODE_ANY; +@@ -841,6 +844,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -946,6 +958,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1044,9 +1061,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1159,6 +1181,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* 
ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1181,7 +1204,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED BIT(0) + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN BIT(1) +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(2) ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1201,10 +1228,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1215,7 +1245,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1239,8 +1271,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. 
(optional) +@@ -1306,6 +1341,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1325,6 +1368,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1337,6 +1381,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2490,7 +2549,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index bdac8c42fa82..362644a272ba 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -185,6 +185,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ + #define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ ++#define TCPI_OPT_ECN_LOW 256 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 12850a277251..3b8b96692fb4 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 461a9ab540af..02ae796fa17e 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3442,6 +3442,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4189,6 +4190,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..066da5e5747c 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,122 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return (tcp_ecn_mode_any(tp)) && (tp->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +383,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +410,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +434,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +457,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +474,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
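A rough reading of the burst budget computed by bbr_tso_segs_generic() above, assuming the kernel's default sk_pacing_shift of 10; the sketch only restates the arithmetic and is not part of the patch:

/*
 * bytes  = pacing_rate >> sk_pacing_shift;  // shift 10: ~1/1024 s, about 1 ms of data
 * bytes += GSO_LEGACY_MAX_SIZE >> r;        // 64 KB, halved per 2^tso_rtt_shift us of min_rtt
 * bytes  = min(bytes, gso_max_size - 1 - MAX_TCP_HEADER);
 * segs   = max(bytes / mss_now, sysctl_tcp_min_tso_segs);
 *
 * e.g. at a ~1 Gbit/s pacing rate (~125 MB/s) the rate term alone is ~122 KB,
 * so the gso_max_size cap (just under 64 KB) usually decides the burst size.
 */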
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +535,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +548,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +580,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +600,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +671,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +682,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +711,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +740,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +796,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +804,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +850,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +859,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +887,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +924,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +947,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +972,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
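The ecn_alpha maintained by bbr_update_ecn_alpha() above is an EWMA of the per-round CE mark ratio; a minimal numeric sketch, assuming BBR_SCALE = 8 (so BBR_UNIT = 256 and bbr_ecn_alpha_gain = 16), not part of the patch:

/*
 * alpha' = (1 - g) * alpha + g * ce_ratio,   g = bbr_ecn_alpha_gain = BBR_UNIT/16
 *
 * e.g. starting from bbr_ecn_alpha_init == BBR_UNIT (1.0), a round with no CE
 * marks (ce_ratio == 0) gives alpha' = (256 - 16) * 256 >> 8 = 240, so alpha
 * moves toward the observed mark ratio by 1/16 (~6.25%) per round; inflight_lo
 * is then cut to (1 - ecn_factor * alpha), i.e. by up to one third at alpha = 1.0.
 */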
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2361,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2398,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 68bc79eb9019..7991a7589109 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -381,7 +381,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -392,7 +392,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1134,7 +1134,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1506,6 +1511,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3856,7 +3872,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3873,6 +3890,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3883,6 +3901,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -4002,6 +4025,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4067,7 +4091,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4091,6 +4115,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4111,7 +4136,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5793,13 +5818,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 43d7852ce07e..df386419b9bf 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -475,6 +475,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 3ac8d2d17e1f..cc75963b5a4c 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1614,7 +1617,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1689,6 +1692,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2045,13 +2072,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2776,6 +2802,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2988,6 +3015,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index bb37e24b97a7..9adfc1131d1f 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -565,7 +565,7 @@ void tcp_retransmit_timer(struct sock *sk) + struct inet_sock *inet = inet_sk(sk); + u32 rtx_delta; + +- rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: ++ rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: + tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); + if (tp->tcp_usec_ts) + rtx_delta /= USEC_PER_MSEC; +@@ -702,6 +702,8 @@ void tcp_write_timer_handler(struct sock *sk) + icsk_timeout(icsk)); + return; + } ++ ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/0003-block.patch b/sys-kernel/gentoo-sources-6.16/0003-block.patch new file mode 100644 index 0000000..61b2d59 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0003-block.patch @@ -0,0 +1,288 @@ +From e6160758a8f7593c49db07cb995ad1c3a7eb60ff Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 28 Jul 2025 11:50:37 +0700 +Subject: [PATCH 3/7] block + +Signed-off-by: Eric Naim +--- + block/bfq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++------ + block/bfq-iosched.h | 12 +++++++++-- + block/mq-deadline.c | 48 +++++++++++++++++++++++++++++++++++------ + 3 files changed, 96 insertions(+), 16 deletions(-) + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index 0cb1e9873aab..4e3e4d3ce88c 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) + return icq; + } + ++static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) ++{ ++ if (!current->io_context) ++ return NULL; ++ if (spin_trylock_irq(&q->queue_lock)) { 
++ struct bfq_io_cq *icq; ++ ++ icq = icq_to_bic(ioc_lookup_icq(q)); ++ spin_unlock_irq(&q->queue_lock); ++ return icq; ++ } ++ ++ return NULL; ++} ++ + /* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. +@@ -2465,10 +2480,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + * returned by bfq_bic_lookup does not go away before + * bfqd->lock is taken. + */ +- struct bfq_io_cq *bic = bfq_bic_lookup(q); ++ struct bfq_io_cq *bic = bfq_bic_try_lookup(q); + bool ret; + +- spin_lock_irq(&bfqd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path is a lot slimmer, so skipping an ++ * occassional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock_irq(&bfqd->lock)) ++ return false; + + if (bic) { + /* +@@ -5317,6 +5343,18 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled = false; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. ++ */ ++ if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || ++ test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) ++ return NULL; ++ + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; +@@ -5328,6 +5366,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } + ++ clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); + spin_unlock_irq(&bfqd->lock); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, +@@ -6250,10 +6289,9 @@ static inline void bfq_update_insert_stats(struct request_queue *q, + + static struct bfq_queue *bfq_init_rq(struct request *rq); + +-static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void bfq_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags) + { +- struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + bool idle_timer_disabled = false; +@@ -6315,7 +6353,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- bfq_insert_request(hctx, rq, flags); ++ bfq_insert_request(hctx->queue, rq, flags); + } + } + +@@ -7254,6 +7292,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + q->elevator = eq; + spin_unlock_irq(&q->queue_lock); + ++ spin_lock_init(&bfqd->lock); ++ + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
+ * Grab a permanent reference to it, so that the normal code flow +@@ -7371,8 +7411,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + +- spin_lock_init(&bfqd->lock); +- + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls +diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h +index 687a3a7ba784..8589b58af79f 100644 +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -504,12 +504,22 @@ struct bfq_io_cq { + unsigned int requests; /* Number of requests this process has in flight */ + }; + ++enum { ++ BFQ_DISPATCHING = 0, ++}; ++ + /** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by @lock. + */ + struct bfq_data { ++ struct { ++ spinlock_t lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; ++ + /* device request queue */ + struct request_queue *queue; + /* dispatch queue */ +@@ -795,8 +805,6 @@ struct bfq_data { + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + +- spinlock_t lock; +- + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to +diff --git a/block/mq-deadline.c b/block/mq-deadline.c +index 2edf1cac06d5..1bae19f17722 100644 +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -79,10 +79,20 @@ struct dd_per_prio { + struct io_stats_per_prio stats; + }; + ++enum { ++ DD_DISPATCHING = 0, ++}; ++ + struct deadline_data { + /* + * run time data + */ ++ struct { ++ spinlock_t lock; ++ spinlock_t zone_lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; + + struct dd_per_prio per_prio[DD_PRIO_COUNT]; + +@@ -100,8 +110,6 @@ struct deadline_data { + int front_merges; + u32 async_depth; + int prio_aging_expire; +- +- spinlock_t lock; + }; + + /* Maps an I/O priority class to a deadline scheduler priority. */ +@@ -466,6 +474,18 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct request *rq; + enum dd_prio prio; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. 
++ */ ++ if (test_bit(DD_DISPATCHING, &dd->run_state) || ++ test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state)) ++ return NULL; ++ + spin_lock(&dd->lock); + rq = dd_dispatch_prio_aged_requests(dd, now); + if (rq) +@@ -482,6 +502,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + } + + unlock: ++ clear_bit_unlock(DD_DISPATCHING, &dd->run_state); + spin_unlock(&dd->lock); + + return rq; +@@ -585,6 +606,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) + + eq->elevator_data = dd; + ++ spin_lock_init(&dd->lock); ++ spin_lock_init(&dd->zone_lock); ++ + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + +@@ -601,7 +625,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) + dd->last_dir = DD_WRITE; + dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; +- spin_lock_init(&dd->lock); + + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); +@@ -657,7 +680,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + struct request *free = NULL; + bool ret; + +- spin_lock(&dd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path is a lot slimmer, so skipping an ++ * occassional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock(&dd->lock)) ++ return false; ++ + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + +@@ -670,10 +705,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + /* + * add rq to rbtree and fifo + */ +-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void dd_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free) + { +- struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); +@@ -731,7 +765,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- dd_insert_request(hctx, rq, flags, &free); ++ dd_insert_request(q, rq, flags, &free); + } + spin_unlock(&dd->lock); + +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/0005-fixes.patch b/sys-kernel/gentoo-sources-6.16/0005-fixes.patch new file mode 100644 index 0000000..2995b13 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0005-fixes.patch @@ -0,0 +1,59 @@ +From cc66b41be3df74ec55f57e9fd047315384fe1052 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 28 Jul 2025 11:50:38 +0700 +Subject: [PATCH 5/7] fixes + +Signed-off-by: Eric Naim +--- + drivers/bluetooth/btusb.c | 2 ++ + drivers/gpu/drm/i915/display/intel_dsb.c | 4 ++++ + scripts/package/PKGBUILD | 5 +++++ + 3 files changed, 11 insertions(+) + +diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c +index f9eeec0aed57..6b12b44f3a0d 100644 +--- a/drivers/bluetooth/btusb.c ++++ b/drivers/bluetooth/btusb.c +@@ -705,6 +705,8 @@ static const struct usb_device_id quirks_table[] = { + BTUSB_WIDEBAND_SPEECH 
}, + { USB_DEVICE(0x0489, 0xe139), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH }, ++ { USB_DEVICE(0x0489, 0xe14e), .driver_info = BTUSB_MEDIATEK | ++ BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x0489, 0xe14f), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x0489, 0xe150), .driver_info = BTUSB_MEDIATEK | +diff --git a/drivers/gpu/drm/i915/display/intel_dsb.c b/drivers/gpu/drm/i915/display/intel_dsb.c +index 481488d1fe67..271229500c62 100644 +--- a/drivers/gpu/drm/i915/display/intel_dsb.c ++++ b/drivers/gpu/drm/i915/display/intel_dsb.c +@@ -808,6 +808,10 @@ struct intel_dsb *intel_dsb_prepare(struct intel_atomic_state *state, + if (!display->params.enable_dsb) + return NULL; + ++ /* TODO: DSB is broken in Xe KMD, so disabling it until fixed */ ++ if (!IS_ENABLED(I915)) ++ return NULL; ++ + dsb = kzalloc(sizeof(*dsb), GFP_KERNEL); + if (!dsb) + goto out; +diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD +index 452374d63c24..08f80d7c5df0 100644 +--- a/scripts/package/PKGBUILD ++++ b/scripts/package/PKGBUILD +@@ -90,6 +90,11 @@ _package-headers() { + "${srctree}/scripts/package/install-extmod-build" "${builddir}" + fi + ++ # required when DEBUG_INFO_BTF_MODULES is enabled ++ if [ -f tools/bpf/resolve_btfids/resolve_btfids ]; then ++ install -Dt "$builddir/tools/bpf/resolve_btfids" tools/bpf/resolve_btfids/resolve_btfids ++ fi ++ + echo "Installing System.map and config..." + mkdir -p "${builddir}" + cp System.map "${builddir}/System.map" +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/0006-s5-power.patch b/sys-kernel/gentoo-sources-6.16/0006-s5-power.patch new file mode 100644 index 0000000..b846780 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0006-s5-power.patch @@ -0,0 +1,329 @@ +From 0fc382f7d5a69dcfabaa0d7a24b1bc1dd7af1d40 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 28 Jul 2025 11:50:38 +0700 +Subject: [PATCH 6/7] s5-power + +Signed-off-by: Eric Naim +--- + drivers/base/power/main.c | 7 ++ + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 + + drivers/pci/pci-driver.c | 94 ++++++++++++++-------- + drivers/scsi/mesh.c | 1 + + drivers/scsi/stex.c | 1 + + drivers/usb/host/sl811-hcd.c | 1 + + include/linux/pm.h | 3 + + include/trace/events/power.h | 3 +- + kernel/reboot.c | 6 ++ + 9 files changed, 86 insertions(+), 34 deletions(-) + +diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c +index 7a50af416cac..b8f0343a8673 100644 +--- a/drivers/base/power/main.c ++++ b/drivers/base/power/main.c +@@ -85,6 +85,8 @@ static const char *pm_verb(int event) + return "restore"; + case PM_EVENT_RECOVER: + return "recover"; ++ case PM_EVENT_POWEROFF: ++ return "poweroff"; + default: + return "(unknown PM event)"; + } +@@ -355,6 +357,7 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze; ++ case PM_EVENT_POWEROFF: + case PM_EVENT_HIBERNATE: + return ops->poweroff; + case PM_EVENT_THAW: +@@ -389,6 +392,7 @@ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze_late; ++ case PM_EVENT_POWEROFF: + case PM_EVENT_HIBERNATE: + return ops->poweroff_late; + case PM_EVENT_THAW: +@@ -423,6 +427,7 @@ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t stat + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze_noirq; ++ case PM_EVENT_POWEROFF: + case PM_EVENT_HIBERNATE: + return ops->poweroff_noirq; + case 
PM_EVENT_THAW: +@@ -1313,6 +1318,8 @@ static pm_message_t resume_event(pm_message_t sleep_state) + return PMSG_RECOVER; + case PM_EVENT_HIBERNATE: + return PMSG_RESTORE; ++ case PM_EVENT_POWEROFF: ++ return PMSG_ON; + } + return PMSG_ON; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index aa32df7e2fb2..839117782949 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -4961,6 +4961,10 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) + if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) + return 0; + ++ /* No need to evict when going to S5 through S4 callbacks */ ++ if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF) ++ return 0; ++ + ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); + if (ret) + DRM_WARN("evicting device resources failed\n"); +diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c +index 67db34fd10ee..b78b98133e7d 100644 +--- a/drivers/pci/pci-driver.c ++++ b/drivers/pci/pci-driver.c +@@ -758,6 +758,56 @@ static void pci_pm_complete(struct device *dev) + + #endif /* !CONFIG_PM_SLEEP */ + ++#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATE_CALLBACKS) ++/** ++ * pci_pm_set_prepare_bus_pm ++ * @pci_dev: pci device ++ * ++ * Prepare the device to go into a low power state by saving state ++ * and configure bus PM policy. ++ * ++ * Return: TRUE for bus PM will be used ++ * FALSE for bus PM will be skipped ++ */ ++static bool pci_pm_set_prepare_bus_pm(struct pci_dev *pci_dev) ++{ ++ if (!pci_dev->state_saved) { ++ pci_save_state(pci_dev); ++ ++ /* ++ * If the device is a bridge with a child in D0 below it, ++ * it needs to stay in D0, so check skip_bus_pm to avoid ++ * putting it into a low-power state in that case. ++ */ ++ if (!pci_dev->skip_bus_pm && pci_power_manageable(pci_dev)) ++ pci_prepare_to_sleep(pci_dev); ++ } ++ ++ pci_dbg(pci_dev, "PCI PM: Sleep power state: %s\n", ++ pci_power_name(pci_dev->current_state)); ++ ++ if (pci_dev->current_state == PCI_D0) { ++ pci_dev->skip_bus_pm = true; ++ /* ++ * Per PCI PM r1.2, table 6-1, a bridge must be in D0 if any ++ * downstream device is in D0, so avoid changing the power state ++ * of the parent bridge by setting the skip_bus_pm flag for it. ++ */ ++ if (pci_dev->bus->self) ++ pci_dev->bus->self->skip_bus_pm = true; ++ } ++ ++ if (pci_dev->skip_bus_pm && pm_suspend_no_platform()) { ++ pci_dbg(pci_dev, "PCI PM: Skipped\n"); ++ return FALSE; ++ } ++ ++ pci_pm_set_unknown_state(pci_dev); ++ ++ return TRUE; ++} ++#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATE_CALLBACKS */ ++ + #ifdef CONFIG_SUSPEND + static void pcie_pme_root_status_cleanup(struct pci_dev *pci_dev) + { +@@ -877,38 +927,8 @@ static int pci_pm_suspend_noirq(struct device *dev) + } + } + +- if (!pci_dev->state_saved) { +- pci_save_state(pci_dev); +- +- /* +- * If the device is a bridge with a child in D0 below it, +- * it needs to stay in D0, so check skip_bus_pm to avoid +- * putting it into a low-power state in that case. 
+- */ +- if (!pci_dev->skip_bus_pm && pci_power_manageable(pci_dev)) +- pci_prepare_to_sleep(pci_dev); +- } +- +- pci_dbg(pci_dev, "PCI PM: Suspend power state: %s\n", +- pci_power_name(pci_dev->current_state)); +- +- if (pci_dev->current_state == PCI_D0) { +- pci_dev->skip_bus_pm = true; +- /* +- * Per PCI PM r1.2, table 6-1, a bridge must be in D0 if any +- * downstream device is in D0, so avoid changing the power state +- * of the parent bridge by setting the skip_bus_pm flag for it. +- */ +- if (pci_dev->bus->self) +- pci_dev->bus->self->skip_bus_pm = true; +- } +- +- if (pci_dev->skip_bus_pm && pm_suspend_no_platform()) { +- pci_dbg(pci_dev, "PCI PM: Skipped\n"); ++ if (!pci_pm_set_prepare_bus_pm(pci_dev)) + goto Fixup; +- } +- +- pci_pm_set_unknown_state(pci_dev); + + /* + * Some BIOSes from ASUS have a bug: If a USB EHCI host controller's +@@ -1135,6 +1155,8 @@ static int pci_pm_poweroff(struct device *dev) + struct pci_dev *pci_dev = to_pci_dev(dev); + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + ++ pci_dev->skip_bus_pm = false; ++ + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_suspend(dev, PMSG_HIBERNATE); + +@@ -1198,8 +1220,8 @@ static int pci_pm_poweroff_noirq(struct device *dev) + return error; + } + +- if (!pci_dev->state_saved && !pci_has_subordinate(pci_dev)) +- pci_prepare_to_sleep(pci_dev); ++ if (!pci_pm_set_prepare_bus_pm(pci_dev)) ++ goto Fixup; + + /* + * The reason for doing this here is the same as for the analogous code +@@ -1208,6 +1230,7 @@ static int pci_pm_poweroff_noirq(struct device *dev) + if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) + pci_write_config_word(pci_dev, PCI_COMMAND, 0); + ++Fixup: + pci_fixup_device(pci_fixup_suspend_late, pci_dev); + + return 0; +@@ -1217,10 +1240,15 @@ static int pci_pm_restore_noirq(struct device *dev) + { + struct pci_dev *pci_dev = to_pci_dev(dev); + const struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; ++ pci_power_t prev_state = pci_dev->current_state; ++ bool skip_bus_pm = pci_dev->skip_bus_pm; + + pci_pm_default_resume_early(pci_dev); + pci_fixup_device(pci_fixup_resume_early, pci_dev); + ++ if (!skip_bus_pm && prev_state == PCI_D3cold) ++ pci_pm_bridge_power_up_actions(pci_dev); ++ + if (pci_has_legacy_pm_support(pci_dev)) + return 0; + +diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c +index 1c15cac41d80..768b85eecc8f 100644 +--- a/drivers/scsi/mesh.c ++++ b/drivers/scsi/mesh.c +@@ -1762,6 +1762,7 @@ static int mesh_suspend(struct macio_dev *mdev, pm_message_t mesg) + case PM_EVENT_SUSPEND: + case PM_EVENT_HIBERNATE: + case PM_EVENT_FREEZE: ++ case PM_EVENT_POWEROFF: + break; + default: + return 0; +diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c +index 63ed7f9aaa93..ee9372e1f7f0 100644 +--- a/drivers/scsi/stex.c ++++ b/drivers/scsi/stex.c +@@ -1965,6 +1965,7 @@ static int stex_choice_sleep_mic(struct st_hba *hba, pm_message_t state) + case PM_EVENT_SUSPEND: + return ST_S3; + case PM_EVENT_HIBERNATE: ++ case PM_EVENT_POWEROFF: + hba->msi_lock = 0; + return ST_S4; + default: +diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c +index ea3cab99c5d4..5d6dba681e50 100644 +--- a/drivers/usb/host/sl811-hcd.c ++++ b/drivers/usb/host/sl811-hcd.c +@@ -1748,6 +1748,7 @@ sl811h_suspend(struct platform_device *dev, pm_message_t state) + break; + case PM_EVENT_SUSPEND: + case PM_EVENT_HIBERNATE: ++ case PM_EVENT_POWEROFF: + case PM_EVENT_PRETHAW: /* explicitly discard hw state */ + port_power(sl811, 0); + break; +diff --git a/include/linux/pm.h b/include/linux/pm.h +index f0bd8fbae4f2..cb66f47631a7 100644 +--- a/include/linux/pm.h ++++ b/include/linux/pm.h +@@ -506,6 +506,7 @@ const struct dev_pm_ops name = { \ + * RECOVER Creation of a hibernation image or restoration of the main + * memory contents from a hibernation image has failed, call + * ->thaw() and ->complete() for all devices. ++ * POWEROFF System will poweroff, call ->poweroff() for all devices. + * + * The following PM_EVENT_ messages are defined for internal use by + * kernel subsystems. They are never issued by the PM core. 
+@@ -536,6 +537,7 @@ const struct dev_pm_ops name = { \ + #define PM_EVENT_USER 0x0100 + #define PM_EVENT_REMOTE 0x0200 + #define PM_EVENT_AUTO 0x0400 ++#define PM_EVENT_POWEROFF 0x0800 + + #define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) + #define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND) +@@ -550,6 +552,7 @@ const struct dev_pm_ops name = { \ + #define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) + #define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) + #define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) ++#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, }) + #define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) + #define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) + #define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) +diff --git a/include/trace/events/power.h b/include/trace/events/power.h +index 6c631eec23e3..8fa70f239737 100644 +--- a/include/trace/events/power.h ++++ b/include/trace/events/power.h +@@ -199,7 +199,8 @@ TRACE_EVENT(pstate_sample, + { PM_EVENT_HIBERNATE, "hibernate" }, \ + { PM_EVENT_THAW, "thaw" }, \ + { PM_EVENT_RESTORE, "restore" }, \ +- { PM_EVENT_RECOVER, "recover" }) ++ { PM_EVENT_RECOVER, "recover" }, \ ++ { PM_EVENT_POWEROFF, "poweroff" }) + + DEFINE_EVENT(cpu, cpu_frequency, + +diff --git a/kernel/reboot.c b/kernel/reboot.c +index ec087827c85c..c8835f8e5f27 100644 +--- a/kernel/reboot.c ++++ b/kernel/reboot.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -305,6 +306,11 @@ static void kernel_shutdown_prepare(enum system_states state) + (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); + system_state = state; + usermodehelper_disable(); ++#ifdef CONFIG_HIBERNATE_CALLBACKS ++ if (!dpm_suspend_start(PMSG_POWEROFF) && !dpm_suspend_end(PMSG_POWEROFF)) ++ return; ++ pr_emerg("Failed to power off devices, using shutdown instead.\n"); ++#endif + device_shutdown(); + } + /** +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-01-28-sched-Cache-aware-load-balancing.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-01-28-sched-Cache-aware-load-balancing.patch new file mode 100644 index 0000000..0d2a7cb --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-01-28-sched-Cache-aware-load-balancing.patch @@ -0,0 +1,810 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.19]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 93DAD125D6 + for ; Sat, 9 Aug 2025 05:07:12 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.19 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716034; cv=none; b=YUmFsTNH1uwQvpOHnTp11Akd3lgJWMfavT04pYrRO6bSLY9uShqjjFR32v7kBjYwOu9HZts4Psvms0Up5yiFkgkTpBdbC8CX/E7Z4c1Klx1PkIf3BPuhpb8ZvRx+SMdhPpzo/SQA6Ht628h/WhbmPYoJzx1WyHar5r5e0vVf1nw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716034; c=relaxed/simple; + bh=B8SncTRCfxokFw3HLq476F91kwXYiv+eNctY+3vgxDg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=LoeDB94MVSwRN9DiDv9UOzgtbfNf7Z8gM97rUyOhBmJ8xbH9EBSw6tUtmKoGA8eGV7pQZyNxS628yTpoLSby3RcBvL9Nu68rCTwLJMSC6e/upA0JZGqZ1E/H3XAf1XnjpHP133kxqHoHsAf9B7kQtb8FMbTEqziO+wZWq/wHV7c= 
+ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=EoDNlyYD; arc=none smtp.client-ip=192.198.163.19 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="EoDNlyYD" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716032; x=1786252032; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=B8SncTRCfxokFw3HLq476F91kwXYiv+eNctY+3vgxDg=; + b=EoDNlyYDA5oPL7kQOXZZlmGPK3p6khDrsVNfQ0JxnGYMNTmoWPG3Trqv + G2IvQfodRnNQCYPKgy20JzG+hnRCEBJuWbDYvbBKAv1X1Y6JYcYj11fWU + ZEKDojM5x6NyBsP6fUSaKmteIt+dcABM+mQ1mSY84wSYIPWQMFhGWqxKi + 6u+a+ocT6BdIAxulicFjYoaLOtii26qUbwZRgLo92ZRGMfUm3fzaPrvmE + Ao5J3uJtLRfBswzdorTuQV5vLeCnDshzqwFinb0JTb2FOypjk2LzTN+gp + 1hpBBXiAasf2lovIh8TYi7x2VmGvVyHeq1JBHV/mgRFlzzU7UVpHT7daf + Q==; +X-CSE-ConnectionGUID: to6lNAM4S3a+AafwUff0zw== +X-CSE-MsgGUID: crHzkh/2S0a7G+FwBgPlVQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56091915" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56091915" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa113.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:07:11 -0700 +X-CSE-ConnectionGUID: BhKTydHRScuaqNFjz5BXWQ== +X-CSE-MsgGUID: m+FrJxJNTOeCg6U8j8DjQQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165475503" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:07:06 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 01/28] sched: Cache aware load-balancing +Date: Sat, 9 Aug 2025 13:00:59 +0800 +Message-Id: <9157186cf9e3fd541f62c637579ff736b3704c51.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Peter Zijlstra + +Hi all, + +One of the many things on the eternal todo list has been finishing the +below hackery. + +It is an attempt at modelling cache affinity -- and while the patch +really only targets LLC, it could very well be extended to also apply to +clusters (L2). Specifically any case of multiple cache domains inside a +node. + +Anyway, I wrote this about a year ago, and I mentioned this at the +recent OSPM conf where Gautham and Prateek expressed interest in playing +with this code. 
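For orientation before reading the diff below: the bookkeeping it introduces is a per-mm, per-CPU runtime counter that is aged geometrically once per ~10 ms epoch and compared against the CPU's equally aged total runtime to estimate the process's cache occupancy on that CPU. The following is a minimal userspace sketch of that arithmetic only; the struct layout and the age()/fraction() helpers are illustrative stand-ins, not the kernel's API, and the constants mirror the patch only loosely.

/*
 * Illustrative sketch (not patch code): per-epoch geometric decay of
 * a per-mm runtime counter and of the CPU-wide total, and the scaled
 * occupancy ratio derived from them.
 */
#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD 1024                /* scale factor, as in the kernel */

struct occ {
        uint64_t runtime;               /* decayed runtime of one entity  */
        unsigned long epoch;            /* last epoch this entry was aged */
};

static void age(struct occ *o, unsigned long now_epoch)
{
        unsigned long n = now_epoch - o->epoch;

        o->runtime = (n >= 64) ? 0 : o->runtime >> n;  /* halve per epoch */
        o->epoch = now_epoch;
}

/* occupancy of this mm on one CPU, scaled to NICE_0_LOAD */
static uint64_t fraction(struct occ *mm_cpu, struct occ *cpu_total,
                         unsigned long now_epoch)
{
        age(mm_cpu, now_epoch);
        age(cpu_total, now_epoch);
        return NICE_0_LOAD * mm_cpu->runtime / (cpu_total->runtime + 1);
}

int main(void)
{
        struct occ mm  = { .runtime = 6000000, .epoch = 0 };
        struct occ cpu = { .runtime = 9000000, .epoch = 0 };

        /* ~2/3 of this CPU's recent runtime belongs to the mm */
        printf("occ = %llu/1024\n",
               (unsigned long long)fraction(&mm, &cpu, 3));
        return 0;
}

Because numerator and denominator decay at the same rate, the ratio tracks recent history rather than lifetime totals, which is what lets the patch pick the CPU with the highest recent occupancy per LLC.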
+ +So here goes, very rough and largely unproven code ahead :-) + +It applies to current tip/master, but I know it will fail the __percpu +validation that sits in -next, although that shouldn't be terribly hard +to fix up. + +As is, it only computes a CPU inside the LLC that has the highest recent +runtime, this CPU is then used in the wake-up path to steer towards this +LLC and in task_hot() to limit migrations away from it. + +More elaborate things could be done, notably there is an XXX in there +somewhere about finding the best LLC inside a NODE (interaction with +NUMA_BALANCING). + +Signed-off-by: Peter Zijlstra (Intel) +--- + include/linux/mm_types.h | 44 ++++++ + include/linux/sched.h | 4 + + init/Kconfig | 4 + + kernel/fork.c | 5 + + kernel/sched/core.c | 13 +- + kernel/sched/fair.c | 330 +++++++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 8 + + 7 files changed, 388 insertions(+), 20 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index d6b91e8a66d6..cf26ad8b41ab 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -928,6 +928,12 @@ struct mm_cid { + }; + #endif + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++ unsigned long occ; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1018,6 +1024,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1432,6 +1449,33 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index aa9c5be7a632..02ff8b8be25b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1403,6 +1403,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +diff --git a/init/Kconfig b/init/Kconfig +index 666783eb50ab..27f4012347f9 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -947,6 +947,10 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware scheduler" ++ default y ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index 1ee8eb11f38b..546c49e46d48 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1073,6 +1073,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1082,6 +1085,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 81c6df746df1..a5fb3057b1c4 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4539,6 +4539,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->migration_pending = NULL; + #endif + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8508,6 +8509,7 @@ static struct kmem_cache *task_group_cache __ro_after_init; + + void __init sched_init(void) + { ++ unsigned long now = jiffies; + unsigned long ptr = 0; + int i; + +@@ -8582,7 +8584,7 @@ void __init sched_init(void) + raw_spin_lock_init(&rq->__lock); + rq->nr_running = 0; + rq->calc_load_active = 0; +- rq->calc_load_update = jiffies + LOAD_FREQ; ++ rq->calc_load_update = now + LOAD_FREQ; + init_cfs_rq(&rq->cfs); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); +@@ -8626,7 +8628,7 @@ void __init sched_init(void) + rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->balance_callback = &balance_push_callback; + rq->active_balance = 0; +- rq->next_balance = jiffies; ++ rq->next_balance = now; + rq->push_cpu = 0; + rq->cpu = i; + rq->online = 0; +@@ -8638,7 +8640,7 @@ void __init sched_init(void) + + rq_attach_root(rq, &def_root_domain); + #ifdef CONFIG_NO_HZ_COMMON +- rq->last_blocked_load_update_tick = jiffies; ++ rq->last_blocked_load_update_tick = now; + atomic_set(&rq->nohz_flags, 0); + + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); +@@ -8663,6 +8665,11 @@ void __init sched_init(void) + 
+ rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = now; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 7a14da5396fb..e3897cd7696d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1166,10 +1166,229 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + return delta_exec; + } + +-static inline void update_curr_task(struct task_struct *p, s64 delta_exec) ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. ++ */ ++#define EPOCH_PERIOD (HZ/100) /* 10 ms */ ++#define EPOCH_OLD 5 /* 50 ms */ ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = epoch = rq->cpu_epoch; ++ pcpu_sched->occ = -1; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. ++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ /* ++ * init_task and kthreads don't be having no mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, invalidate ++ * it's preferred state. 
++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) { ++ mm->mm_sched_cpu = -1; ++ pcpu_sched->occ = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ int cpu, m_a_cpu = -1; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!alloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ scoped_guard (cpus_read_lock) { ++ cpumask_copy(cpus, cpu_online_mask); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, nr = 0, i; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ nr++; ++ trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", ++ per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); ++ } ++ ++ a_occ /= nr; ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", ++ per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ /* XXX threshold ? */ ++ per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ; ++ } ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ /* ++ * If the max average cache occupancy is 'small' we don't care. 
++ */ ++ if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD)) ++ m_a_cpu = -1; ++ ++ mm->mm_sched_cpu = m_a_cpu; ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ ++static inline ++void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec) + { + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); ++ account_mm_sched(rq, p, delta_exec); + cgroup_account_cputime(p, delta_exec); + } + +@@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq) + + delta_exec = update_curr_se(rq, &donor->se); + if (likely(delta_exec > 0)) +- update_curr_task(donor, delta_exec); ++ update_curr_task(rq, donor, delta_exec); + + return delta_exec; + } +@@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq) + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + +- update_curr_task(p, delta_exec); ++ update_curr_task(rq, p, delta_exec); + + /* + * If the fair_server is active, we need to account for the +@@ -7862,7 +8081,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + * per-cpu select_rq_mask usage + */ + lockdep_assert_irqs_disabled(); +- ++again: + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_cpu(task_util, util_min, util_max, target)) + return target; +@@ -7900,7 +8119,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + /* Check a recently used CPU as a potential idle candidate: */ + recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; +- if (recent_used_cpu != prev && ++ if (prev == p->wake_cpu && ++ recent_used_cpu != prev && + recent_used_cpu != target && + cpus_share_cache(recent_used_cpu, target) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +@@ -7953,6 +8173,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if ((unsigned)i < nr_cpumask_bits) + return i; + ++ if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) { ++ /* ++ * Most likely select_cache_cpu() will have re-directed ++ * the wakeup, but getting here means the preferred cache is ++ * too busy, so re-try with the actual previous. ++ * ++ * XXX wake_affine is lost for this pass. 
++ */ ++ prev = target = p->wake_cpu; ++ goto again; ++ } ++ + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster +@@ -8575,6 +8807,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + return target; + } + ++#ifdef CONFIG_SCHED_CACHE ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); ++ ++static int select_cache_cpu(struct task_struct *p, int prev_cpu) ++{ ++ struct mm_struct *mm = p->mm; ++ int cpu; ++ ++ if (!mm || p->nr_cpus_allowed == 1) ++ return prev_cpu; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0) ++ return prev_cpu; ++ ++ ++ if (static_branch_likely(&sched_numa_balancing) && ++ __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { ++ /* ++ * XXX look for max occupancy inside prev_cpu's node ++ */ ++ return prev_cpu; ++ } ++ ++ return cpu; ++} ++#else ++static int select_cache_cpu(struct task_struct *p, int prev_cpu) ++{ ++ return prev_cpu; ++} ++#endif ++ ++ + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, +@@ -8600,6 +8866,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); ++ guard(rcu)(); ++ + if (wake_flags & WF_TTWU) { + record_wakee(p); + +@@ -8607,6 +8875,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + ++ new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu); ++ + if (!is_rd_overutilized(this_rq()->rd)) { + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + if (new_cpu >= 0) +@@ -8617,7 +8887,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); + } + +- rcu_read_lock(); + for_each_domain(cpu, tmp) { + /* + * If both 'cpu' and 'prev_cpu' are part of this domain, +@@ -8650,7 +8919,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + /* Fast path */ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + } +- rcu_read_unlock(); + + return new_cpu; + } +@@ -9300,6 +9568,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (sysctl_sched_migration_cost == 0) + return 0; + ++#ifdef CONFIG_SCHED_CACHE ++ if (p->mm && p->mm->pcpu_sched) { ++ /* ++ * XXX things like Skylake have non-inclusive L3 and might not ++ * like this L3 centric view. What to do about L2 stickyness ? ++ */ ++ return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ > ++ per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ; ++ } ++#endif ++ + delta = rq_clock_task(env->src_rq) - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +@@ -9311,27 +9590,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. 
+ */ +-static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) + { + struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; + +- if (!static_branch_likely(&sched_numa_balancing)) +- return 0; +- +- if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) ++ if (!p->numa_faults) + return 0; + +- src_nid = cpu_to_node(env->src_cpu); +- dst_nid = cpu_to_node(env->dst_cpu); ++ src_nid = cpu_to_node(src_cpu); ++ dst_nid = cpu_to_node(dst_cpu); + + if (src_nid == dst_nid) + return 0; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { +- if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) ++ struct rq *src_rq = cpu_rq(src_cpu); ++ if (src_rq->nr_running > src_rq->nr_preferred_running) + return 1; + else + return 0; +@@ -9342,7 +9619,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + return -1; + + /* Leaving a core idle is often worse than degrading locality. */ +- if (env->idle == CPU_IDLE) ++ if (idle) + return 0; + + dist = node_distance(src_nid, dst_nid); +@@ -9357,7 +9634,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + return src_weight - dst_weight; + } + ++static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) ++{ ++ if (!static_branch_likely(&sched_numa_balancing)) ++ return 0; ++ ++ if (!(env->sd->flags & SD_NUMA)) ++ return 0; ++ ++ return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu, ++ env->idle == CPU_IDLE); ++} ++ + #else ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) ++{ ++ return 0; ++} ++ + static inline long migrate_degrades_locality(struct task_struct *p, + struct lb_env *env) + { +@@ -13117,8 +13411,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} + */ + static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + { +- struct cfs_rq *cfs_rq; + struct sched_entity *se = &curr->se; ++ struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +@@ -13128,6 +13422,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 83e3aa917142..839463027ab0 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1173,6 +1173,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3885,6 +3891,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_SMP +-- +2.25.1 + + diff --git 
a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-02-28-sched-Several-fixes-for-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-02-28-sched-Several-fixes-for-cache-aware-scheduling.patch new file mode 100644 index 0000000..118118a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-02-28-sched-Several-fixes-for-cache-aware-scheduling.patch @@ -0,0 +1,318 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 365A021D3E1 + for ; Sat, 9 Aug 2025 05:07:36 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716057; cv=none; b=gkWBJvwJg5iw9w3Hcj2a+7isBgs+dQ6fQDbY6wOnLy8+dyj/K69iJ9MXZ3iC+AHiVKMdhhAQoR1l9wBbUy+BDlfe78+DRZUcHT2UIqJWtHq9xcndAunehdB/pDXNo95Uc+pmFlmpm5x4k3E0kzRAGeqzAXJ2da+LetkIln18z8w= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716057; c=relaxed/simple; + bh=0AeD1Ue2wq1wzi/RuwjSxpYJG1oGbIqRn0kfUtN8vWQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ezySzW6WeP6U1sHlrXJpD4tSHCfswUkVtMkKRoY8mH+Is59q9EsKFce/r5LHpaugK4Vf9AWVbLfOfFJGgeU54XDE5BxdVqKFZzyDDz9t1/tqydhF9wFSbw/pomx2BYrO+hWtoQKyQHnIN8AUaxQhvGebuiVcyUt9i8bQDwF00zU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=L1tFeu3x; arc=none smtp.client-ip=198.175.65.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="L1tFeu3x" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716056; x=1786252056; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=0AeD1Ue2wq1wzi/RuwjSxpYJG1oGbIqRn0kfUtN8vWQ=; + b=L1tFeu3xpYyhkd6vtGYa4ACOStVJ3enDl+olXnAD4THPx7m0Kc/94fQX + 0NFLzELjFB+k0dXkFEcvhvn2VXQNCEOqpU4KBJdAapmZmEa5Kw2a3uSD5 + 5xGm04sNo/62GAtkSLJDhfLmYvSib+2Y+m+5iYRVQYWZMC9fcPoUUOJIk + 57s73MqGMxeACxAjkhR9PE504WxXvkEUrsCDlWBeU6A00KrTz8w5uJ8fg + 62R1OQ44QJ5eTLS/469R4lFtouEYqw6B8JU9gex0GxRi5dRP00WgMGCoV + CD4HfPgwZPIsG54V4ibpdGi2Z/RSrK2prH4PrMvMSdJ5asPDTFjEDlvfI + Q==; +X-CSE-ConnectionGUID: lQLqoBuSQl6aw8VZynnN4g== +X-CSE-MsgGUID: yMcXhXa1RZSWwEf2YK1T6Q== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57137717" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57137717" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa110.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:07:36 -0700 +X-CSE-ConnectionGUID: ofaF5mj0TIucF4p9yS/Zvw== +X-CSE-MsgGUID: bcXwCarVSY2Y3E4fjnIy5g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="170730164" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa004.fm.intel.com with ESMTP; 08 Aug 2025 22:07:29 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 02/28] sched: Several fixes for cache aware scheduling +Date: Sat, 9 Aug 2025 13:01:15 +0800 +Message-Id: <84ceaca0a1de853284b4fc9888af806b03cde8bb.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +1. Fix compile error on percpu allocation. +2. Enqueue to the target CPU rather than the current CPU. +3. NULL LLC sched domain check(Libo Chen). +4. Introduce sched feature SCHED_CACHE to control cache aware scheduling +5. Fix unsigned occupancy initialization to -1. +6. If there is only 1 thread in the process, no need to enable cache + awareness +7. Add __maybe_unused to __migrate_degrades_locality() to + avoid compile warnings. +8. Do not enable gcov coverage for task_cache_work() and + fraction_mm_sched() to avoid softlockup by gcov. +9. Make CONFIG_SCHED_CACHE depending on CONFIG_SMP to + avoid compile error on non-SMP system like microblaze + architecture. +10. Do not enable account cache aware statistics in + account_mm_sched() for non-normal tasks, as it could + be invoked by RT tasks.(Shrikanth Hegde) +11. Place cpu_epoch related fields in a dedicated cache line + to avoid interfering with clock_idle* fields. + (Shrikanth Hegde) + +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + include/linux/mm_types.h | 4 ++-- + init/Kconfig | 4 ++++ + kernel/sched/fair.c | 41 +++++++++++++++++++++++++++------------- + kernel/sched/features.h | 1 + + kernel/sched/sched.h | 2 +- + 5 files changed, 36 insertions(+), 16 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index cf26ad8b41ab..41a598a44361 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1450,11 +1450,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas + #endif /* CONFIG_SCHED_MM_CID */ + + #ifdef CONFIG_SCHED_CACHE +-extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched); ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); + + static inline int mm_alloc_sched_noprof(struct mm_struct *mm) + { +- struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); + if (!pcpu_sched) + return -ENOMEM; + +diff --git a/init/Kconfig b/init/Kconfig +index 27f4012347f9..4bab39a5254c 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -950,6 +950,10 @@ config NUMA_BALANCING + config SCHED_CACHE + bool "Cache aware scheduler" + default y ++ depends on SMP ++ help ++ If set, the scheduler will try to aggregate tasks in the same process to ++ a single LLC if possible. 
+ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e3897cd7696d..e97ab46509e3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + +-void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; + int i; +@@ -1186,7 +1186,7 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) + + pcpu_sched->runtime = 0; + pcpu_sched->epoch = epoch = rq->cpu_epoch; +- pcpu_sched->occ = -1; ++ pcpu_sched->occ = 0; + } + + raw_spin_lock_init(&mm->mm_sched_lock); +@@ -1227,7 +1227,7 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + } + } + +-static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + { + guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); + +@@ -1248,13 +1248,18 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_sched *pcpu_sched; + unsigned long epoch; + ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (p->sched_class != &fair_sched_class) ++ return; + /* + * init_task and kthreads don't be having no mm + */ + if (!mm || !mm->pcpu_sched) + return; + +- pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched); ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); +@@ -1264,12 +1269,14 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + } + + /* +- * If this task hasn't hit task_cache_work() for a while, invalidate ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate + * it's preferred state. 
+ */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) { ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD || ++ get_nr_threads(p) <= 1) { + mm->mm_sched_cpu = -1; +- pcpu_sched->occ = -1; ++ pcpu_sched->occ = 0; + } + } + +@@ -1278,6 +1285,9 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ + if (!mm || !mm->pcpu_sched) + return; + +@@ -1286,16 +1296,13 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + + guard(raw_spinlock)(&mm->mm_sched_lock); + +- if (mm->mm_sched_epoch == rq->cpu_epoch) +- return; +- + if (work->next == work) { + task_work_add(p, work, TWA_RESUME); + WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); + } + } + +-static void task_cache_work(struct callback_head *work) ++static void __no_profile task_cache_work(struct callback_head *work) + { + struct task_struct *p = current; + struct mm_struct *mm = p->mm; +@@ -1322,6 +1329,9 @@ static void task_cache_work(struct callback_head *work) + unsigned long occ, m_occ = 0, a_occ = 0; + int m_cpu = -1, nr = 0, i; + ++ if (!sd) ++ continue; ++ + for_each_cpu(i, sched_domain_span(sd)) { + occ = fraction_mm_sched(cpu_rq(i), + per_cpu_ptr(mm->pcpu_sched, i)); +@@ -8815,6 +8825,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + ++ if (!sched_feat(SCHED_CACHE)) ++ return prev_cpu; ++ + if (!mm || p->nr_cpus_allowed == 1) + return prev_cpu; + +@@ -9569,7 +9582,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + return 0; + + #ifdef CONFIG_SCHED_CACHE +- if (p->mm && p->mm->pcpu_sched) { ++ if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) { + /* + * XXX things like Skylake have non-inclusive L3 and might not + * like this L3 centric view. What to do about L2 stickyness ? +@@ -9647,7 +9660,9 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + } + + #else +-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) ++static __maybe_unused long __migrate_degrades_locality(struct task_struct *p, ++ int src_cpu, int dst_cpu, ++ bool idle) + { + return 0; + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 3c12d9f93331..d2af7bfd36bf 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_UTIL, true) + ++SCHED_FEAT(SCHED_CACHE, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 839463027ab0..f4ab45ecca86 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1174,7 +1174,7 @@ struct rq { + u64 clock_idle_copy; + #endif + #ifdef CONFIG_SCHED_CACHE +- raw_spinlock_t cpu_epoch_lock; ++ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; + u64 cpu_runtime; + unsigned long cpu_epoch; + unsigned long cpu_epoch_next; +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-03-28-sched-Avoid-task-migration-within-its-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-03-28-sched-Avoid-task-migration-within-its-preferred-LLC.patch new file mode 100644 index 0000000..b8354d1 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-03-28-sched-Avoid-task-migration-within-its-preferred-LLC.patch @@ -0,0 +1,117 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.15]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4AE9D226CFC + for ; Sat, 9 Aug 2025 05:07:58 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.15 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716079; cv=none; b=mzMpUPIBhCIfqgfGYLirSyVew1DiGNGy8kHH9pByDFwjQLg/R08SklG4sqt+h9F0MjNW8uROdXW9EhU0eQGBZx9K4bKZLpb32NTZ568kuQTL5xijzNnbyKAfpI4nekWx9gHcKn2NZrcT76Sz4xJ2qgXu5qqYX/ksmmq4ZH9u+uk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716079; c=relaxed/simple; + bh=57gcT4d4kI048m64jkmqBDrfcowgupEqZQwQ5AEee0w=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=f3RQRxzvnj66spzaJhbe9MgqAYCa98AUSZuwljPsRXInxq/Oxk06wgAT2vRlhS0ehsQOQHM82nnzblQnvrJvdVfOkRoSiG94h3cOAWLd5yBkPjkPpHdCL+rW9rkGbaTLW4RhQdXSHhYol4ZYkaUUjpFkZLW21Gb6+B8vqJUbOW8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=c7C5KF6q; arc=none smtp.client-ip=198.175.65.15 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="c7C5KF6q" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716078; x=1786252078; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=57gcT4d4kI048m64jkmqBDrfcowgupEqZQwQ5AEee0w=; + b=c7C5KF6q/sRDGDVG+lM2wu4H0TzbLyHIoFXYrwK1PhMTWehBhmUrl96S + T3NkwyIpBdauVKsc3hrNWBirHezNT+Ts0OE7838wAraS+qmqOaNyn/zFO + uMSRssAaGwLukBsRJhTXc1N5I0Xy/egiTw1fhkKvS4U8SfrTfRWrmwRa3 + RPU4tMB524z4Z6MxtH6azdWGiN57MoFd2/dFpTSaE7cXAWavDizO4/WkF + yI3XD8KwS9r/rQo9E5DRI45b4Vgd1JhUvkVPHt9fZqza6Nai4EKnx1UNE + 0Vq3A218YIyzDKDxbIwkrtpMqkq0EWb+pBp1au3p6UMfK3D2O6VNQOM+h + A==; +X-CSE-ConnectionGUID: Fqgj+0eySbSqlUZZn4AFOw== +X-CSE-MsgGUID: Y/ZbdGdBSSKyPLJRd4lBYA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60682928" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60682928" +Received: from orviesa005.jf.intel.com ([10.64.159.145]) + by orvoesa107.jf.intel.com 
with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:07:58 -0700 +X-CSE-ConnectionGUID: vDQ06Yc6TseUjHJDDorjIA== +X-CSE-MsgGUID: sVVKpPMjSQmXlszsGV2bRg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="170841605" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa005.jf.intel.com with ESMTP; 08 Aug 2025 22:07:52 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 03/28] sched: Avoid task migration within its preferred LLC +Date: Sat, 9 Aug 2025 13:01:41 +0800 +Message-Id: <37376d2072f6175d2fb909a29b66a3da0bcfcce3.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +It was found that when running schbench, there is a +significant amount of in-LLC task migrations, even if +the wakee is woken up on its preferred LLC. This +leads to core-to-core latency and impairs performance. + +Inhibit task migration if the wakee is already in its +preferred LLC. + +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e97ab46509e3..00bd0d25bc91 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8835,6 +8835,8 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + if (cpu < 0) + return prev_cpu; + ++ if (cpus_share_cache(cpu, prev_cpu)) ++ return prev_cpu; + + if (static_branch_likely(&sched_numa_balancing) && + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-04-28-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-04-28-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch new file mode 100644 index 0000000..392f3c3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-04-28-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch @@ -0,0 +1,131 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.15]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 029841DC994 + for ; Sat, 9 Aug 2025 05:08:14 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.15 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716096; cv=none; b=WeFRqzbJuIG6rs2oBAOGSTvUwR0GN2LxtVKltBBp1IAWJ1/M5927lhVOryPqkDV68MiNoDiPeaUuIGeJXy1yxTNPU+76g8h8o2kq0++bTNlmXdtCkgRKkjAyvo6JUXPfk9qDPu5fNyxlfwrmUYWgRrIiKd3DbVL5bDFDsKlvmIg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716096; c=relaxed/simple; + bh=3Uph+Pq82/wD/SKt6Wb33FmEmMZN7GyPmlnOYYynb0Q=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=o3E1Bq0Hs0dTChFZYVm4GVOaPeXcpLPhZZEQJ43VOlsuNVair0TGmjdyd1fLlVjeODS5guLeDmjO76w+loIt+jPuKVqUMOnTWw1sIHx/QjCLlCjeJzEknh9dFn7KMZ1m1CPRGI1DknlEPNf/b1KDvycj+UhJPpZyr/+EIpZ7wE0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=KCQ9b2gg; arc=none smtp.client-ip=198.175.65.15 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="KCQ9b2gg" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716095; x=1786252095; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=3Uph+Pq82/wD/SKt6Wb33FmEmMZN7GyPmlnOYYynb0Q=; + b=KCQ9b2ggnvTfhFeV/E50fpZUEEpmBsHHnpwH10t35fPh5GZQ4EVxbF9O + iwfbtGzsyddng/NZIteqbsCZ21Nl1B6x7QxI9972j42g46j13xjdwRoZ3 + 8A5ColX2OkCXP0fikLLx5ox8/8xMCGNiAOuHNT4EVTgK2VkSLTjB4x6k2 + OuuokNSBejb3QbstBidVgae5eMr6rPiKsjUpKeIv2M/QgpCk+dAN8C98Z + 9hQOg7BYjmAjMdUmUQXdfIf7u4hNaX6qUCPOtPPWVhaIxMAKXUR8DS4hA + yBD3fm5G5+abwatbqRE6FgrAva6LfJ6mMuAuKCPYQ9SPgzZrUlqukRs7B + A==; +X-CSE-ConnectionGUID: 2W0v7dR/SgiRTYDXa05aZw== +X-CSE-MsgGUID: hTr8aLcoRrKUtcFs+hv2jQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60682947" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60682947" +Received: from orviesa005.jf.intel.com ([10.64.159.145]) + by orvoesa107.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:08:15 -0700 +X-CSE-ConnectionGUID: /DUYqfzETGitdTxBYN+2cg== +X-CSE-MsgGUID: /4qu8b2ATfih1n9Fc4FHNQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="170841644" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa005.jf.intel.com with ESMTP; 08 Aug 2025 22:08:09 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 04/28] sched: Avoid calculating the cpumask if the system is overloaded +Date: Sat, 9 Aug 2025 13:02:04 +0800 +Message-Id: <88d1c3bc1e817cc72346f566153a4618604b9ecd.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: K Prateek Nayak + +If SIS_UTIL terminates the search for idle CPUs, the result of +cpumask_and() becomes irrelevant. Given that select_idle_cpu() +may now be invoked twice per wake-up within select_idle_sibling() +due to cache-aware wake-ups, this overhead can be observed in +benchmarks such as hackbench. 
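The reordering below only pays off because the SIS_UTIL scan budget collapses as the LLC fills up, at which point building the candidate mask is wasted work. A rough, self-contained illustration of that collapse follows; the scan_budget() formula and the 85% cutoff are invented for illustration, the kernel derives its limit differently.

/*
 * Illustrative sketch (not kernel code): the busier the LLC, the fewer
 * CPUs the idle-CPU search is willing to scan, until it aborts outright.
 */
#include <stdio.h>

static int scan_budget(int llc_weight, int util_pct)
{
        if (util_pct >= 85)             /* overloaded: abort the search */
                return 0;
        return llc_weight * (100 - util_pct) / 100;
}

int main(void)
{
        for (int util = 0; util <= 100; util += 25)
                printf("util=%3d%%  budget=%2d of 16 CPUs\n",
                       util, scan_budget(16, util));
        return 0;
}

When the budget is zero the search returns before touching any mask, which is exactly the case the patch moves cpumask_and() past.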
+ +To conserve additional cycles-particularly in scenarios where +the LLC is frequently targeted and the search aborts because +the LLC is busy - calculate the cpumask only when the system is +not overloaded. + +Signed-off-by: K Prateek Nayak +--- + kernel/sched/fair.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 00bd0d25bc91..a7be5c5ecba3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -7940,8 +7940,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; + +- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +- + if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { +@@ -7953,6 +7951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + } + } + ++ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); ++ + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-05-28-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-05-28-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch new file mode 100644 index 0000000..5c19d12 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-05-28-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch @@ -0,0 +1,165 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7E3651DE3BE + for ; Sat, 9 Aug 2025 05:08:29 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716112; cv=none; b=PThukVAMb7sFuKxVcgD3wTM4phVJ8rI9r7+ebpAt9DmMUXFw/IDfhBgJzVDETnMraJUzTpjxdg0CH+MqsJFdZpG0+0YGabzIwZ03oS5dGUCpiuwcqMBi79EXkvVi691ZUTMUjUdIFwJpzWnQscUTpYi1EC8GJgP4BnZ2xG+wmR8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716112; c=relaxed/simple; + bh=LDYuA5WJyjvULccdhS0DHdWu2p7tGoJ/eeAWhiaAlrg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=bj8SEXlQMrLRUboVVGnQOu9JBe5g36Sf1XMNZ4Fxig9ZZalYOFEmmdY0+oe5Ky5U018MFmBrwaWfJieFQcobideyIOiTbWBmhitES6gj23mv2S9buVmE5umygmQde5ClZGVf904vuEv77RefIPDld9g1BbkRQFkgRCwN1dgJYBw= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Yz3P0As2; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Yz3P0As2" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716109; x=1786252109; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=LDYuA5WJyjvULccdhS0DHdWu2p7tGoJ/eeAWhiaAlrg=; + 
b=Yz3P0As2P7sAAeLgT7nQe+nKKbEUL/+9rKdSGXrRCerJoyPyZVJiAIca + rTCc2mIWo0r7xKXHBwJCzl+lhPLNxfq7ThXFDqw/086ptM6gsSmcdBYFy + 0XG6Bpx4G8F6WIlomNDg2uKFh3+Gf6iv4ohkTrkI1AR9d2HRIWlbSXqPg + gjIc3qKxMHdoEmw84F/oRaqsVQVLHKyRLGcXSUZ869pJdp3tCl5EFYIHx + RipzC73I4/a7J8WSfr1XW9s1QojcqMVZE0c1LndRlkFmT99Paa711cvo0 + L3/AK8mOeiqf6B9FcyzKS+XWq8jtfhVABayP9NRmDhXta4Wem5Y5cg2O/ + Q==; +X-CSE-ConnectionGUID: 1ql6YZT/RpyNFYw0G8nvRg== +X-CSE-MsgGUID: Py+W8Q8/QNGIYDmbtHMFoQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57019866" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57019866" +Received: from fmviesa005.fm.intel.com ([10.60.135.145]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:08:29 -0700 +X-CSE-ConnectionGUID: IcQ9RT+CQgWiji82ikx4LA== +X-CSE-MsgGUID: ojuGhmoFTkKNKnNcd1Z5hA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169704810" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa005.fm.intel.com with ESMTP; 08 Aug 2025 22:08:23 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 05/28] sched: Add hysteresis to switch a task's preferred LLC +Date: Sat, 9 Aug 2025 13:02:18 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Switching a process's preferred LLC generates lots of task +migrations across LLCs. To avoid frequent switches +of home LLC, implement the following policy: + +1. Require a 2x occ change threshold to switch preferred LLC +2. 
Don't discard preferred LLC for a task + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a7be5c5ecba3..9e3c6f0eb934 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,6 +1175,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1307,6 +1315,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; ++ unsigned long last_m_a_occ = 0; + int cpu, m_a_cpu = -1; + cpumask_var_t cpus; + +@@ -1345,11 +1354,13 @@ static void __no_profile task_cache_work(struct callback_head *work) + per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); + } + +- a_occ /= nr; ++ // a_occ /= nr; + if (a_occ > m_a_occ) { + m_a_occ = a_occ; + m_a_cpu = m_cpu; + } ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ last_m_a_occ = a_occ; + + trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", + per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); +@@ -1363,13 +1374,10 @@ static void __no_profile task_cache_work(struct callback_head *work) + } + } + +- /* +- * If the max average cache occupancy is 'small' we don't care. +- */ +- if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD)) +- m_a_cpu = -1; +- +- mm->mm_sched_cpu = m_a_cpu; ++ if (m_a_occ > (2 * last_m_a_occ)) { ++ /* avoid the bouncing of mm_sched_cpu */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } + + free_cpumask_var(cpus); + } +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-06-28-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-06-28-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch new file mode 100644 index 0000000..4054b16 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-06-28-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch @@ -0,0 +1,200 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id F24CE21D3E1 + for ; Sat, 9 Aug 2025 05:09:05 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716147; cv=none; b=LI4h+6OJWZaBS0TR7Q3NBzoXkmy9JmDvvsP0v6h3Wr+GpKZR5W1whx1t+MpULY/tpCopCTQwtk+d4eHYbBbXGG8tw911CUm66GpfCtas8ctsmrrtOtpyFMSEQ1wSEmj4dWMkrZhPa8ugb3u+CyP9djkHSe8sZ2gx2yAfRIPx9CU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716147; c=relaxed/simple; + bh=euRWyMdufoCin//rgMJ4T3fWMZpHw1jQB7L14khTmb4=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=FhwU+egXgnivL5wAIp8WWClNxp4yT7p2+qq0OVtPsZHuRHUqSefqGa8Mw69mD2l/SvPpXJI146UF1gIL0OohXuZVBnBx9uuzjtJB0fwJPdjrZGa51C5jL55hj27fGIPyUJpmNwnKZK7cVsQFji5MJ9gcLGaigOLthcWnAQfM4Ww= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass 
smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=XB8zwFIH; arc=none smtp.client-ip=198.175.65.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="XB8zwFIH" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716146; x=1786252146; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=euRWyMdufoCin//rgMJ4T3fWMZpHw1jQB7L14khTmb4=; + b=XB8zwFIHW5tROwSjY+TkhHve8zxklhKn+ovNLIF8SV5U0iIW3uisGZ+h + hY/ESXsB+pgUaisyny/yy5pPLNKo8MGtQy5YXsFvNJCRR/qBzsPTciwpd + DWKZJ1KhyMu77ycu6eYUAzqZuN/gRnxFRxIu0gB+CDzvdaP0yW6Alm3q2 + 6uSDm53TwoaOggR9d3iPh9Z+dpDEn90e2yYpi8OZHptMKcxOxMQuhOE8g + XyVt9GJRY6uXVn+Xhk0ObrEJv4d8fU3+v015Xl9/d69ko5uk8uOcrTvoC + 5KhPHr2patZHHRizOM78ma1nH5m9MLqfkUzr1tVCq2xOpXisttW9XZBQc + Q==; +X-CSE-ConnectionGUID: fZf0KiitSgGguAZwYKknTQ== +X-CSE-MsgGUID: qtUqGlf/QgGtoI55sDwkdQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57137770" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57137770" +Received: from fmviesa006.fm.intel.com ([10.60.135.146]) + by orvoesa110.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:09:05 -0700 +X-CSE-ConnectionGUID: 7a50TPFuTq2l0110KDRwEA== +X-CSE-MsgGUID: IsqZoesVQLiHgA5hLOCuhg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165374544" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa006.fm.intel.com with ESMTP; 08 Aug 2025 22:09:00 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 06/28] sched: Save the per LLC utilization for better cache aware scheduling +Date: Sat, 9 Aug 2025 13:02:54 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +When a system gets busy and a process's preferred LLC +is saturated by too many threads within this process, there are significant +in-LLC task migrations within its preferred LLC. This leads to migration +latency and degrades performance. Ideally, task aggregation should be +inhibited if the task's preferred LLC is overloaded. This implies that a +metric is needed to indicate whether the LLC is busy. + +Store the per-LLC utilization calculated via periodic load +balancing. These statistics will be used in subsequent patches to +determine whether tasks should be aggregated to their preferred LLC. 
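
A minimal stand-alone sketch of the store-once/read-cheaply idea described above, assuming a hypothetical llc_stats table, capacity figures and 50% cut-off invented purely for illustration (the patch itself keeps the value in sched_domain_shared->util_avg, as the hunks below show):

/* Illustrative user-space sketch: the load balancer writes the per-LLC
 * utilization once per pass, and the wake-up path reads the cached value
 * instead of recomputing it. All names and numbers here are invented. */
#include <stdio.h>

#define NR_LLCS 4
#define SCHED_CAPACITY_SCALE 1024

struct llc_stats {
	unsigned long util_avg;   /* written by the periodic load balancer */
	unsigned long capacity;   /* nr_cpus_in_llc * SCHED_CAPACITY_SCALE */
};

static struct llc_stats stats[NR_LLCS];

/* Load-balance side: record the group utilization for this LLC. */
static void update_llc_util(int llc, unsigned long group_util)
{
	stats[llc].util_avg = group_util;
}

/* Wake-up side: consume the cached value; "busy" here means more than
 * half of the LLC capacity is in use. */
static int llc_is_busy(int llc)
{
	return stats[llc].util_avg * 2 > stats[llc].capacity;
}

int main(void)
{
	stats[0].capacity = 8 * SCHED_CAPACITY_SCALE;   /* 8-CPU LLC */
	update_llc_util(0, 5 * SCHED_CAPACITY_SCALE);   /* 5 CPUs' worth of load */
	printf("LLC0 busy: %d\n", llc_is_busy(0));      /* prints 1 */
	return 0;
}
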
+ +Signed-off-by: Chen Yu +--- + include/linux/sched/topology.h | 3 ++ + kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++ + 2 files changed, 56 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 198bb5cc1774..692f8a703b93 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -78,6 +78,9 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 9e3c6f0eb934..4f79b7652642 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8828,6 +8828,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + #ifdef CONFIG_SCHED_CACHE + static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); + ++/* expected to be protected by rcu_read_lock() */ ++static bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = per_cpu(sd_llc_size, cpu) * SCHED_CAPACITY_SCALE; ++ ++ return true; ++} ++ + static int select_cache_cpu(struct task_struct *p, int prev_cpu) + { + struct mm_struct *mm = p->mm; +@@ -10670,6 +10686,42 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Save this sched group's statistic for later use: ++ * The task wakeup and load balance can make better ++ * decision based on these statistics. ++ */ ++static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ /* Find the sched domain that spans this group. */ ++ struct sched_domain *sd = env->sd->child; ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* only care the sched domain that spans 1 LLC */ ++ if (!sd || !(sd->flags & SD_SHARE_LLC) || ++ !sd->parent || (sd->parent->flags & SD_SHARE_LLC)) ++ return; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (likely(READ_ONCE(sd_share->util_avg) != sgs->group_util)) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++} ++#else ++static inline void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10759,6 +10811,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ update_sg_if_llc(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-07-28-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-07-28-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch new file mode 100644 index 0000000..e02811a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-07-28-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch @@ -0,0 +1,293 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id A097E125D6 + for ; Sat, 9 Aug 2025 05:09:19 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716161; cv=none; b=Rm7TcpHSNUIIoFYEPwrZQSk2+mFBFaIi8Biv/YBu8NhjoOpqLYiSDc8n/N7a+PcWKj1D5lN8yxsfXFZpqwNZo9V27otdxT/bMNi/j+pCcQsy85gMx6mqoYUfLUdWB0a4zERaoznppBe6okhDs8L/kX4GGnSX8g32CRW4pXc/jS4= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716161; c=relaxed/simple; + bh=V4wQFWsJMut3Mv4WM/pNBpXZswfWqWCXgqNTuOQgQwc=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=tR/Eg9pwE3kfKHQ1XrSvfns7tRo1/54dBTa3Mcw/Vf9PUP5J9yjcpYZjzIWQ96CkLtFqrg19Zl0Cj25CGXm9QEsmQiICsQrbY8sep3kg5LmP1PeugdTFvBMNkphNKcutc0NLmiPwdKekqx5gwUaOM5x4KAy1UaLJ56yS2wqthlE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=cZoFv+co; arc=none smtp.client-ip=198.175.65.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="cZoFv+co" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716160; x=1786252160; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=V4wQFWsJMut3Mv4WM/pNBpXZswfWqWCXgqNTuOQgQwc=; + b=cZoFv+cooYycPX6fhTDFz3M70RpTXLjWW54A4RKCHWeGADL7LpNgffxW + gBfRlECeZkPmt7ZbNEgFqOrc7h37RbVfI0hpXkfJXAbSVhFqX1dyT3XdI + KBuYaf5c3EOsWZOREhMvQUtHsoLmWta+xL56O2v1gsR3leEwTYp2Wagee + zuCK7oxtqjbXilAu6g6eLj5fAL1la9xryvQW3Hx9lwncNu2ChThNoNOIL + 8rqMMvopoFaWOd9vVKtpzIX0eyrh2S0jAjm/gycY3Z9ipQFIzNCz17K0J + RJ7dwofq53rsYOMlEnDTxOs2VjSm+OkCdnAYfR59wl5PkmnVd8HuSY4K7 + g==; +X-CSE-ConnectionGUID: U6TaVEBsQ1urnPH9VT6FpQ== +X-CSE-MsgGUID: xLUFPU/zRaayB1BNHTCWGw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57137798" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57137798" +Received: from fmviesa006.fm.intel.com ([10.60.135.146]) + by orvoesa110.jf.intel.com with 
ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:09:19 -0700 +X-CSE-ConnectionGUID: MAk8xVHNTeiRA2GQcBH6/g== +X-CSE-MsgGUID: rE8RpT+sQLqRJLnxvLOvng== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165374559" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa006.fm.intel.com with ESMTP; 08 Aug 2025 22:09:13 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 07/28] sched: Add helper function to decide whether to allow cache aware scheduling +Date: Sat, 9 Aug 2025 13:03:10 +0800 +Message-Id: <701c7be7f0e69582d9ad0c25025ec2e133e73fbb.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Cache-aware scheduling is designed to aggregate threads into their +preferred LLC, either via the task wake up path or the load balancing +path. One side effect is that when the preferred LLC is saturated, +more threads will continue to be stacked on it, degrading the workload's +latency. A strategy is needed to prevent this aggregation from going too +far such that the preferred LLC is too overloaded. + +Introduce helper function _get_migrate_hint() to implement the +LLC migration policy: + +1) A task is aggregated to its preferred LLC if both source/dest LLC + are not too busy (<50% utilization, tunable), or the preferred + LLC will not be too out of balanced from the non preferred LLC + (>20% utilization, tunable, close to imbalance_pct of the LLC + domain). +2) Allow a task to be moved from the preferred LLC to the + non-preferred one if the non-preferred LLC will not be too out + of balanced from the preferred prompting an aggregation task + migration later. We are still experimenting with the aggregation + and migration policy. Some other possibilities are policy based + on LLC's load or average number of tasks running. Those could + be tried out by tweaking _get_migrate_hint(). + +The function _get_migrate_hint() returns migration suggestions for +the upper-level functions. + +Aggregation will tend to make utilization on the preferred LLC to +be more than the non-preferred one. Parameter "sysctl_llc_aggr_imb" +is the imbalance allowed. If it is set to 0, as long as the preferred +LLC is not utilized more than the source LLC, we could still aggregate +towards the preferred LLC and a preference could still be there. 
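
A worked, stand-alone example of the two thresholds described above (sysctl_llc_aggr_cap = 50, sysctl_llc_aggr_imb = 20). The capacity and utilization numbers are invented; the kernel takes them from the per-LLC statistics rather than constants:

/* Sketch of the two checks used by _get_migrate_hint(): the 50%
 * "aggregation cap" and the 20% allowed imbalance. */
#include <stdio.h>

static unsigned int llc_aggr_cap = 50;   /* % of LLC capacity */
static unsigned int llc_aggr_imb = 20;   /* % allowed imbalance */

/* Mirrors fits_llc_capacity(): utilization stays below cap% of capacity. */
static int fits_llc_capacity(unsigned long util, unsigned long cap)
{
	return util * 100 < cap * llc_aggr_cap;
}

/* Mirrors util_greater(): util1 exceeds util2 by more than imb%. */
static int util_greater(unsigned long util1, unsigned long util2)
{
	return util1 * 100 > util2 * (100 + llc_aggr_imb);
}

int main(void)
{
	unsigned long cap = 8192;             /* 8 CPUs * 1024 */
	unsigned long dst = 3500, src = 2500; /* invented utilizations */

	/* Migration towards the preferred LLC is refused only when the
	 * destination no longer fits under 50% of its capacity AND it is
	 * already more than 20% busier than the source. */
	printf("dst fits: %d, dst much busier than src: %d\n",
	       fits_llc_capacity(dst, cap), util_greater(dst, src));
	return 0;
}
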
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 110 ++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 5 ++ + 3 files changed, 118 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 557246880a7e..682fd91a42a0 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -532,6 +532,10 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap); ++ debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb); ++#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4f79b7652642..3128dbcf0a36 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8826,7 +8826,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + } + + #ifdef CONFIG_SCHED_CACHE +-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); ++static long __migrate_degrades_locality(struct task_struct *p, ++ int src_cpu, int dst_cpu, ++ bool idle); ++__read_mostly unsigned int sysctl_llc_aggr_cap = 50; ++__read_mostly unsigned int sysctl_llc_aggr_imb = 20; ++ ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter sysctl_llc_aggr_cap determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * sysctl_llc_aggr_cap) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + sysctl_llc_aggr_imb)) ++ ++enum llc_mig_hint { ++ mig_allow = 0, ++ mig_ignore, ++ mig_forbid ++}; ++ + + /* expected to be protected by rcu_read_lock() */ + static bool get_llc_stats(int cpu, unsigned long *util, +@@ -8844,6 +8876,82 @@ static bool get_llc_stats(int cpu, unsigned long *util, + return true; + } + ++static enum llc_mig_hint _get_migrate_hint(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_allow; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_ignore; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_ignore; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * sysctl_llc_aggr_imb is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. 
++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_allow; ++} ++ ++/* ++ * Give suggestion when task p is migrated from src_cpu to dst_cpu. ++ */ ++static __maybe_unused enum llc_mig_hint get_migrate_hint(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ int cpu; ++ ++ if (cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_allow; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_allow; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0) ++ return mig_allow; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ return _get_migrate_hint(src_cpu, dst_cpu, ++ task_util(p), true); ++ else if (cpus_share_cache(src_cpu, cpu)) ++ return _get_migrate_hint(src_cpu, dst_cpu, ++ task_util(p), false); ++ else ++ return mig_allow; ++} ++ + static int select_cache_cpu(struct task_struct *p, int prev_cpu) + { + struct mm_struct *mm = p->mm; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index f4ab45ecca86..83552aab74fb 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2844,6 +2844,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int sysctl_llc_aggr_cap; ++extern unsigned int sysctl_llc_aggr_imb; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-08-28-sched-Set-up-LLC-indexing.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-08-28-sched-Set-up-LLC-indexing.patch new file mode 100644 index 0000000..0e4cfdb --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-08-28-sched-Set-up-LLC-indexing.patch @@ -0,0 +1,232 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7A24F2749D5 + for ; Sat, 9 Aug 2025 05:09:34 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716177; cv=none; b=NNJRjopqrRbhFk19x1BndJWZ90HqNxRnx0H7JE+07eSr/bdUJMU/c0NJ3LB2cV94Rsi1R1AdGM1d2xlML1jh2RnTHB1Dzdvr0qBwgdFuA8zjncQEUZO6kHF1Y2GSQmE70Toj/gzstTrxtr3JAqld0iuOXw9GF3i3gZmGNxoXo9Q= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716177; c=relaxed/simple; + bh=Q9WyVWFAxaya8q+BQJRZR5gSlB4xoSQBTrL3eFp0H3k=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=OJCxUgjq3LhGvzfQnlmgDk0eZNcoMO3+ooDXAn+dS2XPaoAD00XBVRhoIDDmzlGUakQWDO9E3wgaByP2px0tcqYpCXMaz1PTKVbRf0IMfCL7wmj/Pl6WrMp5Uk7woFJsPxrVPRghCGLe/mewLfBt3ueL13B0csiHsH9qkS/YD8s= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=OW5YzRQl; arc=none smtp.client-ip=198.175.65.18 
+Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="OW5YzRQl" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716176; x=1786252176; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Q9WyVWFAxaya8q+BQJRZR5gSlB4xoSQBTrL3eFp0H3k=; + b=OW5YzRQlQh/nBHXwqrjN0l8Y7LfITnFjeexFpLxenGyiJvbsUX2LdH+f + NmdygIjqrdclruZgedLbBxuvOrc8rS64ODYq+fjwfwXQVB0yteRfnTP/u + KUJ6NLJp7E5qLizUurKDYQ/CQ8WhKvO+A1CCWJcny7Ywyk5pWHn0+ihL2 + fyfKV1cKZRnLjLxVHkt7AZCj9E7OPIlGwDuDChPwUD61pbaKxh7wR9gpr + q7g35VzJcDGPAJtv/VzN73wW2yx/6zcGH0VLxrR+XHBCqIvDlMHA2v4f0 + DYgKaiB40pfFiu4dwUi7Ps3HxC6vxt6/7c8fQVryZXz/WrZQ5I8EfdNEJ + A==; +X-CSE-ConnectionGUID: Y02Bl/8pShaUv1kRhHj39A== +X-CSE-MsgGUID: 3/idV79SS1m5g1kaTEBzDw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57137811" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57137811" +Received: from fmviesa006.fm.intel.com ([10.60.135.146]) + by orvoesa110.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:09:34 -0700 +X-CSE-ConnectionGUID: XuuWOYxBTOOma0SA+bSkWg== +X-CSE-MsgGUID: Ip2uRtmWRum9W2ItftRIPg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165374575" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa006.fm.intel.com with ESMTP; 08 Aug 2025 22:09:28 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 08/28] sched: Set up LLC indexing +Date: Sat, 9 Aug 2025 13:03:24 +0800 +Message-Id: <959d897daadc28b8115c97df04eec2af0fd79c5d.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Prepare for indexing arrays that track in each run queue: the number +of tasks preferring current LLC and each of the other LLC. + +The reason to introduce LLC index is because the per LLC-scope data +is needed to do cache aware load balancing. However, the native lld_id +is usually the first CPU of that LLC domain, which is not continuous, +which might waste the space if the per LLC-scope data is stored +in an array (in current implementation). + +In the future, this LLC index could be removed after +the native llc_id is used as the key to search into xarray based +array. 
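
A minimal sketch of the sparse-id-to-dense-index mapping described above, assuming a toy topology (4 CPUs per LLC, so llc_id takes the values 0, 4, 8, 12) invented for illustration; it follows the same assignment logic as update_llc_idx() in the hunks below:

#include <stdio.h>

#define NR_CPUS 16

static int sd_llc_id[NR_CPUS];   /* first CPU of the CPU's LLC (sparse) */
static int sd_llc_idx[NR_CPUS];  /* dense 0..N-1 index assigned below */
static int max_llcs;

static void update_llc_idx(int cpu)
{
	int llc_id = sd_llc_id[cpu];
	int idx = sd_llc_idx[llc_id];

	/* First CPU seen for this LLC allocates the next dense index. */
	if (idx < 0)
		sd_llc_idx[llc_id] = idx = max_llcs++;
	sd_llc_idx[cpu] = idx;
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		sd_llc_id[cpu] = cpu & ~3;   /* 4 CPUs per LLC */
		sd_llc_idx[cpu] = -1;
	}
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		update_llc_idx(cpu);
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %2d: llc_id %2d -> llc_idx %d\n",
		       cpu, sd_llc_id[cpu], sd_llc_idx[cpu]);
	return 0;
}
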
+ +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 3 +++ + kernel/sched/fair.c | 12 ++++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 29 +++++++++++++++++++++++++++++ + 4 files changed, 46 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 02ff8b8be25b..81d92e8097f5 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -809,6 +809,9 @@ struct kmap_ctrl { + #endif + }; + ++/* XXX need fix to not use magic number */ ++#define MAX_LLC 64 ++ + struct task_struct { + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3128dbcf0a36..f5075d287c51 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1183,6 +1183,18 @@ static int llc_id(int cpu) + return per_cpu(sd_llc_id, cpu); + } + ++/* ++ * continuous index. ++ * TBD: replace by xarray with key llc_id() ++ */ ++static inline int llc_idx(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_idx, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 83552aab74fb..c37c74dfce25 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2056,6 +2056,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_llc_idx); + DECLARE_PER_CPU(int, sd_share_id); + DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -2064,6 +2065,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index b958fe48e020..91a2b7f65fee 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -657,6 +657,7 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_llc_idx); + DEFINE_PER_CPU(int, sd_share_id); + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -666,6 +667,25 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++int max_llcs = -1; ++ ++static void update_llc_idx(int cpu) ++{ ++#ifdef CONFIG_SCHED_CACHE ++ int idx = -1, llc_id = -1; ++ ++ llc_id = per_cpu(sd_llc_id, cpu); ++ idx = per_cpu(sd_llc_idx, llc_id); ++ ++ if (idx < 0) { ++ idx = max_llcs++; ++ BUG_ON(idx > MAX_LLC); ++ per_cpu(sd_llc_idx, llc_id) = idx; ++ } ++ per_cpu(sd_llc_idx, cpu) = idx; ++#endif ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -684,6 +704,7 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_size, cpu) = size; + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); ++ update_llc_idx(cpu); + + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) +@@ -2456,6 +2477,14 @@ build_sched_domains(const struct cpumask 
*cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++#ifdef CONFIG_SCHED_CACHE ++ if (max_llcs < 0) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_idx, i) = -1; ++ max_llcs = 0; ++ } ++#endif ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-09-28-sched-Introduce-task-preferred-LLC-field.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-09-28-sched-Introduce-task-preferred-LLC-field.patch new file mode 100644 index 0000000..c115c45 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-09-28-sched-Introduce-task-preferred-LLC-field.patch @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.13]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 72B6D15665C + for ; Sat, 9 Aug 2025 05:10:00 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.13 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716202; cv=none; b=b2L1GhXR1bKfaGNtSt2TSFJgivwyGWl/zG0ke7CHSqEAcHuFzHVF+Vph/AG5ZfJphDMNbIxy4SUFIumjAOZK2TB0Z4jGWHlzOvGKs9kRxGy3WkdJTVEO3FLULtEJnBKj5AORTkYZlfIB4LE4Izx1MQm/ZkRn8Sz9XQb/WKOg49I= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716202; c=relaxed/simple; + bh=yDyElrdoJP1owudVvXOmuFFmGrAdqlZ/3LSJv9270PI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=j3HhsKTtfxpLA3rJndUgWlhiEvEp59nMppnJBuBt3n57eWQsLpmRNumx60yqEWTiU+2a0Akk/6QCT1AAxHoly+zNGqSyQJ4Og7AOKWr039BhdBdB9rB0XwOlLRo6MBk6oRA3xZMgTm/i1/Glk/eLvpJOrXxvUAqHVunt8/0Gy+Q= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=jA/q+1HC; arc=none smtp.client-ip=198.175.65.13 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="jA/q+1HC" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716201; x=1786252201; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=yDyElrdoJP1owudVvXOmuFFmGrAdqlZ/3LSJv9270PI=; + b=jA/q+1HCZoWL9/if9zUiP2RagKNIlzVG2/9XmXTq+Ai1+idEA3GniBpc + 0U+c7IPLnvxvaiyKdJCCUu5unbGE6uc4OZCK3b3LFPBZpAbM8stdCZMnd + Wj/PbIIK7iHErHRNJoSuTG2Hz3Kd1S2DZTWM7lcoF8Rml/dJplEh3gVCt + vpngwG0Zm9NV0fxTmPcRsqshl1tnvy4tttj+WdiTSfQEPhYj49I+gD0bh + 3UQewsPVTarSIp+hr1KFG3cogmN+Rd4lGhrxiPXp8zr5spR5put/n5Xyn + 1MEtslmziwaMvG+ZfcPPao0HwFNVTDTVL4ngyCM61uwjqrVJWtD3T8BQ/ + Q==; +X-CSE-ConnectionGUID: W/xDy5btSjOwlHAt10VALQ== +X-CSE-MsgGUID: hD1wAyqHT7WAJ49GLugw0A== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="68139885" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="68139885" +Received: from fmviesa008.fm.intel.com ([10.60.135.148]) + by orvoesa105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:10:00 -0700 +X-CSE-ConnectionGUID: c2jIhCe2THSYJtrA2gygJg== +X-CSE-MsgGUID: PCF36aw+QPy8wB+b//2Ymw== +X-ExtLoop1: 1 
+X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165891237" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa008.fm.intel.com with ESMTP; 08 Aug 2025 22:09:54 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 09/28] sched: Introduce task preferred LLC field +Date: Sat, 9 Aug 2025 13:03:50 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +With cache aware scheduling enabled, each process is assigned +a preferred LLC id, which will be used to quickly identify +the LLC domain this thread prefers to run. This is similar to +numa_preferred_nid for NUMA balance. + +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 7 +++++++ + 3 files changed, 11 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 81d92e8097f5..ac4973728c3e 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1408,6 +1408,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index e557f622bd90..5fffbe766f57 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -188,6 +188,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f5075d287c51..94ad84ba19e1 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1267,6 +1267,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + if (!sched_feat(SCHED_CACHE)) + return; +@@ -1298,6 +1299,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; + } ++ ++ if (mm->mm_sched_cpu != -1) ++ mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-10-28-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-10-28-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch new file mode 100644 index 0000000..9f593d2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-10-28-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch @@ -0,0 +1,255 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from 
mgamail.intel.com (mgamail.intel.com [198.175.65.13]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 89D17157E6B + for ; Sat, 9 Aug 2025 05:10:13 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.13 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716215; cv=none; b=YUFlGTDaJF85FWib/Q7a+Wsp38xQlzoRV7AlyUsBs2T1Nm/D3GTDbCOLCu/JYT4Bz1kY7FBLiNzXW6SeRLGEdj+kiCkLEbPCq1Dkw53ko18P2N2wbe+qOYsR7L33XPzRdv8x7pin6JN4QQ3K2vGxPtYxzPr3f13C84cowPsbc0I= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716215; c=relaxed/simple; + bh=z06d0DcakiSO/sbAcrP1nzo0jya94AyoNYzp6vdKu+E=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Ie5cAVqHtvb8rhtiw/uv2b7q9XOHp9GEaOPJqeh1V5G10tNLgO252LhnRQe82GxwRDsA9JqO8Z/Pn7dGb6/9pzL+eDl4+d0jt7D8uohXyByy5gEgRhnGJ+jOPjC5jrJlsyqSb7bw9iDQtCUKZPX6LrQnQOAMGwjypK1vT60zh4Q= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=A43oUD+X; arc=none smtp.client-ip=198.175.65.13 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="A43oUD+X" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716214; x=1786252214; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=z06d0DcakiSO/sbAcrP1nzo0jya94AyoNYzp6vdKu+E=; + b=A43oUD+XrpEd//+VfIwxy0/clww1jBOSaqukWPErBvX/YdTyCblU2SZc + 2RzVMdCc8oIYfY7mAnN+bu4TlDneL5nuPMn8idWsUe/ibRoK5MwcfHrk1 + 4wf705GDHnZFwZzx7MaW2tVbkko9eMPuBBX9wEZV9YRSNvgsYWWcfkB9f + X09DYxiaF6aoyy46GVmca0RePk7ZqdJVl5uzZAHcWSo20QuUXb6HVtKNk + d843I5ITdrSq6lu1g0W1GAYdjZ+obzYIC4503sdpdA31Ura1IlPBWiexd + +xto3bCIJV99nwMfneStXQQK9UCe9VrSRX+40SkrUq9jsfJUv0KlPZ3A8 + Q==; +X-CSE-ConnectionGUID: plGWHn/OT2iWZBXIShk1/w== +X-CSE-MsgGUID: HnPTC8sbTiuVF1BIb62/Dg== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="68139903" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="68139903" +Received: from fmviesa008.fm.intel.com ([10.60.135.148]) + by orvoesa105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:10:13 -0700 +X-CSE-ConnectionGUID: UNc6FU+9Rn6IWN1WcGQpPA== +X-CSE-MsgGUID: H1SRXM8VR6yqBYvNRQaDcA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165891283" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa008.fm.intel.com with ESMTP; 08 Aug 2025 22:10:07 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 10/28] sched: Calculate the number of tasks that have LLC preference on a runqueue +Date: Sat, 9 Aug 2025 13:04:04 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Track for each run queue, the number of tasks that have a LLC preference +and how many of those tasks are running in its preferred LLC. This is +similar to nr_numa_running and nr_preferred_running for NUMA balance, +and will be used by the cache-aware load balancing in subsequent patches. + +Signed-off-by: Tim Chen +--- + kernel/sched/core.c | 12 +++++++++++ + kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 ++++++ + 3 files changed, 69 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index a5fb3057b1c4..a97a8039ce91 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -499,6 +499,18 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++#ifdef CONFIG_SMP ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++#else ++int task_llc(const struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 94ad84ba19e1..f964d5a44fcc 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1195,6 +1195,24 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ rq->nr_llc_running += (p->preferred_llc != -1); ++ rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ rq->nr_llc_running -= (p->preferred_llc != -1); ++ rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1303,8 +1321,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) ++ if (p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1408,6 +1429,17 @@ void init_sched_mm(struct task_struct *p) + work->next = work; + } + ++void reset_llc_stats(struct rq *rq) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (rq->nr_llc_running) ++ rq->nr_llc_running = 0; ++ ++ rq->nr_pref_llc_running = 0; ++} ++ + #else + + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, +@@ -1418,6 +1450,17 @@ void init_sched_mm(struct task_struct *p) { } + + 
static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++void reset_llc_stats(struct rq *rq) ++{ ++} + #endif + + static inline +@@ -3957,6 +4000,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); ++ account_llc_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } + #endif +@@ -3970,10 +4014,15 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) + #ifdef CONFIG_SMP + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ account_llc_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + #endif + cfs_rq->nr_queued--; ++ ++ /* safeguard? */ ++ if (!parent_entity(se) && !cfs_rq->nr_queued) ++ reset_llc_stats(rq_of(cfs_rq)); + } + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c37c74dfce25..8026e2c66e9f 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1106,6 +1106,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + #ifdef CONFIG_SMP + unsigned long last_blocked_load_update_tick; +@@ -1967,6 +1971,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++void reset_llc_stats(struct rq *rq); ++int task_llc(const struct task_struct *p); ++ + #ifdef CONFIG_SMP + + static inline void +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-11-28-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-11-28-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch new file mode 100644 index 0000000..9e101dc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-11-28-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch @@ -0,0 +1,217 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id ACB411DE3BE + for ; Sat, 9 Aug 2025 05:10:33 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716235; cv=none; b=lYq4sXXnPZXpun4uCiODi1V3d/oXpyY0WTO8EB77SDdlY6hzvunzTarb/1BsD6HfY68VzKdZ/P3gkg6eJSWY/V4zZO3lamlWGMLfuQQ9r0QtHrs78GolCQXrw2NP8BYJ0Ju1m5iSnICs6lWRkPa7xXiHxKjTT1AzsfOnMwn7rTs= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716235; c=relaxed/simple; + bh=mMOSEbE/HppEmtkHE5TulvZHAhAbaSvHHXGxGBzFcgU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=j3HRMLmVG1MFAS1qP6GNt/s6/77Uh+WddEprQ5Z3l8TxNvB+P4AvujK8AWnshyXFcGmTbARtQ+BIJgmdYZlbH9m8Qs+2XeS6vuXVoCzJlMxbKBnl0JQB9Z0xEIolcdu+YlhDSc69qnES+cWMESyBPPowQbtqjplsmzkh92xzVx4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=geKQjfK/; arc=none 
smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="geKQjfK/" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716234; x=1786252234; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=mMOSEbE/HppEmtkHE5TulvZHAhAbaSvHHXGxGBzFcgU=; + b=geKQjfK/ENQzxJIVdaSMSI44R3jzrRtlLAcLLVwYHZqNhRHgxmIbm8PU + XiWO/MC/iDc0e7d2LUmRe0CKRrydnkWRCT7V1d5Ru83rqB+7K/+RrZNhk + +2sQx23IAdQbusICOeU5sYoOB7pa5uDZu1oWLgGvhnJwEFa2V+2w+qxj2 + m6YwMTmZ4b38bn4agOoOn4ktTclSJFaj3Mp772dwNENS9tmK3L6FTfdSK + gtxcuwqrQMw1U7n0bVzf8SFtoDVy9euo9ZcsqmS67rcCdjvv8Sewo0TTC + 69ZCq755Kkj3SVTPSNh4ROx9trC8pZSNM1tAluNyLdBtgKLb9L4vwsHnZ + g==; +X-CSE-ConnectionGUID: gBIugcE8QvyHdtE7rR2INg== +X-CSE-MsgGUID: omeDEy9ySEOEIT5GfzHbDg== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57019934" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57019934" +Received: from fmviesa005.fm.intel.com ([10.60.135.145]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:10:33 -0700 +X-CSE-ConnectionGUID: 3IoQ+9+0SaCZvxJgCfv5ug== +X-CSE-MsgGUID: 3gf5XiXlTDy8oJdmBqQc4w== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169705062" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa005.fm.intel.com with ESMTP; 08 Aug 2025 22:10:27 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 11/28] sched: Introduce per runqueue task LLC preference counter +Date: Sat, 9 Aug 2025 13:04:18 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Each runqueue is assigned a static array, where each element indicates +the number of tasks preferring a particular LLC mapped to the +array index. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3 (indexed from 0 to MAX_LLC +across the entire system). With this information, the load balancer can +make better decisions to select the busiest runqueue and migrate tasks +to their preferred LLC domains. + +Note: The static array could be converted to an xarray in the future. 
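
A toy, self-contained sketch of the per-runqueue counters described above (after enqueuing two tasks that prefer LLC3, nr_pref_llc[3] reads 2); the toy_rq/toy_task types are invented for illustration and stand in for the real rq and task_struct accounting:

#include <stdio.h>

#define MAX_LLC 64

struct toy_rq   { unsigned int nr_pref_llc[MAX_LLC]; };
struct toy_task { int pref_llc_idx; };   /* -1 means no LLC preference */

static void account_enqueue(struct toy_rq *rq, struct toy_task *p)
{
	if (p->pref_llc_idx >= 0)
		rq->nr_pref_llc[p->pref_llc_idx]++;
}

static void account_dequeue(struct toy_rq *rq, struct toy_task *p)
{
	/* Guard against going negative, as the patch does. */
	if (p->pref_llc_idx >= 0 && rq->nr_pref_llc[p->pref_llc_idx] > 0)
		rq->nr_pref_llc[p->pref_llc_idx]--;
}

int main(void)
{
	struct toy_rq rq = { { 0 } };
	struct toy_task a = { 3 }, b = { 3 }, c = { -1 };

	account_enqueue(&rq, &a);
	account_enqueue(&rq, &b);
	account_enqueue(&rq, &c);               /* no preference: not counted */
	printf("tasks preferring LLC3: %u\n", rq.nr_pref_llc[3]); /* 2 */
	account_dequeue(&rq, &a);
	printf("tasks preferring LLC3: %u\n", rq.nr_pref_llc[3]); /* 1 */
	return 0;
}
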
+ +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 1 + + 2 files changed, 41 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f964d5a44fcc..cfae71ee870b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1195,22 +1195,51 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static inline int pref_llc_idx(struct task_struct *p) ++{ ++ return llc_idx(p->preferred_llc); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_feat(SCHED_CACHE)) + return; + + rq->nr_llc_running += (p->preferred_llc != -1); + rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ ++rq->nr_pref_llc[pref_llc]; + } + + static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_feat(SCHED_CACHE)) + return; + + rq->nr_llc_running -= (p->preferred_llc != -1); + rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ /* avoid negative counter */ ++ if (rq->nr_pref_llc[pref_llc] > 0) ++ --rq->nr_pref_llc[pref_llc]; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +@@ -1279,6 +1308,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); + } + ++static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { +@@ -1321,7 +1352,9 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) { ++ /* task not on rq accounted later in account_entity_enqueue() */ ++ if (task_running_on_cpu(rq->cpu, p) && ++ p->preferred_llc != mm_sched_llc) { + account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; + account_llc_enqueue(rq, p); +@@ -1431,11 +1464,16 @@ void init_sched_mm(struct task_struct *p) + + void reset_llc_stats(struct rq *rq) + { ++ int i; ++ + if (!sched_feat(SCHED_CACHE)) + return; + +- if (rq->nr_llc_running) ++ if (rq->nr_llc_running) { ++ for (i = 0; i < MAX_LLC; ++i) ++ rq->nr_pref_llc[i] = 0; + rq->nr_llc_running = 0; ++ } + + rq->nr_pref_llc_running = 0; + } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 8026e2c66e9f..4464b92767ad 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1109,6 +1109,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int nr_pref_llc[MAX_LLC]; + #endif + #ifdef CONFIG_NO_HZ_COMMON + #ifdef CONFIG_SMP +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-12-28-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-12-28-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch new file mode 100644 index 0000000..3436e02 --- /dev/null +++ 
b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-12-28-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch @@ -0,0 +1,147 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id CE60C2472BA + for ; Sat, 9 Aug 2025 05:10:49 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716251; cv=none; b=ADP9uISEjTXXmrZUS4riveuGfD0WO35c5rz53HhmhWNMrD8Dv6NEgTvyjQYnfTc2u3LwbGoaDVNp6SSxFFbbtfw/FH7XlH762INSbgUPWTafzzs+ATW6FN2x9nPTJmp96ZH+mnf1JNlGrm30zuWW2dvWodZS8ErATrdpAPxPp5I= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716251; c=relaxed/simple; + bh=aqSPvY6s7QQr//GlOU5D+JTqKry164SNV/VZUR7Kspk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=t/5xtGUzGrKxeHEjmp10AAMihG56pqeTBrBMPkuq4PHQwrU2AfkoQsDB4Y9TtiCy7GzKfao5LjE7hgoUByuq8rYZKjatFGSKo04f1EvdsfMKnnI+kvm1KPBegBNQVntmqPVlcdaoa8X2w/mUXe+QOHkdUYdt1Sj1Q17LdJWTtvM= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=hnHR/vdK; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="hnHR/vdK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716250; x=1786252250; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=aqSPvY6s7QQr//GlOU5D+JTqKry164SNV/VZUR7Kspk=; + b=hnHR/vdKO7tdLOhgecBrJ/L6XtQ1BO8dVbYIB82zj+iOPcT1jV8xsoaQ + qVvNm13ubKRb17a6vcXBW7sO0sqESiBXEDAA3LOy6nrman47fi+cJF4GK + /APwE4CIXL2nHpsyP/5wUxfP2JBnuaAMw69BpWv3yxh4gHtAvKYTTGlqu + yYAwTm4DfQnzeKlTLUohbS6ngMAvDbvMCBqHRIliyvHmG2k+p5eyDajr2 + YczcwUmYJan/0n1K6JWf7awWcuI7A08+OPYkjN3MABdDHrz0f/nYtETcb + muTOyFAlK1kHzNTlu61gC8CfjRqlA8omMF+AZVXL7YPft7aD6yFC9a32S + Q==; +X-CSE-ConnectionGUID: 5tGpMziNRzOAuvg9CBzp+w== +X-CSE-MsgGUID: RdOEUAbzTDSASRaq4EwAIw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57019960" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57019960" +Received: from fmviesa005.fm.intel.com ([10.60.135.145]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:10:49 -0700 +X-CSE-ConnectionGUID: DegXMsyUTHGOiD9Y2cNnrw== +X-CSE-MsgGUID: A+KDxoubRjeA2xjTkEMmBQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169705082" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa005.fm.intel.com with ESMTP; 08 Aug 2025 22:10:44 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 12/28] sched: Calculate the total number of preferred LLC tasks during load balance +Date: Sat, 9 Aug 2025 13:04:39 +0800 +Message-Id: <4145385d4ce232e10cae713c8449d459c325db46.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +During load balancing between LLCs, gather the number of tasks +on each runqueue of a source LLC. + +For example, consider a system with 4 sched groups LLC0, LLC1, +..., LLC3. We are balancing towards LLC3 and LLC0 has 3 tasks +preferring LLC3, LLC1 has 2 tasks preferring LLC3 and LLC2 has +1 task preferring LLC3. LLC0 with most tasks preferring LLC3 +will be chosen as the busiest LLC to pick the tasks from. + +The number of tasks preferring the destination LLC are gathered +from each run queue for a source LLC. + +For example, consider the sched_group LLC0 with two CPUs, CPU0 +and CPU1. On CPU0, 2 tasks prefer to run on LLC3, and on CPU1, +one task prefers LLC3. The total number of tasks preferring +LLC3 in LLC0 is 2 + 1 = 3. + +These statistics enable the load balancer to select tasks from +a sched_group that best aligns tasks with their preferred LLCs. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cfae71ee870b..f1697658c3b8 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10496,6 +10496,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc[MAX_LLC]; ++#endif + }; + + /* +@@ -10974,6 +10977,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_feat(SCHED_CACHE)) { ++ int j; ++ ++ for (j = 0; j < max_llcs; ++j) ++ sgs->nr_pref_llc[j] += rq->nr_pref_llc[j]; ++ } ++#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-13-28-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-13-28-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch new file mode 100644 index 0000000..8b9b9c1 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-13-28-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch @@ -0,0 +1,177 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4717515665C + for ; Sat, 9 Aug 2025 05:11:54 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; 
d=subspace.kernel.org; s=arc-20240116; + t=1754716315; cv=none; b=V21sVGbgLQDQ3LIjl4XUJrmJ9J0H7EnLAxMhWiHDhYvdTQf37ITQq1SLcLqEU8QZBleGBd5opQCrWn9ZA+ka3UsL0gAkAAWeYPjzH6uXv6zStuJq71dJWgYewm9hUHjq7qSX6lm/Lgw0QNQVBR235FBzMEr5TtJzqy9vbdAZgM8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716315; c=relaxed/simple; + bh=AIuQrSu/5y/S9TIWqFOGCV+S6deOxyVrovvWy4SL2kw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=OloMkMGj39oyisSI/elKEAvpy20hxc65DCBP77XOsD6DM1R+a5A5Da/ppqs7Z7p3GBtilMBdNOBeSCw4CLzWnpi/GbZs0twrI/jHBj75g3QWh2U+l9MnhNx5slK4wgHsVE88KJtcO1SMNVHstEsw3B+CP/Ty6sI5OGUAf63XtLk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ZuSndoFC; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ZuSndoFC" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716314; x=1786252314; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=AIuQrSu/5y/S9TIWqFOGCV+S6deOxyVrovvWy4SL2kw=; + b=ZuSndoFC4541nSvbCP6cEW/FYV1l8zzkJP80vf6pWY1ZOHgg/RVomY/T + Ti6yNCB58L0Sd4kxkaI5sPLuF7vCi3M5pS05/bm13pSUnPEFFjtsLUyj2 + f9POeHNiXdDCLJm6AbH5YNMJHTMKKxVu+wmWQOJyUA2JQAjfjhd4Y2l7i + hwHCcBSXZp9fOvMWTMVoUR8/ktX+69hF3c7sKUUgsZ3Ez6EVCyQ/ijbHd + VIyS49HPzpJp2UXMWArIqJMLsn/1xm4WXUpNNwWGCUSD+Ru4vQomDx8lv + I+sL/FNJq4W7oKMEvp16XyroY9GrTk18XR9yqRbCDZM1KnkZXCppQk2n8 + g==; +X-CSE-ConnectionGUID: UjPqhATKTouUNB6GDkUddg== +X-CSE-MsgGUID: rAOe+a/mQeqiUy/XHrFejQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56259917" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56259917" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:11:53 -0700 +X-CSE-ConnectionGUID: WVRUnOqXSoeMjZHQRQuF4Q== +X-CSE-MsgGUID: AGpfHQaOSwWVkoL9IadjUw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165475971" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:11:48 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 13/28] sched: Tag the sched group as llc_balance if it has tasks prefer other LLC +Date: Sat, 9 Aug 2025 13:05:44 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +During load balancing between LLCs, check whether there are tasks +preferring the destination LLC. If so, balance those tasks to the +destination LLC first. + +Tag the sched_group that has tasks preferring to run on other LLCs +(non-local) with the group_llc_balance flag. This way, the load +balancer will later attempt to pull/push these tasks to their +preferred LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f1697658c3b8..30ebc7d1b999 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10491,6 +10491,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10855,6 +10856,43 @@ static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, + return false; + } + ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. 
++ */ ++#ifdef CONFIG_SCHED_CACHE ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain *child = env->sd->child; ++ int llc; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ /* only care about task migration among LLCs */ ++ if (child && !(child->flags & SD_SHARE_LLC)) ++ return false; ++ ++ llc = llc_idx(env->dst_cpu); ++ if (sgs->nr_pref_llc[llc] > 0 && ++ _get_migrate_hint(env->src_cpu, env->dst_cpu, ++ 0, true) == mig_allow) ++ return true; ++ ++ return false; ++} ++#else ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} ++#endif ++ + static inline long sibling_imbalance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sg_lb_stats *busiest, +@@ -11037,6 +11075,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + update_sg_if_llc(env, sgs, group); ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (!local_group && llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-14-28-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-14-28-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch new file mode 100644 index 0000000..c2d89bc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-14-28-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch @@ -0,0 +1,181 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9DFDCBA36 + for ; Sat, 9 Aug 2025 05:12:08 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716330; cv=none; b=aQ1DuKtufh+Odr/LorF9O/fM51njWURpIhUr0LEbL04DjrjFm+d09C28slCF+44U9FtSmiimp1pCffRh1R/fvm2ZZoxIwYLoRUwe6OyK5cj/+TCojsQvg2MTs+TBXvkqNEX58rHKytdwSZsbXZQ7b/69UrXdxM0ua0rXL6iOtQ0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716330; c=relaxed/simple; + bh=O81QutAo6orLIIDwAsw0ZMG10IN4T/AEDQCSSIJxqyQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=U5xozr9Mtp5uu2TvfPRuTYtKe4U1Wvg6adAEEQYPp3XhL43lFnMD5WFxuPpA676GFAHZvR3sqXgli2n0l/wcn29K4BggdO2CKIje4lN3tYfPeo9MARoA5x4puu9zfIoLrFm8QyrapPsLUWke2Sltghaenw5fxTvdXdEsap9QY/I= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MEuihH0+; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: 
smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MEuihH0+" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716328; x=1786252328; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=O81QutAo6orLIIDwAsw0ZMG10IN4T/AEDQCSSIJxqyQ=; + b=MEuihH0+FVuXmARwKbtg3ZEInFXwZofL2oeZFz8hAFncHiYR3thGt37w + r1T1tf3lEswVj9+r/pBIBCI/p6tNeK/mU9z44eSxhHK5hkAbm4U0pK2Yd + kJSRq/e/BjJbTfq0wdmHL+xZeKBs3wZgVSLF/cjxbK8xkGolfMsNQtISm + W3cnADlN4qbVKCTxsMFINRPYiR7F/yD1Oj5rfzQ0wt0MvGxsmxI+X3NFf + 9X2WdZwnrcjBg1uRuKw5Ke2i5+i08CMFggHeD7mmAh+bcE6otZUr5bfyT + 1/s8kOVFgN6mguEbs8JSr9oz3bIDVZO52aj/iKxcnT/mwWA+dnZZTC1UF + g==; +X-CSE-ConnectionGUID: 7CDD+sx7Si6bye2RdBQ3gA== +X-CSE-MsgGUID: jib9w3+fQbenCvLznD37kQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56259932" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56259932" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:12:07 -0700 +X-CSE-ConnectionGUID: DOXrAiV0Se6h6BcziixHkw== +X-CSE-MsgGUID: PSBpAuryS0CRK6OjhNz0bw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165475987" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:12:02 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 14/28] sched: Introduce update_llc_busiest() to deal with groups having preferred LLC tasks +Date: Sat, 9 Aug 2025 13:05:58 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +The load balancer attempts to identify the busiest sched_group with +the highest load and migrates some tasks to a less busy sched_group +to distribute the load across different CPUs. + +When cache-aware scheduling is enabled, the busiest sched_group is +defined as the one with the highest number of tasks preferring to run +on the destination LLC. If the busiest group has llc_balance tag, +the cache aware load balance will be launched. + +Introduce the helper function update_llc_busiest() to identify +such sched group with most tasks preferring the destination LLC. 
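The selection rule described above can be condensed into a short stand-alone C sketch. This is illustrative only and not part of the upstream series: the struct and field names here are hypothetical stand-ins for the per-group counters that the series keeps in sg_lb_stats->nr_pref_llc[], as shown in the hunks below.

/*
 * Illustrative sketch: among candidate source groups, keep the one with
 * the most tasks preferring the destination LLC; groups without the
 * llc_balance tag are skipped. Names are hypothetical.
 */
struct group_stats {
	unsigned int nr_pref_dst_llc;	/* tasks preferring the destination LLC */
	int llc_balance;		/* group has tasks preferring another LLC */
};

static int pick_llc_busiest(const struct group_stats *groups, int nr_groups)
{
	unsigned int most = 0;
	int i, busiest = -1;

	for (i = 0; i < nr_groups; i++) {
		if (!groups[i].llc_balance)
			continue;
		if (groups[i].nr_pref_dst_llc > most) {
			most = groups[i].nr_pref_dst_llc;
			busiest = i;
		}
	}
	/* -1 means no such group; fall back to the normal busiest selection */
	return busiest;
}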
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 30ebc7d1b999..b8cc85291351 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10885,12 +10885,36 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ int idx; ++ ++ /* Only the candidate with llc_balance need to be taken care of */ ++ if (!sgs->group_llc_balance) ++ return false; ++ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. ++ */ ++ idx = llc_idx(env->dst_cpu); ++ return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx]; ++} + #else + static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + static inline long sibling_imbalance(struct lb_env *env, +@@ -11122,6 +11146,14 @@ static bool update_sd_pick_busiest(struct lb_env *env, + sds->local_stat.group_type != group_has_spare)) + return false; + ++ /* deal with prefer LLC load balance, if failed, fall into normal load balance */ ++ if (update_llc_busiest(env, busiest, sgs)) ++ return true; ++ ++ /* if there is already a busy group, skip the normal load balance */ ++ if (busiest->group_llc_balance) ++ return false; ++ + if (sgs->group_type > busiest->group_type) + return true; + +@@ -12029,9 +12061,11 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. ++ * Also do so if we can move some tasks that prefer the local LLC. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-15-28-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-15-28-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch new file mode 100644 index 0000000..2bd9a06 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-15-28-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch @@ -0,0 +1,191 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.19]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7EC501DE3BE + for ; Sat, 9 Aug 2025 05:12:37 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.19 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716359; cv=none; b=QbiGTyG3g2KE4hVHW5ANIWTpqVt3p0S/zxdB0gitjC7ulgs6vf7jEjdLirjHI7kaK+ztqqN/rCqTa/6hfausklfNc4rz8dYqp0CG6Y6YgxkKUwvBQIrp/KqxhxJRz+O3v6tp0XTYU16LxAdHr+C5BFN93tNkPr76wtjETRyaMaw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716359; c=relaxed/simple; + bh=yGnpZ6FlvoEQkStNXEnYs/+BqL2dRHGniBRSc7+bEyI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=m1yPm73VBI2YcbcZuIT/BdavdjC/l3BdlBwnghR8mggxNfB23vX6iBYsgIl3jTAM5i/J7m6oqk0xrxs69iqlLFQ90jQ8hbG1d9nYsXQ1qBKpwmo+MO9karhCFAgqXxtek2+Fw6jBNSgn4f3uTQlC2jV9TQHxB0EG1HVMEE5ZgXA= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=TEnIOB1/; arc=none smtp.client-ip=198.175.65.19 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="TEnIOB1/" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716358; x=1786252358; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=yGnpZ6FlvoEQkStNXEnYs/+BqL2dRHGniBRSc7+bEyI=; + b=TEnIOB1/ILlwqYFTI6kKipRVVwx6DRo7Bg6DA7Rv5STVaUTCs0KWpJvq + Z6xP+XRjxHgLpyN1tALXTPQatPAmzpP8yoIO81oaWxQRxnjTqFILCkrJF + kCtJQ/VZjCEhVc7wgV23PMjUStSCtMH0P9OW3KNu8Za5Pnw7tMZySzv3t + NECmaJZ84sMsSF0CtDUsuTqG088mMNcuu4rS+3dzRuIJxgZ7St1Ds47Z9 + 1QQxPYZElaPgiDkZBfePvDHM5kCX7XSWYlmCxqKtAeHU7eYkQHevmw80h + wEnmX47OYGWTIQssLuDSa8NOAiVjB27DwhzduhCcmV9MOJn0mr3hsf+ZP + A==; +X-CSE-ConnectionGUID: lvcsmsR8QjyFBqcujWQmiQ== +X-CSE-MsgGUID: bzzKp3XeT8WmtT0FxfRmoA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56932532" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56932532" +Received: from orviesa006.jf.intel.com ([10.64.159.146]) + by orvoesa111.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:12:37 -0700 
+X-CSE-ConnectionGUID: cebyh5CzSdq9bdRIx+9UHw== +X-CSE-MsgGUID: 291hr3FsSYCIyZnbCahMKA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="164703738" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa006.jf.intel.com with ESMTP; 08 Aug 2025 22:12:31 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 15/28] sched: Introduce a new migration_type to track the preferred LLC load balance +Date: Sat, 9 Aug 2025 13:06:27 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Introduce a new migration type named migrate_llc_task to facilitate +cache-aware load balancing. + +After the busiest sched_group is identified as the one that needs +migration due to having most tasks preferring destination LLC, tag the +migration type as the newly introduced migrate_llc_task. During load +balancing, each runqueue within the busiest preferred-LLC sched_group +is checked, and the runqueue with the highest number of tasks preferring +to run on the destination CPU is chosen as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 37 ++++++++++++++++++++++++++++++++++++- + 1 file changed, 36 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b8cc85291351..a301b56dd2b4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9746,7 +9746,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10180,6 +10181,15 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ /* ++ * Since can_migrate_task() succeed, when we reach here, it means that p ++ * can be migrated even if dst_cpu is not p's preferred_llc, because there ++ * are no idle cores for p to do in-llc load balance. 
++ */ ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11817,6 +11827,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12125,6 +12144,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12233,6 +12256,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_idx(env->dst_cpu); ++ if (!cpus_share_cache(env->dst_cpu, rq->cpu) && ++ busiest_pref_llc < rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; +@@ -12415,6 +12448,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-16-28-sched-Consider-LLC-locality-for-active-balance.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-16-28-sched-Consider-LLC-locality-for-active-balance.patch new file mode 100644 index 0000000..8b05b84 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-16-28-sched-Consider-LLC-locality-for-active-balance.patch @@ -0,0 +1,190 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.19]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C82CABA36 + for ; Sat, 9 Aug 2025 05:12:53 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.19 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716375; cv=none; b=aS7CLs0yVWKaF6OGVkXK8ZAgy8yUgUakdznwWDtZD4N9gQ9eA7tWRxrHj1IeAjaqTL8M+VmHHPvI8FEcOuDBcfH3oVpULXvb4/xFnoBCpg/mVg6MCRCvDJrLWdumxn7wi15V2NyagC2GII5gOWOj3odj3IWvyB3Ywa1aJoBDB/I= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716375; c=relaxed/simple; + bh=fRnw1t3Rh9UYvT00ArjMcNIiB37mwFZbA2eVDMCUX9M=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=RJUo8ambmNBfq+wvLRlOCmiE/W7wpbkuJF7yL9JEQZ65V62F8oCjVmz3qVgLvkV3PLO6TzjT6umm4UV9UMY1fHNephBb+kWs8mVMmZ6rPjQkthPxxV8sRM5GBZAKF/4w8+2Bp7vO3sUeIwn+6xlZ35XOq6ECCeBWwM5GJsRHcEU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=DO7eB20K; arc=none smtp.client-ip=198.175.65.19 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com 
+Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="DO7eB20K" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716374; x=1786252374; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=fRnw1t3Rh9UYvT00ArjMcNIiB37mwFZbA2eVDMCUX9M=; + b=DO7eB20K/f/MVe9m5JNfhy4n46O8ccIoioHjrTsbuwZlLpqzzksjVlAv + FSWzb4JVfyBHYiQaKCDVfK0CkNncYJ22CpRHB1RwD3zGRfwsq6x9aCRQH + TKpWNRyQsj3e8nZiDG6U1hLeWNbIKj/X6uKv56QKiYVXkZeKHyR4Zqnxi + U5rknviHlsICE9lsjONRBpod6oRa32YfBF17V6dJ4X6Vo8cglEVlG/FKh + aqYrgjA98DE4rWoYD97vabGK2LMtYuZO47cKW4wuRsI+yu4gtqm55Wgcy + 1wWRnlj7aVuWb9SYbQgGx83xtwUCYP9X9i36gO7Eb2NFdoWyrm8YyDqHa + g==; +X-CSE-ConnectionGUID: 0oJuosNIQM23ggPBmbF4UQ== +X-CSE-MsgGUID: LCnyHPYNSYquGLfPEsgHmQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56932555" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56932555" +Received: from orviesa006.jf.intel.com ([10.64.159.146]) + by orvoesa111.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:12:53 -0700 +X-CSE-ConnectionGUID: 0pVJsl26T9OrBTO7kMP8BQ== +X-CSE-MsgGUID: o19YVeMBQCulANK7Q6DXLA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="164703753" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa006.jf.intel.com with ESMTP; 08 Aug 2025 22:12:47 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 16/28] sched: Consider LLC locality for active balance +Date: Sat, 9 Aug 2025 13:06:42 +0800 +Message-Id: <38d036cf946223b46a20ad60ccf13f9dcb316240.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +If busiest run queue has only one task, active balance is enlisted +to actually move the task. However, before moving the task, +we should consider whether we are moving the task from its preferred +LLC. + +Don't move the single running task in a run queue to another LLC, if +we are moving it from its desired LLC, or moving it will cause too much +imbalance between the LLCs. 
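The guard described above can be summarised by a small stand-alone C sketch. It is illustrative only; the parameter names are hypothetical stand-ins for the runqueue state that break_llc_locality() in the hunk below reads from env->src_rq and the migration hint helper.

/*
 * Sketch: refuse active balance when every runnable task on the source
 * runqueue already prefers the source LLC and either the queue holds a
 * single task or the migration hint forbids moving load across LLCs.
 * Names are hypothetical; see break_llc_locality() below.
 */
static int keep_task_in_preferred_llc(unsigned int src_nr_running,
				      int all_prefer_src_llc,
				      int hint_forbids_migration)
{
	if (!all_prefer_src_llc)
		return 0;

	return src_nr_running <= 1 || hint_forbids_migration;
}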
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 48 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a301b56dd2b4..592a4034e760 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -12332,10 +12332,43 @@ imbalanced_active_balance(struct lb_env *env) + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return 0; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return 0; ++ /* ++ * All tasks want to stay put. Move only if LLC is ++ * heavily loaded or don't pull a task from its ++ * preferred CPU if it is the only one running. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable && ++ (env->src_rq->nr_running <= 1 || ++ _get_migrate_hint(env->src_cpu, env->dst_cpu, ++ 0, false) == mig_forbid)) ++ return 1; ++ ++ return 0; ++} ++#else ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return 0; ++} ++#endif ++ + static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12355,7 +12388,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +@@ -12800,9 +12834,20 @@ static int active_load_balance_cpu_stop(void *data) + goto out_unlock; + + /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; ++ if (busiest_rq->nr_running <= 1) { ++#ifdef CONFIG_SCHED_CACHE ++ int llc = llc_idx(target_cpu); + ++ if (!sched_feat(SCHED_CACHE)) ++ goto out_unlock; ++ ++ if (llc < 0) ++ goto out_unlock; ++ /* don't migrate if task does not prefer target */ ++ if (busiest_rq->nr_pref_llc[llc] < 1) ++#endif ++ goto out_unlock; ++ } + /* + * This condition is "impossible", if it occurs + * we need to fix it. 
Originally reported by +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-17-28-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-17-28-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch new file mode 100644 index 0000000..d548fb9 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-17-28-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch @@ -0,0 +1,201 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.20]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 58996226CFC + for ; Sat, 9 Aug 2025 05:13:07 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.20 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716388; cv=none; b=ToNlzHkLLYiYeK8T8BsAhwwVTQMjfVDgwARwIDYa+wyZ8Eu38JHaFmFWhEHDz8Y4QXb3R7dulNjX2NJYnlkPmQ0FB+POvt2GUZ/4GvwbMKz42XqpgP66/Git+tq6B67e0BFqrfwmVgwWN0fqYa7Y2mT9Jw28QbyFm7zODtPY6sc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716388; c=relaxed/simple; + bh=sTSG237di6kHrTi+M/LVG5ENiqilE30WO0gE5TPh2Qg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=k07/zqOO54KVW2SXRP1BcFCx9O6eoCI0J3Yg2JN4fa4l56WOxmJSEvLVg2Qq4TaVlBV1mD6qAvItCmcERw1UNU4TYSdIrJB+dIamh6hR7WzT6I/vQu1VEkz4aED2Kp/nidg5cbmW5fT2HqFSYZjL/i79XRYjNuUmf9W31GhMzEo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=G3ntFCoX; arc=none smtp.client-ip=198.175.65.20 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="G3ntFCoX" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716387; x=1786252387; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=sTSG237di6kHrTi+M/LVG5ENiqilE30WO0gE5TPh2Qg=; + b=G3ntFCoXyBZCYeCFd2XdSpKtKywszwsAL7iG155Ga7pmma4DufkDAk1j + J0oIixy6CX3G2NDetf51jJCmgOaHTwM5/Zyy62tX553kTkCWxQYisiVUg + 1tLtppV/kH9sI0k6oKldvrjqqgkVdJpDQWrsW6zAURpZZQre0+t9sB2DH + giDN1ULvFcnaQhebg6L8k2Sk3KyDkVnyIgPtuntXxM6AYlGGbknUXkX/S + gDEJ0HpmTVhkCqcyfcxY/dueuq+yt+7fItoY/olEIlSogszYxJCyn99+x + O91JFgRzp1p1is0mJ7huD6m3c93Mm1gkIWBIs7CmYnQAnn6h0ZqAcvxVv + Q==; +X-CSE-ConnectionGUID: RA52ecx/Ti+wEdDxJMLnkg== +X-CSE-MsgGUID: /5YdSh1jQj+Ywel/XofIvA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56768552" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56768552" +Received: from fmviesa002.fm.intel.com ([10.60.135.142]) + by orvoesa112.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:06 -0700 +X-CSE-ConnectionGUID: +bCrWry6RQe4Sex8ybOxpA== +X-CSE-MsgGUID: IbTHoJawTVyJKtMAWCm2+g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="189180343" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa002.fm.intel.com with ESMTP; 08 Aug 2025 22:13:01 -0700 
+From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 17/28] sched: Consider LLC preference when picking tasks from busiest queue +Date: Sat, 9 Aug 2025 13:06:58 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +When picking tasks from busiest queue for load balance, we currently +do not consider LLC preference. + +Order the task in the busiest queue such that we picked the tasks in the +following order: + 1. tasks that prefer dst cpu's LLC + 2. tasks that have no preference in LLC + 3. tasks that prefer LLC other than the ones they are on + 4. tasks that prefer the LLC that they are currently on + +This will allow tasks better chances to wind up in its preferred LLC. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 592a4034e760..8d5792b9e658 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10093,6 +10093,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. ++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". 
+@@ -10101,7 +10163,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10120,6 +10182,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-18-28-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-18-28-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch new file mode 100644 index 0000000..7e42bb5 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-18-28-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch @@ -0,0 +1,163 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.20]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B1D46274B39 + for ; Sat, 9 Aug 2025 05:13:19 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.20 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716401; cv=none; b=gwUxPhraVHLm64bXQSb1oNwdX318HEEFGQP3NJIRjG0ej1HembqLwL/AMdMnKs2idXx3KEfcQggsIlJeGxPd86ymVhFs/rlGwCRgO+oHKZRTtPkeotIYE6Skr2Z90a1CPa/LNaWTM1XdHnwRmA1ybF/xRMbrR1KLLpNUD058xfI= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716401; c=relaxed/simple; + bh=s+rR7wC7UnAbjuxbPj5c6/L98TvboEiREJprAaugmo8=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Ex3plSrAdkKiiWgy1wFo/qOyi1KRMrAmvUSGgR+Sl7c+uHNitZ3FkYOzLvNN2Kk7bcRKDLWQu/zjHkM0B/ktpD3735kBxpJu0PJ8IZ58b18B0w/r7VUcJthi/o2hdqN358rDx5jLpz4Z9VU9UsOwUlIB1nTwdFObyEGsDbo1DD0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ZNX9P8Zh; arc=none smtp.client-ip=198.175.65.20 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ZNX9P8Zh" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716400; x=1786252400; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=s+rR7wC7UnAbjuxbPj5c6/L98TvboEiREJprAaugmo8=; + b=ZNX9P8Zhh4or6QRvyIGlsqU0K0XXUdzdv+gCoKT4EcsJWyPrUrrpjeh6 + Qmj8TTDc2Q3+gYj6uSTMIaEUdV5BvlkAcN9NnwrPfjzZslpxwRyFTPJTx + KtvP4Sp2C8p5ushx8yObLd6nOXcFZSnue19p3r5NoF227rlrE4GYeUWKq + dM+U0/Nq/0qZLmHe33WFqOXqLI4gmE0PevCwc5pjj8qUenPxHW1kXvWF+ + 3fvMGqOlhGBBzPmI9Nt4so8fHdQ0chc/atY+kOpU5fgp8EHxTRIRoH0Lz + 6bGZ08Lr6XkBPeiz62J27S7cC3PWHkwpEOh1gP4JN0l7CxroHUJGaT1Ve + g==; +X-CSE-ConnectionGUID: b1FzoNUSROqCeN8PP7hLeA== +X-CSE-MsgGUID: PpjmRF1hSJmGHpUlPRMqJA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; 
a="56768565" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56768565" +Received: from fmviesa002.fm.intel.com ([10.60.135.142]) + by orvoesa112.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:19 -0700 +X-CSE-ConnectionGUID: TVpwBJFnSOm4LFGDl/UL7A== +X-CSE-MsgGUID: vYNYzwHuTSmQtfe59f1VcQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="189180352" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa002.fm.intel.com with ESMTP; 08 Aug 2025 22:13:13 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 18/28] sched: Do not migrate task if it is moving out of its preferred LLC +Date: Sat, 9 Aug 2025 13:07:10 +0800 +Message-Id: <081010e2c9cd8f4b3c9aa6d1b98fbe9438cd3c06.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +In the final step of task migration during load balancing, +can_migrate_task() is used to determine whether a task can +be moved to the destination. If the task has an LLC preference, +consider this preference when moving it out of its preferred LLC. +With this check in place, there is no need to retain the task's +cache-hot CPU check in task_hot(); remove it accordingly. + +Besides, add more checks in detach_tasks() to avoid choosing +tasks that prefer their current LLC. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 28 +++++++++++++++++----------- + 1 file changed, 17 insertions(+), 11 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 8d5792b9e658..22b7a7fe538e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9822,17 +9822,6 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (sysctl_sched_migration_cost == 0) + return 0; + +-#ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) { +- /* +- * XXX things like Skylake have non-inclusive L3 and might not +- * like this L3 centric view. What to do about L2 stickyness ? 
+- */ +- return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ > +- per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ; +- } +-#endif +- + delta = rq_clock_task(env->src_rq) - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +@@ -10029,6 +10018,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_feat(SCHED_CACHE) && ++ get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10289,6 +10284,17 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + break; + ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Don't detach more tasks if remaining tasks want to stay: ++ * The tasks have already been sorted by order_tasks_by_llc(), ++ * they are tasks that prefer the current LLC. ++ */ ++ if (sched_feat(SCHED_CACHE) && p->preferred_llc != -1 && ++ llc_id(env->src_cpu) == p->preferred_llc) ++ break; ++#endif ++ + continue; + next: + if (p->sched_task_hot) +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-19-28-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-19-28-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch new file mode 100644 index 0000000..29bccbd --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-19-28-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch @@ -0,0 +1,194 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.9]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0E605BA36 + for ; Sat, 9 Aug 2025 05:13:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.9 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716413; cv=none; b=Y5DY4uVXJj/NXx4xa+vIglUy/e0Mz3lC23M4GrgHNX8VTYhXYab2lfrFY6mo9TrkT8w/WQHHy0ath+3g82U7f7w+5f8oq86hgIXxPEP3isbuKS9ryw2kjNuMOw6y8wqF8EHfhI4CaEef8Gm0ym0TuNTWKHvhA39IckSeqYVQ55Q= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716413; c=relaxed/simple; + bh=rJfyf/57CaUPFk3O70t1/xBAt4kiiSb/LUukMkIaaWI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=BbGezSEvMdgtuVQGY5ll4boKLDz3xf/HaQKe/BbQh2RvAxEla/bMNlbvy7fNRiyq6jqUio9sHSAT2xUSQfMGzvU0gbx6uABWuPs54UhyitX7QqsGpuhPSoWaqfjMJ+JchZxUZEza6Pv3LtvE1xF1YB8vYubCHFqLzoIDWZeEypI= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Fv6T2jfF; arc=none smtp.client-ip=198.175.65.9 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Fv6T2jfF" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716412; x=1786252412; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=rJfyf/57CaUPFk3O70t1/xBAt4kiiSb/LUukMkIaaWI=; + b=Fv6T2jfFSgH2501Wvsd9NAJpLZ95G3qQ2wpS+brwkLJI4Z9OwbGNI010 + TXV31asmWoF9+Q+nOOHJAmbtTXBwCofZZc4StvBRsyudecftX7Wk1PRur + u7QKz2FJNo0ci4Owq3KzhsOU/Zu+KxpkANT6PW233G4v7L1dfPrJmuTsr + kc0L6AcVmncnIjhBRuMo0p6BD/uY9llqRtu1k1OnH8I9Jcei+J4SP8kPW + Qrss/vUTJVcjLGz++sDQq5rXiSF8X5srU5tRisTDzgzNNua10vFLKF5a/ + +MoDPmHcbzf2Z88/IRgSh6BUtg+yyb4sVeKEc69v7SY3AfaN4pHBsvMby + w==; +X-CSE-ConnectionGUID: yRiWx0TaSqSHcni4YwFu1Q== +X-CSE-MsgGUID: KOLdWXj6RXq5qKElN0Zavw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="79620355" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="79620355" +Received: from fmviesa009.fm.intel.com ([10.60.135.149]) + by orvoesa101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:31 -0700 +X-CSE-ConnectionGUID: 66tPf5k4Q3+atsnA5jm4Jg== +X-CSE-MsgGUID: 4d6H20jNRjW/omJZg3lPVw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165844013" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa009.fm.intel.com with ESMTP; 08 Aug 2025 22:13:26 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 19/28] sched: Introduce SCHED_CACHE_LB to control cache aware load balance +Date: Sat, 9 Aug 2025 13:07:23 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Introduce the SCHED_CACHE_LB sched feature to enable or disable +cache aware load balance in the schduler. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 18 ++++++++++-------- + kernel/sched/features.h | 1 + + 2 files changed, 11 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 22b7a7fe538e..9843d4e1d84f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10019,7 +10019,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + return 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && + get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) + return 0; + #endif +@@ -10105,7 +10105,7 @@ static struct list_head + LIST_HEAD(no_pref_llc); + LIST_HEAD(pref_other_llc); + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return tasks; + + if (cpus_share_cache(env->dst_cpu, env->src_cpu)) +@@ -10290,7 +10290,8 @@ static int detach_tasks(struct lb_env *env) + * The tasks have already been sorted by order_tasks_by_llc(), + * they are tasks that prefer the current LLC. 
+ */ +- if (sched_feat(SCHED_CACHE) && p->preferred_llc != -1 && ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ p->preferred_llc != -1 && + llc_id(env->src_cpu) == p->preferred_llc) + break; + #endif +@@ -10947,7 +10948,7 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *child = env->sd->child; + int llc; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return false; + + if (env->sd->flags & SD_SHARE_LLC) +@@ -11058,7 +11059,8 @@ static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *sd = env->sd->child; + struct sched_domain_shared *sd_share; + +- if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE || ++ !sched_feat(SCHED_CACHE_LB)) + return; + + /* only care the sched domain that spans 1 LLC */ +@@ -11120,7 +11122,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + *sg_overutilized = 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE)) { ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB)) { + int j; + + for (j = 0; j < max_llcs; ++j) +@@ -12406,7 +12408,7 @@ imbalanced_active_balance(struct lb_env *env) + static inline bool + break_llc_locality(struct lb_env *env) + { +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return 0; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) +@@ -12908,7 +12910,7 @@ static int active_load_balance_cpu_stop(void *data) + #ifdef CONFIG_SCHED_CACHE + int llc = llc_idx(target_cpu); + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + goto out_unlock; + + if (llc < 0) +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index d2af7bfd36bf..11dbd74cd365 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -88,6 +88,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + SCHED_FEAT(SIS_UTIL, true) + + SCHED_FEAT(SCHED_CACHE, true) ++SCHED_FEAT(SCHED_CACHE_LB, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-20-28-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-20-28-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch new file mode 100644 index 0000000..1cf1911 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-20-28-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch @@ -0,0 +1,145 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.9]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7A4A02749D5 + for ; Sat, 9 Aug 2025 05:13:44 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.9 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716425; cv=none; b=VIDdU39lmJxSYrHB9S4l1mh0boKZZki9BLORb9qgZo0xYyJtYcfX+m2EFuLna+wvMqdM3b9jRoxWjfYX98zrEOAuWNoH8zCG6FpNq8YiKHKq1NGdKxQHVgzOOiLG5uy1qO9t7Wa4goaOHrkQI+arcKezllgcvY4ibca99xDzFQA= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716425; c=relaxed/simple; + bh=YdhYlPfOzBX2JmU404dp2rIw4uGKUk0/reO/wjvEcdA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=AeIzKWJSFg2pTtYt5o5Vz8pc3/BIn9Yhzjs5gEx3/9nVXOR8/67BRRpJDX5hibfRtV6EY8e/fAEY/Zxa+8RbAe6m0nXD1z5eRZPQvlghGCpOjLX0XPm2maXH0OgysqVx3VFMCsFbO52VsoQD2p85NC594bUHYDFoDa6p7TKhhlA= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=RBrobR1i; arc=none smtp.client-ip=198.175.65.9 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="RBrobR1i" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716424; x=1786252424; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=YdhYlPfOzBX2JmU404dp2rIw4uGKUk0/reO/wjvEcdA=; + b=RBrobR1iwxBeJZoLKXMRg5MQx8WGXo8muZpXStdx9iyAK264sfhIG/qu + joL4lEprzIBRw8X03Yy5P97hTPxpboN85pYrMB8bQ6FHQo9ybNckkrA6U + Fm88MiZ42tIDdBFytUX2SY9R5LIWL4D6l7uxGHF/7t9G2tSrGXQZrLdmV + kEXY08yQyIUpqXqtLN/Fts7veKj7eYDCqo12PTEZYQ6XGxrwnt4HlyW3a + b9OPJyXpEcpsigfeiakBxz87spvYkl6NsSdiBGHP8WsVh/XlkJ8G7/XfP + bZjJOX3ekGPt6NQIeusuWhHKU/YI3AgkFy7IeRU2nggpBNO+zFRf78cN1 + w==; +X-CSE-ConnectionGUID: 6JJ7KzVjTq2D3T0a9H1P9w== +X-CSE-MsgGUID: pH0Ms/VYRfa3u1EQ9lfZ0A== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="79620375" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="79620375" +Received: from fmviesa009.fm.intel.com ([10.60.135.149]) + by orvoesa101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:44 -0700 +X-CSE-ConnectionGUID: p2RcioaZTnKrvWCzoclFkA== +X-CSE-MsgGUID: 7ArRWLtgSgqBcLoFA2nIvw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165844031" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa009.fm.intel.com with ESMTP; 08 Aug 2025 
22:13:38 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 20/28] sched: Introduce SCHED_CACHE_WAKE to control LLC aggregation on wake up +Date: Sat, 9 Aug 2025 13:07:35 +0800 +Message-Id: <144358df73cbb8c7d24f757fc40cb068be603bed.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Introduce SCHED_CACHE_WAKE feature to enable or disable cache-aware +wake up. Disable this feature by default because cache-aware wakeup +is overly aggressive in stacking wakees of the same process on the +same LLC, if they are frequently woken up. + +The wake ups can be much more frequent than load balances, adding +much overhead when load balance alone for LLC aggregation is sufficient. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 6 +++++- + kernel/sched/features.h | 1 + + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 9843d4e1d84f..6e61f9e1f628 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9063,7 +9063,7 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_WAKE)) + return prev_cpu; + + if (!mm || p->nr_cpus_allowed == 1) +@@ -9076,6 +9076,10 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + if (cpus_share_cache(cpu, prev_cpu)) + return prev_cpu; + ++ if (_get_migrate_hint(prev_cpu, cpu, ++ task_util(p), true) == mig_forbid) ++ return prev_cpu; ++ + if (static_branch_likely(&sched_numa_balancing) && + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { + /* +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 11dbd74cd365..44b408cf0dd4 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -89,6 +89,7 @@ SCHED_FEAT(SIS_UTIL, true) + + SCHED_FEAT(SCHED_CACHE, true) + SCHED_FEAT(SCHED_CACHE_LB, true) ++SCHED_FEAT(SCHED_CACHE_WAKE, false) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-21-28-sched-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-21-28-sched-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch new file mode 100644 index 0000000..263a69a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-21-28-sched-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch @@ -0,0 +1,299 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.9]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2D7E92472BA + for ; Sat, 9 Aug 2025 05:13:56 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.9 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716438; cv=none; b=oP/KP8CT9PyG8F223CbqjTnw5fAnuBqC4qrJsRtJ4FhmBZN2qpqY1eTd8fjtB3IjDADG3eDf23ECJS0GGe21q95Lgbd8aoZ2d1dhrh9ekTQMUxybtv0qdhdWt0awWabB+mPE502GRGKAWZIXsyFrdxhb/zbs1b9+pJ2xhI3lh0U= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716438; c=relaxed/simple; + bh=2aEa4h1fI5J4/1AEUoGjg0eQXawj82V6LxqSqdQWKjc=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=eLlyXwmAlVBfL8ugZpWuJv/cYDUhBSgXTmsJA1VoG4McRZM9A+e9EyJW9FEw5BbRwioynILfrxFhAe4zM1FRKim6rhs5NDIaRMWKq7+xJ+DXnEZ4q4gjxkB8JnCe7RI2fcforxvMACn4NmxTFkZ0407GDkQ+uKE5393PmwyageE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=jyvdr92q; arc=none smtp.client-ip=198.175.65.9 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="jyvdr92q" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716436; x=1786252436; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=2aEa4h1fI5J4/1AEUoGjg0eQXawj82V6LxqSqdQWKjc=; + b=jyvdr92qHqcE9M3jPHv+fDdx/YNY3xnAId3qtw3C2QzAwffEVL4ZQTBS + CePnqJIrCnZ2R6zfimwVCnLdiYu5OvVZ5ChHHhlNO+ZL+HID3ktCe7O2w + 48m583KvcVHXVXNpkpIfS7DLrauwwN0nhjxTOtWhNNA6tX0C3umtnps9k + I2871JFWkVEb0mXhuELAw1LEqE+pk38njQNVgLdHwoT5vvi9CMGrEAr/N + RU5gBb080A9sLYEGTtpnWCaPKZUTFqtKi9ostEazBVphHyMbIcwyMjLxO + F+hAW/C7GT5rTxqDXPHnZ6JnGmHGoIMYcwTWKxDujBKbcjIHs9lsLI5Ay + w==; +X-CSE-ConnectionGUID: frbwZBVeQsyYvImCyN450Q== +X-CSE-MsgGUID: O9BLeD00TFuab4r38fYeEg== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="79620417" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="79620417" +Received: from fmviesa009.fm.intel.com ([10.60.135.149]) + by orvoesa101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:55 -0700 +X-CSE-ConnectionGUID: 57mN5EeXQnOL1PlRN1msdQ== +X-CSE-MsgGUID: ZdnU+8XKSQuiZvpeY/fMpw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165844039" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa009.fm.intel.com with ESMTP; 08 Aug 2025 
22:13:50 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 21/28] sched: Introduce a static key to enable cache aware only for multi LLCs +Date: Sat, 9 Aug 2025 13:07:47 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If there are more than one LLCs in the node, the +cache aware scheduling is enabled. Otherwise, the +cache aware scheduling is disabled. + +The definition of multiple LLCs in a node is that +every node in the system should have more than one +LLC. For example, if node0, node1, and node2 each +have 4 LLCs, while node3 has 1 LLC (possibly due +to CPU hotplug), cache-aware scheduling should be +disabled. + +Suggested-by: Libo Chen +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 31 ++++++++++++++++++++++--------- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 22 ++++++++++++++++++++-- + 3 files changed, 43 insertions(+), 11 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6e61f9e1f628..194ec594561b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,6 +1175,8 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + ++DEFINE_STATIC_KEY_FALSE(sched_cache_present); ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -1318,7 +1320,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + unsigned long epoch; + int mm_sched_llc = -1; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || ++ !static_branch_likely(&sched_cache_present)) + return; + + if (p->sched_class != &fair_sched_class) +@@ -1366,7 +1369,8 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || ++ !static_branch_likely(&sched_cache_present)) + return; + + if (!mm || !mm->pcpu_sched) +@@ -9063,7 +9067,8 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_WAKE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_WAKE) || ++ !static_branch_likely(&sched_cache_present)) + return prev_cpu; + + if (!mm || p->nr_cpus_allowed == 1) +@@ -10024,6 +10029,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + + #ifdef CONFIG_SCHED_CACHE + if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ static_branch_likely(&sched_cache_present) && + get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) + return 0; + #endif +@@ -10109,7 +10115,8 @@ static struct list_head + LIST_HEAD(no_pref_llc); + LIST_HEAD(pref_other_llc); + +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) ++ if (!sched_feat(SCHED_CACHE) || 
!sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + return tasks; + + if (cpus_share_cache(env->dst_cpu, env->src_cpu)) +@@ -10295,6 +10302,7 @@ static int detach_tasks(struct lb_env *env) + * they are tasks that prefer the current LLC. + */ + if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ static_branch_likely(&sched_cache_present) && + p->preferred_llc != -1 && + llc_id(env->src_cpu) == p->preferred_llc) + break; +@@ -10952,7 +10960,8 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *child = env->sd->child; + int llc; + +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + return false; + + if (env->sd->flags & SD_SHARE_LLC) +@@ -11064,7 +11073,8 @@ static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain_shared *sd_share; + + if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE || +- !sched_feat(SCHED_CACHE_LB)) ++ !sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + return; + + /* only care the sched domain that spans 1 LLC */ +@@ -11126,7 +11136,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, + *sg_overutilized = 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB)) { ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ static_branch_likely(&sched_cache_present)) { + int j; + + for (j = 0; j < max_llcs; ++j) +@@ -12412,7 +12423,8 @@ imbalanced_active_balance(struct lb_env *env) + static inline bool + break_llc_locality(struct lb_env *env) + { +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + return 0; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) +@@ -12914,7 +12926,8 @@ static int active_load_balance_cpu_stop(void *data) + #ifdef CONFIG_SCHED_CACHE + int llc = llc_idx(target_cpu); + +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + goto out_unlock; + + if (llc < 0) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 4464b92767ad..3e60618a88e9 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2857,6 +2857,7 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int sysctl_llc_aggr_cap; + extern unsigned int sysctl_llc_aggr_imb; ++extern struct static_key_false sched_cache_present; + #endif + + #ifdef CONFIG_SCHED_HRTICK +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 91a2b7f65fee..8483c02b4d28 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -2476,6 +2476,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + int i, ret = -ENOMEM; + bool has_asym = false; + bool has_cluster = false; ++ bool llc_has_parent_sd = false; ++ unsigned int multi_llcs_node = 1; + + #ifdef CONFIG_SCHED_CACHE + if (max_llcs < 0) { +@@ -2545,6 +2547,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + struct sched_domain __rcu *top_p; + unsigned int nr_llcs; + ++ if (!llc_has_parent_sd) ++ llc_has_parent_sd = true; + /* + * For a single LLC per node, allow an + * imbalance up to 12.5% of the 
node. This is +@@ -2566,10 +2570,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + * between LLCs and memory channels. + */ + nr_llcs = sd->span_weight / child->span_weight; +- if (nr_llcs == 1) ++ /* ++ * iff all nodes have multiple LLCs, the ++ * multi_llcs_node will be set to 1. If ++ * there is at least 1 node having 1 single ++ * LLC, the multi_llcs_node remains 0. ++ */ ++ if (nr_llcs == 1) { + imb = sd->span_weight >> 3; +- else ++ multi_llcs_node = 0; ++ } else { + imb = nr_llcs; ++ multi_llcs_node &= 1; ++ } + imb = max(1U, imb); + sd->imb_numa_nr = imb; + +@@ -2617,6 +2630,11 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + ++#ifdef CONFIG_SCHED_CACHE ++ if (llc_has_parent_sd && multi_llcs_node && !sched_asym_cpucap_active()) ++ static_branch_inc_cpuslocked(&sched_cache_present); ++#endif ++ + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-22-28-sched-Turn-EPOCH_PERIOD-and-EPOCH_OLD-into-tunnable-debugfs.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-22-28-sched-Turn-EPOCH_PERIOD-and-EPOCH_OLD-into-tunnable-debugfs.patch new file mode 100644 index 0000000..9235bda --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-22-28-sched-Turn-EPOCH_PERIOD-and-EPOCH_OLD-into-tunnable-debugfs.patch @@ -0,0 +1,164 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 72A7D2749D5 + for ; Sat, 9 Aug 2025 05:14:08 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716450; cv=none; b=IeQpUvuEInOqzZCCDB3S2FFJC3zSp4/XkEtNAu2D98UQxwsXgo2BbwLtaxH6iJwbi4gN7aV60Aez3K8ydwiJFAzlJgHRf/3+aORKqKYd4JzlvthQfT5xmROL82OUpU66n+lqag40QZb3zB8TfPDePjvzD5oeSB7VRw6J3Cixx+g= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716450; c=relaxed/simple; + bh=aXXr09emE5m1BA8+F+9P+Qieum2q8rFMJHj2OrnZISQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=S5MR0P7NeiK39j0s1bhGmWRhNoCo1BGGAp0DbGAVUu/cKCksVWgSVTBQqiegxZ+gsioGdqPIL8dWWDic13tfdX1SnoZ6qXprsQhZj7vCqF2cTb/xO8jcfhLxZaWGG0ZCiaJ0gxuXgS+7OA9TZAf4d9OKuUhX4CnNCW42X+VS4FE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=GZsq9eJL; arc=none smtp.client-ip=192.198.163.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="GZsq9eJL" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716449; x=1786252449; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=aXXr09emE5m1BA8+F+9P+Qieum2q8rFMJHj2OrnZISQ=; + 
b=GZsq9eJL5DGjYunKJTsof+Ln2cKfOk2BGwjzrC+sKFMATJMkyxO99PMo + 2Oyl5uVHYhRF8Tm0YmjjPIUI3d++yQ67YwJyf8GQHF8cYeMonziUJgHhH + kWHsp/STcLEVX42oVsCvQJlHA6eoqh5JKSyBBe3w1N12e5vNle7MdQRuI + 9sPdUfMBH0dbovuNFtw5OfBzc2eoiu4kiBY1XCFzj5eShFF03nf9Tv2B/ + ClF5YQoCu+HTwDDVvM9QKGz82gKXl8kYElV4byqv5tvHmI7Psovf6yI1d + zi0XGuLMMAQ/QyWVmk7U53AdlwCTHAyvtt6E7DmP/8gc3IF+ydRquzbUa + g==; +X-CSE-ConnectionGUID: Q9jwphPASBiXZUHMOabB2g== +X-CSE-MsgGUID: 2ZVJmYQUSISFufuXvDLTyg== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60860025" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60860025" +Received: from orviesa003.jf.intel.com ([10.64.159.143]) + by fmvoesa106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:08 -0700 +X-CSE-ConnectionGUID: ilrvcjmWQsiRa7CRb1DYVw== +X-CSE-MsgGUID: Uh2zL5Q0RAa3BSoaBdHAzw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169693092" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa003.jf.intel.com with ESMTP; 08 Aug 2025 22:14:02 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 22/28] sched: Turn EPOCH_PERIOD and EPOCH_OLD into tunnable debugfs +Date: Sat, 9 Aug 2025 13:07:59 +0800 +Message-Id: <79c8fdcf7e875617935cfaba2ea1f2c2ae5ce62c.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Convert EPOCH_PERIOD and EPOCH_OLD into tunable debugfs +entries. Users can adjust the decay rate as needed. +By default, occupancy decays by half every 10 ms. 
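+
+As a rough illustration of the decay these two knobs control (a user-space
+sketch only, assuming __shr_u64() in the hunk below halves the accumulated
+runtime once per elapsed epoch, as the changelog above describes; the function
+name here is illustrative, not a kernel API):
+
+	/* occupancy decays by half for every elapsed llc_period */
+	static unsigned long long decay_runtime(unsigned long long runtime,
+						unsigned long delta,
+						unsigned int llc_period)
+	{
+		/* number of epochs covered by delta, rounded up */
+		unsigned long n = (delta + llc_period - 1) / llc_period;
+
+		if (n >= 64)
+			return 0;	/* fully decayed */
+		return runtime >> n;	/* halved once per epoch */
+	}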
+ +Suggested-by: Shrikanth Hegde +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 2 ++ + kernel/sched/fair.c | 9 ++++++--- + kernel/sched/sched.h | 2 ++ + 3 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 682fd91a42a0..7a9ec03704b9 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -535,6 +535,8 @@ static __init int sched_init_debug(void) + #ifdef CONFIG_SCHED_CACHE + debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap); + debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb); ++ debugfs_create_u32("llc_period", 0644, debugfs_sched, &sysctl_llc_period); ++ debugfs_create_u32("llc_old", 0644, debugfs_sched, &sysctl_llc_old); + #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 194ec594561b..64f757ad39fc 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,6 +1175,9 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + ++__read_mostly unsigned int sysctl_llc_period = EPOCH_PERIOD; ++__read_mostly unsigned int sysctl_llc_old = EPOCH_OLD; ++ + DEFINE_STATIC_KEY_FALSE(sched_cache_present); + + static int llc_id(int cpu) +@@ -1283,9 +1286,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { +- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ n = (delta + sysctl_llc_period - 1) / sysctl_llc_period; + rq->cpu_epoch += n; +- rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ rq->cpu_epoch_next += n * sysctl_llc_period; + __shr_u64(&rq->cpu_runtime, n); + } + +@@ -1346,7 +1349,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * has only 1 thread, invalidate + * it's preferred state. 
+ */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD || ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > sysctl_llc_old || + get_nr_threads(p) <= 1) { + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 3e60618a88e9..d752d64d4acd 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2858,6 +2858,8 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + extern unsigned int sysctl_llc_aggr_cap; + extern unsigned int sysctl_llc_aggr_imb; + extern struct static_key_false sched_cache_present; ++extern unsigned int sysctl_llc_period; ++extern unsigned int sysctl_llc_old; + #endif + + #ifdef CONFIG_SCHED_HRTICK +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-23-28-sched-Scan-a-task-s-preferred-node-for-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-23-28-sched-Scan-a-task-s-preferred-node-for-preferred-LLC.patch new file mode 100644 index 0000000..962ea38 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-23-28-sched-Scan-a-task-s-preferred-node-for-preferred-LLC.patch @@ -0,0 +1,171 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 50FEB275872 + for ; Sat, 9 Aug 2025 05:14:20 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716461; cv=none; b=IojQ3o0319gniGMl43HTVAIglRucNSyn6f7mIZ2sA6nGcNZlUGEZWY6057tDsNZ4vk1O+nB32WSiImkG1cA4P3bSQwXMpAQf17p3nQR/jrpVdHP7V0+mJDJgG2Sf7l8Ti7krqUFqfX2pqdYZqFCdot+k/yWtmOk0EJDIuVmMV9c= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716461; c=relaxed/simple; + bh=vn8SQoDAOd07cpCKj326vCb8D/qCqZCs9BwVgi4KoVg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=PMXTmQv0twXoDtxt8UL0eicbBf9o7Tfv//e1wCHtE99sZhtB/nIU8wVaymmbbfxF6XMzVWor4WdG/qxhSD/wWJ4vwz9cuiTjtjsAycEcLvPhT9aUF3kUVc4kwE6SAX+4OTFtFFHuRXxnq1R/3zLjD9SIRWfAHwReiV3b07cgCmM= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=nkaLairD; arc=none smtp.client-ip=192.198.163.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="nkaLairD" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716461; x=1786252461; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=vn8SQoDAOd07cpCKj326vCb8D/qCqZCs9BwVgi4KoVg=; + b=nkaLairDDjkysNs4UHTZ5xKHewoxOhkUNj9VyNX4tbdn4A+qnsAVwMle + R5heWKuCY8Flip8hzeFiNi/CFABQw9zu7obpiMTotWoXuYrKrGBh3HoZh + lmLG5GkRobIJvrI3ad/N+LP8GAWOX5LCCD9ciXh9NpYENpuy7gVq79Rno + lOPq4XCXPVEuiMBh+0Se3GxDjUG9K2DZWlyzewIOPzwn2XZvGRXdUZ3Ot + w3MIJHSIsVA80TETVQPqTJE71E/W3dHyU/Fc9CdibOzm0oeRAQl7UFWIV + RWX9ArTsi+Tp+Wc1c8CtMey1/OwsGiNy5hIgFduA1bpZq8LLDwe4VQMBf + g==; +X-CSE-ConnectionGUID: Uof0EABPSOGTbrf//bNl/A== 
+X-CSE-MsgGUID: G6Abi0pKRMKI/T1SAuVRiQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60860044" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60860044" +Received: from orviesa003.jf.intel.com ([10.64.159.143]) + by fmvoesa106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:20 -0700 +X-CSE-ConnectionGUID: Y7bYuOZeSdOfQBAqGM3CtQ== +X-CSE-MsgGUID: edOp5vWRTYOveGm/1PiD4g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169693142" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa003.jf.intel.com with ESMTP; 08 Aug 2025 22:14:14 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 23/28] sched: Scan a task's preferred node for preferred LLC +Date: Sat, 9 Aug 2025 13:08:11 +0800 +Message-Id: <178bf43d7cbc9b2c9aea408dd56b87391067df37.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +When sched_cache is enabled, fully scanning all online +CPUs to find the hottest one is very costly. As a first +step, limit the scan to only the CPUs within the task's +preferred node. If the node containing the task's preferred +LLC is not in the CPU scan mask, add it. Additionally, if +the node where the current task is running is not in the +scan mask, add it too. + +Suggested-by: Jianyong Wu +Suggested-by: Shrikanth Hegde +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++--- + 1 file changed, 33 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 64f757ad39fc..420d3a080990 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1390,13 +1390,36 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + } + } + ++static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, ++ int pref_nid, int curr_cpu) ++{ ++#ifdef CONFIG_NUMA_BALANCING ++ /* first honor the task's preferred node */ ++ if (pref_nid != NUMA_NO_NODE) ++ cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); ++#endif ++ ++ /* secondly honor the task's cache CPU if it is not included */ ++ if (cache_cpu != -1 && !cpumask_test_cpu(cache_cpu, cpus)) ++ cpumask_or(cpus, cpus, ++ cpumask_of_node(cpu_to_node(cache_cpu))); ++ ++ /* ++ * Thirdly honor the task's current running node ++ * as the last resort. 
++ */ ++ if (!cpumask_test_cpu(curr_cpu, cpus)) ++ cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); ++} ++ + static void __no_profile task_cache_work(struct callback_head *work) + { + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long last_m_a_occ = 0; +- int cpu, m_a_cpu = -1; ++ int cpu, m_a_cpu = -1, cache_cpu, ++ pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(); + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1406,11 +1429,18 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + +- if (!alloc_cpumask_var(&cpus, GFP_KERNEL)) ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + ++ cache_cpu = mm->mm_sched_cpu; ++#ifdef CONFIG_NUMA_BALANCING ++ if (static_branch_likely(&sched_numa_balancing)) ++ pref_nid = p->numa_preferred_nid; ++#endif ++ + scoped_guard (cpus_read_lock) { +- cpumask_copy(cpus, cpu_online_mask); ++ get_scan_cpumasks(cpus, cache_cpu, ++ pref_nid, curr_cpu); + + for_each_cpu(cpu, cpus) { + /* XXX sched_cluster_active */ +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-24-28-sched-Record-average-number-of-runninhg-tasks-per-process.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-24-28-sched-Record-average-number-of-runninhg-tasks-per-process.patch new file mode 100644 index 0000000..fd94172 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-24-28-sched-Record-average-number-of-runninhg-tasks-per-process.patch @@ -0,0 +1,169 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 057372472BA + for ; Sat, 9 Aug 2025 05:14:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716474; cv=none; b=VIfN6Nj+OmuTvjcAQC5ue6+EJDcsfkI76dqTW3x38qtgbZ4hqzLAXpZlXfM9DlKB8dKo57i5wMslwoWbzJnvNz1ykOFHxPfoi2S93m+jstGBBseKp1ztbQwQ2K61GTrnAqbJdZm7pnjXyKLjWHy5vyrtWh/xw7PspOvnpd6AKpw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716474; c=relaxed/simple; + bh=FSWaU6wjxtUO8jXpCphgNEzz4lxAHBHSTH4IiVUyBUk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=XjsKAevWGzT+p1INgbZoPieajKhnft6CnQg6Bbk4u1Z+7t5XRC0bswqFYNyS3V/ZFpoWqhCw4RYtVBZI+9IiO7g1Q46JsT7+09UPlrjZPLkzbNiFJI/DRRGFHTenkxTcke+8xYuOm3RbKap9vjHJ6muwNQMS05rVf9nebWSot00= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=gmFy4oU1; arc=none smtp.client-ip=192.198.163.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="gmFy4oU1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716473; x=1786252473; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=FSWaU6wjxtUO8jXpCphgNEzz4lxAHBHSTH4IiVUyBUk=; + b=gmFy4oU1sZtwQyMYwHB8SIoIYYE5hI8m7TFKqp6zqiVd8rnbxM1bMhLC + 74RYM+I/O6xKXfeu/UVzPgl1+lq9og33Njeix9LSwjF6dc54BAfz6kZpm + XSL2l8zGvHKS024WUDTZLsjKB3ozB4WcNBoQCDO/MuFQPfhhiy+fkzGjd + GRjfmZ2nZEDzv9f+jC+e5CY8l12nBabKfqFG4La0LMDW1GFk7YsYd275+ + ppat6y66psYB1mii4x1wz+0D3WFtxDecRb9O1Al2JUDYi696b9W2OMbl/ + S6cwP0rZ5li5OyXKhxjLqGgxHrmb3hYgWbTMhafNcMH8p7GmxPK8BizEQ + Q==; +X-CSE-ConnectionGUID: hfmQ09kFSka1PTGb3z++tA== +X-CSE-MsgGUID: jqX5qpmwT2awTicWyNB8Gw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60860082" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60860082" +Received: from orviesa003.jf.intel.com ([10.64.159.143]) + by fmvoesa106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:32 -0700 +X-CSE-ConnectionGUID: UKKGkDsGRSSXIWvNvv1UIw== +X-CSE-MsgGUID: AYQBgKAjRwGDzbPfQLudxQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169693165" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa003.jf.intel.com with ESMTP; 08 Aug 2025 22:14:26 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 24/28] sched: Record average number of runninhg tasks per process +Date: Sat, 9 Aug 2025 13:08:23 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Performance regression was found when running hackbench +with many threads per process(the fd number is high). +To avoid this regression, process having a large number +of active threads should be excluded from cache aware +scheduling. + +With sched_cache enabled, record the number of active threads within +the process. This calculation occurs in the periodic task_cache_work(): +when iterating over the CPUs, check the currently running task on that +CPU; if the running task belongs to the same process as the task that +launches task_cache_work(), increment the active thread count by 1. + +If the number exceeds the number of CPUs in the preferred LLC, +sched_cache is prevented from aggregating too many threads in one +LLC domain. 
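+
+A note on the averaging used here: update_avg() (kernel/sched/sched.h) is, in
+current mainline, a simple moving average that steps 1/8 of the way toward
+each new sample, so nr_running_avg tracks the sampled thread count smoothly
+instead of jumping on every scan. A minimal user-space sketch of that
+behaviour (illustrative only):
+
+	static inline void update_avg(unsigned long long *avg,
+				      unsigned long long sample)
+	{
+		long long diff = sample - *avg;
+
+		*avg += diff / 8;	/* move 1/8 of the way toward the sample */
+	}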
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +--- + include/linux/mm_types.h | 1 + + kernel/sched/fair.c | 14 ++++++++++++-- + 2 files changed, 13 insertions(+), 2 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 41a598a44361..13b715357ccb 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1033,6 +1033,7 @@ struct mm_struct { + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; ++ u64 nr_running_avg; + #endif + + #ifdef CONFIG_MMU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 420d3a080990..2577b4225c3f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1414,12 +1414,13 @@ static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, + + static void __no_profile task_cache_work(struct callback_head *work) + { +- struct task_struct *p = current; ++ struct task_struct *p = current, *cur; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long last_m_a_occ = 0; + int cpu, m_a_cpu = -1, cache_cpu, +- pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(); ++ pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(), ++ nr_running = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1460,6 +1461,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_cpu = i; + } + nr++; ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(cpu_rq(i)->curr); ++ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && ++ cur->mm == mm) ++ nr_running++; ++ rcu_read_unlock(); ++ + trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", + per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); + } +@@ -1489,6 +1498,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + mm->mm_sched_cpu = m_a_cpu; + } + ++ update_avg(&mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-25-28-sched-Skip-cache-aware-scheduling-if-the-process-has-many-active-threads.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-25-28-sched-Skip-cache-aware-scheduling-if-the-process-has-many-active-threads.patch new file mode 100644 index 0000000..5d4f16d --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-25-28-sched-Skip-cache-aware-scheduling-if-the-process-has-many-active-threads.patch @@ -0,0 +1,160 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id D7B93262FE7 + for ; Sat, 9 Aug 2025 05:14:45 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716487; cv=none; b=SwLkLgdtnhX6+OqLa78yMZ0B4CsWLaf64/W+OI5unQJxwjCB9iC7AOkHjHIaNJ3elBvx03DMvZl1I9GsmU+4HfwjpJKe6RPlgB+vcyUgYbcDZyIBhWoXYSKefXUoVChQUkXZnD99yesLvH9Ng14G5w2CQagDetRVGrHCe4izAMY= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716487; c=relaxed/simple; + bh=NXUgKwlBD+LbS7BQoNhwYmUMMNvH06C+9x5ABL9KQvU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=hLGB1Xa+0uSZNB/dGK1ZYaVry91TulpBPgnDTPCsKPDTIPOxhNaZZ9Cuzpbo38IwKkY5uJgzEZ1uUGrD/s/RAk4WaWJnoklk4db0h0lc6DTb3DdY///Tx0rE+4HGY8/VgFkFtuHBrA6i2cPS9Gpq8iWYdaB6+MhIJDjLW2gw96I= +ARC-Authentication-Results:i=1; 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=JtFt/v4t; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="JtFt/v4t" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716485; x=1786252485; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=NXUgKwlBD+LbS7BQoNhwYmUMMNvH06C+9x5ABL9KQvU=; + b=JtFt/v4t08KznPLSFaAfIMnOlZCZo8kJRJT+HfIrOEKXEd/dqH+lrk/a + pIz5BUtjCYdr/a4c7pZ1o1bWZFwpHGC/M6S2vru19uOo+9h+a6+cDG4JL + yM5/aSH0L5HRBrhBX4JQAp/3MD69CoaWhFe+GdocmtVOMYwP+erObftt/ + 1RQRwVjl1GoMs0U6JSVWCb5Sk5EwLmq/bSRzlsfuwbMavl7fO/aSz0Urq + XtvO5DV6xrhsk1Y77keeRc1mlMUGjry5fYahbKbwyaOxyKFtrNAFVv1mP + qRCo9H1Kh+G72Foi9f3RFJzd/ky65xGF3aC6FH86kLP/zxm2pRC6gsTTi + w==; +X-CSE-ConnectionGUID: IHbXoWNLTlyKIkz7HGkE0w== +X-CSE-MsgGUID: A4HbKv+ySb+99UGpcOUEYA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56259994" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56259994" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:45 -0700 +X-CSE-ConnectionGUID: tRxkh/5RR2W0QaPbtyXBXw== +X-CSE-MsgGUID: kEi0G4T2TKO4l03CeelQGA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165476161" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:14:39 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 25/28] sched: Skip cache aware scheduling if the process has many active threads +Date: Sat, 9 Aug 2025 13:08:36 +0800 +Message-Id: <463bc54a283c1b908ea286ce67f301e2d1d39ea1.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If the number of active threads within the process +exceeds the number of Cores(divided by SMTs number) +in the LLC, do not enable cache-aware scheduling. +This is because there is a risk of cache contention +within the preferred LLC when too many threads are +present. 
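+
+Since sd_llc_size counts every CPU in the LLC (SMT siblings included), the
+exceed_llc_nr() check in the hunk below compares nr_running_avg * smt_nr
+against sd_llc_size, which is the same as comparing the average thread count
+against the number of physical cores in the LLC. A standalone sketch of the
+same condition (parameter names are illustrative):
+
+	/* true if the process would oversubscribe the cores of one LLC */
+	static bool exceeds_llc_cores(unsigned long long nr_running_avg,
+				      unsigned int llc_cpus,
+				      unsigned int smt_per_core)
+	{
+		/* llc_cpus / smt_per_core == physical cores in the LLC */
+		return nr_running_avg * smt_per_core > llc_cpus;
+	}
+
+	/*
+	 * e.g. 8 cores x 2 SMT = 16 CPUs: an average of 9 threads (9*2 > 16)
+	 * skips aggregation, an average of 8 (8*2 == 16) does not.
+	 */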
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2577b4225c3f..4bf794f170cf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,18 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_nr(struct mm_struct *mm, int cpu) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = cpumask_weight(cpu_smt_mask(cpu)); ++#endif ++ ++ return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { + int pref_llc; +@@ -1350,7 +1362,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * it's preferred state. + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > sysctl_llc_old || +- get_nr_threads(p) <= 1) { ++ get_nr_threads(p) <= 1 || ++ exceed_llc_nr(mm, cpu_of(rq))) { + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; + } +@@ -1430,6 +1443,11 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + ++ if (get_nr_threads(p) <= 1) { ++ mm->mm_sched_cpu = -1; ++ return; ++ } ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9095,6 +9113,10 @@ static __maybe_unused enum llc_mig_hint get_migrate_hint(int src_cpu, int dst_cp + if (cpu < 0) + return mig_allow; + ++ /* skip cache aware load balance for single/too many threads */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ return mig_allow; ++ + if (cpus_share_cache(dst_cpu, cpu)) + return _get_migrate_hint(src_cpu, dst_cpu, + task_util(p), true); +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-26-28-sched-Do-not-enable-cache-aware-scheduling-for-process-with-large-RSS.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-26-28-sched-Do-not-enable-cache-aware-scheduling-for-process-with-large-RSS.patch new file mode 100644 index 0000000..186a2a4 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-26-28-sched-Do-not-enable-cache-aware-scheduling-for-process-with-large-RSS.patch @@ -0,0 +1,196 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C0E67221540 + for ; Sat, 9 Aug 2025 05:14:58 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716500; cv=none; b=MmUb4c5SUS1z0wOntqBT1lBWA98RaJXvOGOJgyKpL3css2V1PB3tSyqGl6uL4LpYMOyo+rKX3+Or66/w3kvy4IrnK/1zKzbYlsK4uG0lHZdaI/ylANl1HNqUIDGNZvHQU/rlxzJ01GSoxw4kvO56Gsq+Q3Dt9ImEFj8vhMltr7o= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716500; c=relaxed/simple; + bh=ihWK71wp/q5X7Akw0a3NfcmXIvV26+nk2ItbYopgMBI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=jLyq+PE48ZqCxRzNfCECXbgKXorBAOraTW1R5D8fjI5Zh6EGFB8ZxO9LxYUE8qGvLCGMlNFmK1+oj1nTLMrg0x4R6BkGXURfRaIdO4gl1uU8D++Mr+VA477bh1glz9u1Ll6+Hks8Jtf0M6xJo+lTeh2jLF0wW7on+hBzGyeTVsU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass 
smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=eixROvUd; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="eixROvUd" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716498; x=1786252498; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ihWK71wp/q5X7Akw0a3NfcmXIvV26+nk2ItbYopgMBI=; + b=eixROvUdtZ+jX6c9PUzsFkzZYy58X17+XSMG4TAKB4dMmEuzDFKNJS5/ + y6mSE9FUJZqVr6Z/MJgBg/rnrSxKx4WNtLRYRaKlxeoov9FqlTgFQ0cm3 + xrL/E9j5rOidep+PGoL8jF0Vi5sxq3zlPOp19TXKYmYCBYSnGBQNe2AtY + mgT3vaD7Elxg79E7NgAUMSiS4MZSj26K4v5ujKu8dsw3shTBxA6CvPmPO + rVrBtoWLK3XI3ZTP4tki0uiAJIjOH7fmVd/U18FdmdONY3ZWGGHpTVNYs + SDtMc5zQFr8yS7suux1xYdFcGTMSbKUOlN8esjYEKijdSJZB/ZI2rOcmc + Q==; +X-CSE-ConnectionGUID: 2OIbjhYrRs6bZ6T4QRg5pg== +X-CSE-MsgGUID: 1Tg83zzKRiy+BCTOQqeOJA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56260022" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56260022" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:58 -0700 +X-CSE-ConnectionGUID: iiCU+no/RZqZ47l6ZSVsuQ== +X-CSE-MsgGUID: gBZUGA4RTmG67FOC847xZA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165476169" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:14:52 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 26/28] sched: Do not enable cache aware scheduling for process with large RSS +Date: Sat, 9 Aug 2025 13:08:49 +0800 +Message-Id: <881a665a94858d4fb6f13491f4dffe58c8fc3870.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +It has been reported that when running memory-intensive workloads +such as stream, sched_cache may saturate the memory bandwidth on +the preferred LLC. + +To prevent this from happening, evaluate the process's memory +footprint by checking the size of RSS (anonymous pages and shared +pages) and comparing it to the size of the LLC. If the former is +larger, skip cache-aware scheduling. This is because if tasks +do not actually share data, aggregating tasks with large RSS will +likely result in cache contention and performance depredation. + +However, in theory, RSS is not the same as memory footprint. +This is just an estimated approach to prevent over-aggregation. +The default behavior is to strictly compare the size of RSS with +the size of the LLC. 
The next patch will introduce a user-provided +hint to customize this comparison. + +Reported-by: K Prateek Nayak +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 47 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 44 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4bf794f170cf..cbda7dad1305 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,34 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) ++{ ++ struct cpu_cacheinfo *this_cpu_ci; ++ struct cacheinfo *l3_leaf; ++ unsigned long rss; ++ unsigned int llc; ++ ++ /* ++ * get_cpu_cacheinfo_level() can not be used ++ * because it requires the cpu_hotplug_lock ++ * to be held. Use get_cpu_cacheinfo() ++ * directly because the 'cpu' can not be ++ * offlined at the moment. ++ */ ++ this_cpu_ci = get_cpu_cacheinfo(cpu); ++ if (!this_cpu_ci->info_list || ++ this_cpu_ci->num_leaves < 3) ++ return true; ++ ++ l3_leaf = this_cpu_ci->info_list + 3; ++ llc = l3_leaf->size; ++ ++ rss = get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ ++ return (llc <= (rss * PAGE_SIZE)); ++} ++ + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { + int smt_nr = 1; +@@ -1363,7 +1391,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > sysctl_llc_old || + get_nr_threads(p) <= 1 || +- exceed_llc_nr(mm, cpu_of(rq))) { ++ exceed_llc_nr(mm, cpu_of(rq)) || ++ exceed_llc_capacity(mm, cpu_of(rq))) { + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; + } +@@ -1448,6 +1477,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + return; + } + ++ /* ++ * Do not check exceed_llc_nr() because ++ * the active number of threads needs to ++ * been updated anyway. ++ */ ++ if (exceed_llc_capacity(mm, curr_cpu)) ++ return; ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9113,8 +9150,12 @@ static __maybe_unused enum llc_mig_hint get_migrate_hint(int src_cpu, int dst_cp + if (cpu < 0) + return mig_allow; + +- /* skip cache aware load balance for single/too many threads */ +- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ /* ++ * skip cache aware load balance for single/too many threads ++ * and large footprint. 
++ */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) || ++ exceed_llc_capacity(mm, dst_cpu)) + return mig_allow; + + if (cpus_share_cache(dst_cpu, cpu)) +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-27-28-sched-Allow-the-user-space-to-tune-the-scale-factor-for-RSS-comparison.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-27-28-sched-Allow-the-user-space-to-tune-the-scale-factor-for-RSS-comparison.patch new file mode 100644 index 0000000..e91dd01 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-27-28-sched-Allow-the-user-space-to-tune-the-scale-factor-for-RSS-comparison.patch @@ -0,0 +1,303 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8BD69278E77 + for ; Sat, 9 Aug 2025 05:15:14 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716517; cv=none; b=f0wCf0A1e24ot1gDONm8873CUbeJO7p+XOYXyd1L81oXyHjWyDUgzMJcD0hJ3DF8nImLeld/DZRB4Rw1t1WEKTNhLr+PgIxdQt1pezZAV7PflwC9pScJMoIsibbNOHtKzaO++na+m07o/7UQdsk+sPTfO2f6+LpbJhepVHPoteU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716517; c=relaxed/simple; + bh=/Ba0fCfCtChaUyr8nA+Reo2+vDWT1X8nPBpOr+cNYjk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=EDkKLloCKeEh8IprYHQ5cLYC7XD5yrfwdWMtkoq85n7q8KXHTNKJtaTomDYe5XtQCLNBxnEyUumHNx4/C6VKj/drV2J40y00jxpuHKC2otW2Agu5fvbDIBaIndWVIczwxgu3StyAleOP2GSxyPvAaYACU8MlGinZXBKpz5KxMhg= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Y90Y97GH; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Y90Y97GH" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716515; x=1786252515; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=/Ba0fCfCtChaUyr8nA+Reo2+vDWT1X8nPBpOr+cNYjk=; + b=Y90Y97GHLKUMgwRE30WxTWa+nQqnGyCMmuH77cLq8i/vdaPuuN2Ktztv + zBg5w7QsE4Tap803U0WzbsFCGqUe1e5QS4yNcC38D0ELqu1BwsHy6z+jR + WhnmKOXM13ylYTNfpsosT8H/fTVU1o4HavW4jD2mb3Xd/w2lpl/NwsKv1 + lUdWhkLxUWSHTQXwZSwGQLULWx/qg/CpYON81o3vjH2gAxshxAoSNBtaB + P0r4Ex0Lc94pbcpxMXlN2Yvf/QybuyXM6p6DUPSf0Ju1um2BF6tKB2GEf + jmV33bJNWsB01mTnWfFZ55QI4h+P2NvGIgzeEkg3JsyzHbN7Qffq9u85f + Q==; +X-CSE-ConnectionGUID: ktJ3gmikQC2MxSJlFd01xQ== +X-CSE-MsgGUID: gbMwARUNRLGJ2ccZVMMRBw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56260030" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56260030" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:15:13 -0700 +X-CSE-ConnectionGUID: lbKBkeQYTzKHrxkrdXEGtA== +X-CSE-MsgGUID: NxMJcQ1jQD24A6f12YxIFA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; 
+ d="scan'208";a="165476192" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:15:08 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 27/28] sched: Allow the user space to tune the scale factor for RSS comparison +Date: Sat, 9 Aug 2025 13:09:02 +0800 +Message-Id: <81c197882b7c9f4325a5cb32f8a9d1e1fc900297.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +sched_cache compares the process's resident pages with +the size of the LLC to determine whether task aggregation +on the preferred LLC might cause cache contention. If the +former is larger than the latter, skip cache-aware task +aggregation. However, some workloads with large resident +pages have a small memory footprint; such workloads could +benefit from cache-aware scheduling. The kernel lacks a +efficient mechanism to track the task's memory footprint +(yes, we have resctrl, but it is for user-space query, +and not process scope), so it is up to userspace to pass +this hint to the kernel. + +Introduce /sys/kernel/debug/sched/sched_cache_ignore_rss +to control the extent to which users ignore the RSS +restriction. This value ranges from 0 to 100. A value of +0 means that the user disables the cache aware scheduling. +1 means if a process's RSS is larger than the LLC size, +cache-aware scheduling will be skipped. 100 means cache +aware scheduling is alwasy enabled regardless of RSS size. +N (between 1 and 100) means turn off cache aware scheduling +when RSS is greater than (N-1) * 256 * LLC size + +For example, suppose the L3 size is 32MB. If the +sysctl_sched_cache_ignore_rss is 1: When the RSS is larger +than 32MB, the process is regarded as exceeding the LLC capacity. +If the sysctl_sched_cache_ignore_rss is 99: When the RSS is +larger than 784GB, the process is regarded as exceeding the +LLC capacity(please refer to the code): +784GB = (1 + (99 - 1) * 256) * 32MB + +Additionally, the number of SMTs is also considered for +sysctl_sched_cache_aggr_cap; if there are many SMTs in the core, +sysctl_llc_aggr_cap will be reduced. This inhibits task aggregation +from cache-aware scheduling on systems with a high number of SMTs, +like Power 10 and Power 11. 
+ +Reported-by: K Prateek Nayak +Reported-by: Madadi Vineeth Reddy +Reported-by: Shrikanth Hegde +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/debug.c | 82 +++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/fair.c | 10 ++++-- + kernel/sched/sched.h | 3 +- + 3 files changed, 90 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 7a9ec03704b9..6676fc2a8c08 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -166,6 +166,83 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_CACHE ++#define SCHED_CACHE_CREATE_CONTROL(name, val) \ ++static int sysctl_sched_cache_##name = val; \ ++static ssize_t sched_cache_write_##name(struct file *filp, \ ++ const char __user *ubuf, \ ++ size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int percent; \ ++ if (cnt > 15) \ ++ cnt = 15; \ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++ if (kstrtouint(buf, 10, &percent)) \ ++ return -EINVAL; \ ++ if (percent > 100) \ ++ return -EINVAL; \ ++ sysctl_sched_cache_##name = percent; \ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++static int sched_cache_show_##name(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", sysctl_sched_cache_##name); \ ++ return 0; \ ++} \ ++static int sched_cache_open_##name(struct inode *inode, \ ++ struct file *filp) \ ++{ \ ++ return single_open(filp, sched_cache_show_##name, NULL); \ ++} \ ++static const struct file_operations sched_cache_fops_##name = { \ ++ .open = sched_cache_open_##name, \ ++ .write = sched_cache_write_##name, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++ ++SCHED_CACHE_CREATE_CONTROL(ignore_rss, 1); ++int get_sched_cache_rss_scale(void) ++{ ++ if (!sysctl_sched_cache_ignore_rss) ++ return 0; ++ ++ if (sysctl_sched_cache_ignore_rss >= 100) ++ return INT_MAX; ++ /* ++ * Suppose the L3 size is 32MB. If the ++ * sysctl_sched_cache_ignore_rss is 1: ++ * When the RSS is larger than 32MB, ++ * the process is regarded as exceeding ++ * the LLC capacity. 
If the ++ * sysctl_sched_cache_ignore_rss is 99: ++ * When the RSS is larger than 784GB, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 784GB = (1 + (99 - 1) * 256) * 32MB ++ */ ++ return (1 + (sysctl_sched_cache_ignore_rss - 1) * 256); ++} ++ ++SCHED_CACHE_CREATE_CONTROL(aggr_cap, 50); ++int get_sched_cache_cap_scale(void) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = ++ cpumask_weight(cpu_smt_mask(raw_smp_processor_id())); ++#endif ++ return (sysctl_sched_cache_aggr_cap / smt_nr); ++} ++#endif /* SCHED_CACHE */ ++ + #ifdef CONFIG_SMP + + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, +@@ -533,10 +610,13 @@ static __init int sched_init_debug(void) + #endif + + #ifdef CONFIG_SCHED_CACHE +- debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap); + debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb); + debugfs_create_u32("llc_period", 0644, debugfs_sched, &sysctl_llc_period); + debugfs_create_u32("llc_old", 0644, debugfs_sched, &sysctl_llc_old); ++ debugfs_create_file("llc_aggr_cap", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_aggr_cap); ++ debugfs_create_file("llc_ignore_rss", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_ignore_rss); + #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cbda7dad1305..018825f04063 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1211,6 +1211,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + struct cacheinfo *l3_leaf; + unsigned long rss; + unsigned int llc; ++ int scale; + + /* + * get_cpu_cacheinfo_level() can not be used +@@ -1230,7 +1231,11 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + rss = get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + +- return (llc <= (rss * PAGE_SIZE)); ++ scale = get_sched_cache_rss_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((llc * scale) <= (rss * PAGE_SIZE)); + } + + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) +@@ -9037,7 +9042,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + static long __migrate_degrades_locality(struct task_struct *p, + int src_cpu, int dst_cpu, + bool idle); +-__read_mostly unsigned int sysctl_llc_aggr_cap = 50; + __read_mostly unsigned int sysctl_llc_aggr_imb = 20; + + /* +@@ -9049,7 +9053,7 @@ __read_mostly unsigned int sysctl_llc_aggr_imb = 20; + * (default: ~50%) + */ + #define fits_llc_capacity(util, max) \ +- ((util) * 100 < (max) * sysctl_llc_aggr_cap) ++ ((util) * 100 < (max) * get_sched_cache_cap_scale()) + + /* + * The margin used when comparing utilization. 
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index d752d64d4acd..eaeca4e77ead 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2855,11 +2855,12 @@ extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + + #ifdef CONFIG_SCHED_CACHE +-extern unsigned int sysctl_llc_aggr_cap; + extern unsigned int sysctl_llc_aggr_imb; + extern struct static_key_false sched_cache_present; + extern unsigned int sysctl_llc_period; + extern unsigned int sysctl_llc_old; ++int get_sched_cache_rss_scale(void); ++int get_sched_cache_cap_scale(void); + #endif + + #ifdef CONFIG_SCHED_HRTICK +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-28-28-sched-Add-ftrace-to-track-cache-aware-load-balance-and-hottest-CPU-changes.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-28-28-sched-Add-ftrace-to-track-cache-aware-load-balance-and-hottest-CPU-changes.patch new file mode 100644 index 0000000..051e055 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-28-28-sched-Add-ftrace-to-track-cache-aware-load-balance-and-hottest-CPU-changes.patch @@ -0,0 +1,307 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 47CE12797BD + for ; Sat, 9 Aug 2025 05:15:27 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716532; cv=none; b=Q3Lu9iJcgpkY3JofeQsI3NuQ1TQam6CIlO+tdvTSCRjGAjVblky3W53EIomiHy80dmktuPQdtHxgcRNWPE+j/bg5BQe6GDtHnoJUJTNFKCR/9DYjJgajvDVOAMxm+f5X8nLVN12/qTm5fIAB7ohPRpMT5XEfcEaB2rgn4WdJjlA= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716532; c=relaxed/simple; + bh=Qnj+s91JA/iUJUPb0vlHFoRsDjRZdaL0BuPbFnDHlf4=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Id7UAlt3jf4R+5/V26lYDKIMUvQ2sWz1U1L6PxHG7qpC1Y2DBCKgalLglN7phNtyW+llMFcUZ8TM/8hPX4zlNTja13GIfazsivILfGOPcNG17Rvk+pJ6zxYEFCqME/cFa0umvr+QT0QzQ/sIQRDaZpzSMHofa4VucqncRSPHNUo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=KZT0jTXM; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="KZT0jTXM" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716528; x=1786252528; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Qnj+s91JA/iUJUPb0vlHFoRsDjRZdaL0BuPbFnDHlf4=; + b=KZT0jTXMrwi6hPfQPUBt2UhPb6fUrWeb/O3CDNims/uo7a2fRVzB96ni + iAcBIq7cRPl82Mb6WAlg3t8qyCFKa4+mARu2XOEG/1TlTOTPJpPcEFd23 + uyE/VkcDEWP2Pk6IHBPTbmMmpwS9xu2Sc2pif7fu4IJ95Ou4he7GApWaO + MXcgjRuqD2lXHrJW2ZCh04Xx6L3C8w5eBUkk3oAlp2wVkN4HgtgEp9ORv + Z96q81Q0Wd5WaHbBqUBeZQ2vuQf9nOsBlZZ3rd4ahEG7C3LjJGEU6HHLi + KLNOB6OypoKBQux1+HQWGMGe9cMObuKZXq8mFwxxNatSDn+GrcGALRB6K + A==; +X-CSE-ConnectionGUID: 
jbgqMP0qR/OvQDc48pzOPw== +X-CSE-MsgGUID: hEeOV4ycSKW7r3RHicCLKw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="68514913" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="68514913" +Received: from fmviesa010.fm.intel.com ([10.60.135.150]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:15:27 -0700 +X-CSE-ConnectionGUID: RhK2nh/kRuuYh7KksFDbdw== +X-CSE-MsgGUID: nLJQ9FRIRdmfFlE8PcQEEw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="166275369" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa010.fm.intel.com with ESMTP; 08 Aug 2025 22:15:20 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 28/28] sched: Add ftrace to track cache aware load balance and hottest CPU changes +Date: Sat, 9 Aug 2025 13:09:17 +0800 +Message-Id: <3e3622a5b2129b56741989f15a8debabec064de9.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce 3 trace events: + +1. +The average time spent scanning CPUs and calculating occupancy +in each sample period. This event can be used to track the +overhead of cache-aware scheduling. + +2. +The footprint when switching to a new mm_sched_cpu (a cache-hot CPU). +This event can be used to track whether there is any abnormal +bouncing of mm_sched_cpu. + +3. +The footprint of load balancing when migrating a task between CPUs. +This event can be used to track whether cache-aware load balancing +behaves as expected. + +All these events can be used with bpftrace to gain a basic +understanding of whether cache-aware scheduling is effective. 
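+
+A rough usage sketch, assuming the three events register under the
+"sched" group in tracefs (usually mounted at /sys/kernel/tracing):
+
+    cd /sys/kernel/tracing
+    echo 1 > events/sched/sched_scan_cost/enable
+    echo 1 > events/sched/sched_cache_work/enable
+    echo 1 > events/sched/sched_attach_task/enable
+    cat trace_pipe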
+ +Suggested-by: Shrikanth Hegde +Signed-off-by: Chen Yu +--- + include/trace/events/sched.h | 93 ++++++++++++++++++++++++++++++++++++ + kernel/sched/fair.c | 25 ++++++++-- + 2 files changed, 113 insertions(+), 5 deletions(-) + +diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h +index 4e6b2910cec3..398180c18946 100644 +--- a/include/trace/events/sched.h ++++ b/include/trace/events/sched.h +@@ -10,6 +10,99 @@ + #include + #include + ++TRACE_EVENT(sched_scan_cost, ++ ++ TP_PROTO(struct task_struct *t, u64 cost, int nr, ++ u64 old_running, u64 new_running), ++ ++ TP_ARGS(t, cost, nr, old_running, new_running), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( u64, cost ) ++ __field( int, nr ) ++ __field( u64, old_running ) ++ __field( u64, new_running ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ __entry->cost = cost; ++ __entry->nr = nr; ++ __entry->old_running = old_running; ++ __entry->new_running = new_running; ++ ), ++ ++ TP_printk("comm=%s pid=%d cost=%llu nr=%d old_r=%lld new_r=%lld", ++ __entry->comm, __entry->pid, ++ __entry->cost, __entry->nr, ++ __entry->old_running, __entry->new_running) ++); ++ ++TRACE_EVENT(sched_cache_work, ++ ++ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc, ++ int new_cpu, int new_llc), ++ ++ TP_ARGS(t, pref_cpu, pref_llc, new_cpu, new_llc), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, pref_cpu ) ++ __field( int, pref_llc ) ++ __field( int, new_cpu ) ++ __field( int, new_llc ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ __entry->pref_cpu = pref_cpu; ++ __entry->pref_llc = pref_llc; ++ __entry->new_cpu = new_cpu; ++ __entry->new_llc = new_llc; ++ ), ++ ++ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d", ++ __entry->comm, __entry->pid, ++ __entry->pref_cpu, __entry->pref_llc, ++ __entry->new_cpu, __entry->new_llc) ++); ++ ++TRACE_EVENT(sched_attach_task, ++ ++ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc, ++ int attach_cpu, int attach_llc), ++ ++ TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, pref_cpu ) ++ __field( int, pref_llc ) ++ __field( int, attach_cpu ) ++ __field( int, attach_llc ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ __entry->pref_cpu = pref_cpu; ++ __entry->pref_llc = pref_llc; ++ __entry->attach_cpu = attach_cpu; ++ __entry->attach_llc = attach_llc; ++ ), ++ ++ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d", ++ __entry->comm, __entry->pid, ++ __entry->pref_cpu, __entry->pref_llc, ++ __entry->attach_cpu, __entry->attach_llc) ++); ++ + /* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 018825f04063..cb2c33ee0d92 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1467,8 +1467,9 @@ static void __no_profile task_cache_work(struct callback_head *work) + unsigned long last_m_a_occ = 0; + int cpu, m_a_cpu = -1, cache_cpu, + pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(), +- nr_running = 0; ++ nr_running = 0, nr_scan = 0; + cpumask_var_t cpus; ++ u64 t0, scan_cost = 0; + + WARN_ON_ONCE(work != &p->cache_work); + +@@ -1499,6 
+1500,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + pref_nid = p->numa_preferred_nid; + #endif + ++ t0 = sched_clock_cpu(curr_cpu); + scoped_guard (cpus_read_lock) { + get_scan_cpumasks(cpus, cache_cpu, + pref_nid, curr_cpu); +@@ -1521,6 +1523,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_cpu = i; + } + nr++; ++ nr_scan++; + + rcu_read_lock(); + cur = rcu_dereference(cpu_rq(i)->curr); +@@ -1529,8 +1532,8 @@ static void __no_profile task_cache_work(struct callback_head *work) + nr_running++; + rcu_read_unlock(); + +- trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", +- per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); ++ //trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", ++ // per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); + } + + // a_occ /= nr; +@@ -1541,8 +1544,8 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) + last_m_a_occ = a_occ; + +- trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", +- per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); ++ //trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", ++ // per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); + + for_each_cpu(i, sched_domain_span(sd)) { + /* XXX threshold ? */ +@@ -1553,12 +1556,17 @@ static void __no_profile task_cache_work(struct callback_head *work) + } + } + ++ scan_cost = sched_clock_cpu(curr_cpu) - t0; ++ + if (m_a_occ > (2 * last_m_a_occ)) { + /* avoid the bouncing of mm_sched_cpu */ ++ trace_sched_cache_work(p, mm->mm_sched_cpu, llc_id(mm->mm_sched_cpu), ++ m_a_cpu, llc_id(m_a_cpu)); + mm->mm_sched_cpu = m_a_cpu; + } + + update_avg(&mm->nr_running_avg, nr_running); ++ trace_sched_scan_cost(p, scan_cost, nr_scan, mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +@@ -10443,6 +10451,13 @@ static void attach_task(struct rq *rq, struct task_struct *p) + { + lockdep_assert_rq_held(rq); + ++#ifdef CONFIG_SCHED_CACHE ++ if (p->mm) ++ trace_sched_attach_task(p, ++ p->mm->mm_sched_cpu, ++ p->mm->mm_sched_cpu != -1 ? 
llc_id(p->mm->mm_sched_cpu) : -1, ++ cpu_of(rq), llc_id(cpu_of(rq))); ++#endif + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); + wakeup_preempt(rq, p, 0); +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.17/0002-bbr3.patch b/sys-kernel/gentoo-sources-6.17/0002-bbr3.patch new file mode 100644 index 0000000..e69de29 diff --git a/sys-kernel/gentoo-sources-6.17/0003-block.patch b/sys-kernel/gentoo-sources-6.17/0003-block.patch new file mode 100644 index 0000000..e69de29 diff --git a/sys-kernel/gentoo-sources-6.17/0005-fixes.patch.skip b/sys-kernel/gentoo-sources-6.17/0005-fixes.patch.skip new file mode 100644 index 0000000..e69de29 diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch new file mode 100644 index 0000000..2ac2c2f --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch @@ -0,0 +1,654 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 46062169AD2 + for ; Sat, 11 Oct 2025 18:18:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206703; cv=none; b=RpWLRsxlJTzhlJSNJ6YDnnOidsJ7oCIJ0QG0EXS7VFoOFFRWiuWYlsET6M5MjOkyE+dnQih3vxbVtcm+li+EdUZBeyP5FVticeDHkmuoWPHZblewToySaE5iRFgZqZZMrF2/g7ww+IHVQ3wb1PmaWoyqrDBaIo5To0g72h92TRE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206703; c=relaxed/simple; + bh=eCGUZmunSjVOsoqwxe8kKF4T+jrOyKsftgZkbuwe1Jo=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; b=QJa7XLmNRAgs2IV6jX9+J3RTiz2TA7hXn5NgC4yjWKV75coBs2eumwHZZgG2HlZqrxNZy2yyHAMM73rFnrDZIvG+RpHWxcfbJopVHrre/vMQ3HJJFjQUmhaAwWCfX+5CuF2S3mkLLbQPk1FwQMpFRQzmQi7ZRNOguwaR+/BIBvQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=fA7dEfIE; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="fA7dEfIE" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206701; x=1791742701; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=eCGUZmunSjVOsoqwxe8kKF4T+jrOyKsftgZkbuwe1Jo=; + b=fA7dEfIE91ULN1jqc64owLAysrWyqWsDA5nuO1+sgcIA15Yn8yYj6iw4 + 55VPKl3g+xYXhPmGyE7a0LZvFUc9YG3ckmUpqO0pvf6oo1RJcM13mS3yi + KNsM4bbd9aFpNPTftzZGqryw94QrGirzar7JNUNOk0MJqRkziOVPLHnOi + iVfGn7SOaI4LzDDzlorOXwaeFstT3f2UVe0Cr2vAWBdxYyDop0Z+G9hqb + BhSDn+aeXU8OqAYP/xGpt3Ce8cbnDhTJhA+r5jzej1xMspSEeS1p/SQOm + slC+k3w/mm9HPugo6aL39ZyshlQHrAN4qvnJBJT/5GnR6bFHs9O0IKtHz + w==; +X-CSE-ConnectionGUID: AwkM8kCOR6yXxOyCyDBj4Q== +X-CSE-MsgGUID: 
FBEmDsF5QKC61vf0MqpBmQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339614" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339614" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:19 -0700 +X-CSE-ConnectionGUID: HGgPT3dBQFm59TiA7l3rfA== +X-CSE-MsgGUID: SlOHviQzSgGRjsbScX9f4Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487181" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:19 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 01/19] sched/fair: Add infrastructure for cache-aware load balancing +Date: Sat, 11 Oct 2025 11:24:38 -0700 +Message-Id: <865b852e3fdef6561c9e0a5be9a94aec8a68cdea.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: "Peter Zijlstra (Intel)" + +Cache-aware load balancing aims to aggregate tasks with potential +shared resources into the same cache domain. This approach enhances +cache locality, thereby optimizing system performance by reducing +cache misses and improving data access efficiency. + +In the current implementation, threads within the same process are +considered as entities that potentially share resources. +Cache-aware load balancing monitors the CPU occupancy of each cache +domain for every process. Based on this monitoring, it endeavors to +migrate threads within a given process to its cache-hot domains, +with the goal of maximizing cache locality. + +It is an attempt at modelling cache affinity. While the patch series +only targets LLC, it could very well be extended to clusters (L2), +or other kind of domains grouping inside a node. + +As it stands, the mechanism only computes a CPU within the LLC that +has the highest recent runtime; this CPU is then used in the load +balance path in subsequent patches to steer toward this LLC. + +More elaborate measures could be added later in NUMA_BALANCING: for +example, migrating task A to its preferred LLC when it has spare CPU +capacity, or swapping task A with another running task B in task A’s +preferred LLC. 
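+
+As a concrete illustration of the accounting introduced below: the
+per-CPU runtime a process accumulates is decayed geometrically,
+halving every epoch (10 ms with the defaults), so runtime recorded
+three epochs ago carries only 1/8 of its original weight and recent
+execution dominates the occupancy estimate. Each LLC is scored by the
+sum of these per-CPU occupancies, and the CPU with the highest recent
+occupancy inside the winning LLC is recorded as mm_sched_cpu.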
+ +Originally-by: Peter Zijlstra (Intel) +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/mm_types.h | 44 ++++++ + include/linux/sched.h | 4 + + init/Kconfig | 11 ++ + kernel/fork.c | 6 + + kernel/sched/core.c | 6 + + kernel/sched/fair.c | 288 +++++++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 1 + + kernel/sched/sched.h | 8 ++ + 8 files changed, 368 insertions(+) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 08bc2442db93..3ca557c2f36d 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -927,6 +927,11 @@ struct mm_cid { + }; + #endif + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1017,6 +1022,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1436,6 +1452,34 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f8188b833350..d7ddb7ce6c4b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1400,6 +1400,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +diff --git a/init/Kconfig b/init/Kconfig +index e3eb63eadc87..4e625db7920a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -970,6 +970,17 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware load balance" ++ default y ++ depends on SMP ++ help ++ When enabled, the scheduler will attempt to aggregate tasks from ++ the same process onto a single Last Level Cache (LLC) domain when ++ possible. This improves cache locality by keeping tasks that share ++ resources within the same cache domain, reducing cache misses and ++ lowering data access latency. 
++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index c4ada32598bd..9cd6efe2926d 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -680,6 +680,7 @@ void __mmdrop(struct mm_struct *mm) + cleanup_lazy_tlbs(mm); + + WARN_ON_ONCE(mm == current->active_mm); ++ mm_destroy_sched(mm); + mm_free_pgd(mm); + mm_free_id(mm); + destroy_context(mm); +@@ -1079,6 +1080,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1088,6 +1092,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index be00629f0ba4..79d15e904d12 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4520,6 +4520,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8821,6 +8822,11 @@ void __init sched_init(void) + + rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = jiffies; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b173a059315c..a2ea002f4fd6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1152,6 +1152,8 @@ void post_init_entity_util_avg(struct task_struct *p) + sa->runnable_avg = sa->util_avg; + } + ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); ++ + static s64 update_se(struct rq *rq, struct sched_entity *se) + { + u64 now = rq_clock_task(rq); +@@ -1174,6 +1176,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); ++ account_mm_sched(rq, donor, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); +@@ -1193,6 +1196,289 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + return delta_exec; + } + ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. 
++ */ ++#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ ++#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ ++ ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = rq->cpu_epoch; ++ epoch = rq->cpu_epoch; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ /* ++ * The update to mm->pcpu_sched should not be reordered ++ * before initialization to mm's other fields, in case ++ * the readers may get invalid mm_sched_epoch, etc. ++ */ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. ++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (p->sched_class != &fair_sched_class) ++ return; ++ /* ++ * init_task and kthreads don't having mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate its preferred state. 
++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ get_nr_threads(p) <= 1) { ++ if (mm->mm_sched_cpu != -1) ++ mm->mm_sched_cpu = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, ++ int pref_nid, int curr_cpu) ++{ ++#ifdef CONFIG_NUMA_BALANCING ++ /* First honor the task's preferred node. */ ++ if (pref_nid != NUMA_NO_NODE) ++ cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); ++#endif ++ ++ /* Next honor the task's cache CPU if it is not included. */ ++ if (cache_cpu != -1 && !cpumask_test_cpu(cache_cpu, cpus)) ++ cpumask_or(cpus, cpus, ++ cpumask_of_node(cpu_to_node(cache_cpu))); ++ ++ /* ++ * Lastly make sure that the task's current running node is ++ * considered. ++ */ ++ if (!cpumask_test_cpu(curr_cpu, cpus)) ++ cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); ++} ++ ++static void __no_profile task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ unsigned long curr_m_a_occ = 0; ++ int cpu, m_a_cpu = -1, cache_cpu, ++ pref_nid = NUMA_NO_NODE, curr_cpu; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ curr_cpu = task_cpu(p); ++ cache_cpu = mm->mm_sched_cpu; ++#ifdef CONFIG_NUMA_BALANCING ++ if (static_branch_likely(&sched_numa_balancing)) ++ pref_nid = p->numa_preferred_nid; ++#endif ++ ++ scoped_guard (cpus_read_lock) { ++ get_scan_cpumasks(cpus, cache_cpu, ++ pref_nid, curr_cpu); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, i; ++ ++ if (!sd) ++ continue; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ } ++ ++ /* ++ * Compare the accumulated occupancy of each LLC. The ++ * reason for using accumulated occupancy rather than average ++ * per CPU occupancy is that it works better in asymmetric LLC ++ * scenarios. ++ * For example, if there are 2 threads in a 4CPU LLC and 3 ++ * threads in an 8CPU LLC, it might be better to choose the one ++ * with 3 threads. However, this would not be the case if the ++ * occupancy is divided by the number of CPUs in an LLC (i.e., ++ * if average per CPU occupancy is used). ++ * Besides, NUMA balancing fault statistics behave similarly: ++ * the total number of faults per node is compared rather than ++ * the average number of faults per CPU. This strategy is also ++ * followed here. 
++ */ ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ curr_m_a_occ = a_occ; ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ if (m_a_occ > (2 * curr_m_a_occ)) { ++ /* ++ * Avoid switching mm_sched_cpu too fast. ++ * The reason to choose 2X is because: ++ * 1. It is better to keep the preferred LLC stable, ++ * rather than changing it frequently and cause migrations ++ * 2. 2X means the new preferred LLC has at least 1 more ++ * busy CPU than the old one(200% vs 100%, eg) ++ * 3. 2X is chosen based on test results, as it delivers ++ * the optimal performance gain so far. ++ */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ + /* + * Used by other classes to account runtime. + */ +@@ -13031,6 +13317,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 3c12d9f93331..d2af7bfd36bf 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_UTIL, true) + ++SCHED_FEAT(SCHED_CACHE, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index be9745d104f7..2ded8d3d0ecc 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1166,6 +1166,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3790,6 +3796,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + static inline +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch b/sys-kernel/gentoo-sources-6.17/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch new file mode 100644 index 0000000..cbf16ce --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch @@ -0,0 +1,227 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 19068204096 + for ; Sat, 11 Oct 2025 18:18:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206704; cv=none; b=EzlLh3pSj7Y4f8RITAS280jAzGdfSil0Uvmf2s0iDBWXhjbTN9kKcwe8yCBI8vI/kpxwAU/q6SDZiBXRODyVXxt+x1ZEHGNytyNVJ+14VdLcKLUF/bWqEXXojGdMU1nZFeYor5k/Gwn2eBMXY7mjVq+req3REwzEV/z7PNxWJYU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206704; c=relaxed/simple; + bh=BGRV8Sqvoh/cH0/obDDFWGIX+d3J6kT5RHYq4DeXAFQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; b=naTQ9gtxsiPYap1e7sRA67shhCjtvQU5+UWYPmFmFnsa1NV0CLod+8tcKlUn52BHYuXFMHk+KQi3AhpPSOC+Tysfot4R/EhnOjDucwfpslAmfKl+rwCfOrGMnq3fjOG/h3r7EnuLxz8dxpUfqriJzedrFrStvfO37iAPvvF5HVg= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=LSwa/WAK; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="LSwa/WAK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206702; x=1791742702; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=BGRV8Sqvoh/cH0/obDDFWGIX+d3J6kT5RHYq4DeXAFQ=; + b=LSwa/WAKvGAX6RIYpQ7iNqrlvhm/Szlkb5ZlWCgbajQDsBhTiTWg/PPi + Nxj6VEs7MSoZptgkIvxX8jl3FQca3deDnRuhlinmaGbJYu3LY3ZP4p3jp + 
4+hBugKd3GkfwcLlWr+3IrP84r9gwdtMmKlDccI1G07f4s4tirTBoEDsm + gJ8uA3qrKlx1xYMf/sgz5udiByo4NeRPGdBdJ+bYBTDvNTGeTE9k4bBmi + 0OuSxEI9YhInAS8s2mr8VnpZwUVjixmAO4g6ZwRHW42PucNrjAj/v7YoU + sfJ1aDaIb4/pD7oTExOcJxChABHQZAXGQ1b9F1jBoWdX4w8mb0HwbQJ+I + A==; +X-CSE-ConnectionGUID: V6kqtIYCR06jkGZvnWCLsQ== +X-CSE-MsgGUID: XSPXCIWWQjiVjOSNzEq1Ow== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339631" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339631" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:20 -0700 +X-CSE-ConnectionGUID: wcTW2V7hQHun3H1J8na2Fw== +X-CSE-MsgGUID: zfpr8MStR5yuJxzDpmsnpw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487184" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:20 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 02/19] sched/fair: Record per-LLC utilization to guide cache-aware scheduling decisions +Date: Sat, 11 Oct 2025 11:24:39 -0700 +Message-Id: <7684e7381c61a2a0d0580790340d4daa5349e48c.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +When a system becomes busy and a process’s preferred LLC is +saturated with too many threads, tasks within that LLC migrate +frequently. These in LLC migrations introduce latency and degrade +performance. To avoid this, task aggregation should be suppressed when +the preferred LLC is overloaded, which requires a metric to indicate +LLC utilization. + +Record per LLC utilization/cpu capacity during periodic load +balancing. These statistics will be used in later patches to decide +whether tasks should be aggregated into their preferred LLC. 
+ +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/sched/topology.h | 4 ++ + kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++++++ + 2 files changed, 77 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 5263746b63e8..fa25db00fdb6 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -77,6 +77,10 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++ unsigned long capacity ____cacheline_aligned_in_smp; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a2ea002f4fd6..1ebb0d99a906 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9559,6 +9559,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* Called from load balancing paths with rcu_read_lock held */ ++static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = READ_ONCE(sd_share->capacity); ++ ++ return true; ++} ++#else ++static inline bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ return false; ++} ++#endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +@@ -10529,6 +10552,55 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Record the statistics for this scheduler group for later ++ * use. These values guide load balancing on aggregating tasks ++ * to a LLC. ++ */ ++static void record_sg_llc_stats(struct lb_env *env, ++ struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ /* ++ * Find the child domain on env->dst_cpu. This domain ++ * is either the domain that spans this group(if the ++ * group is a local group), or the sibling domain of ++ * this group. ++ */ ++ struct sched_domain *sd = env->sd->child; ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* only care about sched domains spanning a LLC */ ++ if (sd != rcu_dereference(per_cpu(sd_llc, env->dst_cpu))) ++ return; ++ ++ /* ++ * At this point we know this group spans a LLC domain. ++ * Record the statistic of this group in its corresponding ++ * shared LLC domain. ++ */ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (READ_ONCE(sd_share->util_avg) != sgs->group_util) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++ ++ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) ++ WRITE_ONCE(sd_share->capacity, sgs->group_capacity); ++} ++#else ++static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10618,6 +10690,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ record_sg_llc_stats(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch b/sys-kernel/gentoo-sources-6.17/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch new file mode 100644 index 0000000..eb1895b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch @@ -0,0 +1,335 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2F9012652B7 + for ; Sat, 11 Oct 2025 18:18:22 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206704; cv=none; b=oUXwn7ZLltUxrcsLLRQdMkG+rOj3I6N99RIlDJViVMyN84ZxeHx7+Ziq9zOEmnN6HNfk258hdIef+3nAkETeBkCnWEbZ8Lcj64n3OoXf0SrXkICA1KPwc1TZ230lpQNfogVeErSJlu4VOhrgueBPexZRP8Ng8MlzAqpdxuV0fQw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206704; c=relaxed/simple; + bh=ogKsGwPqpnqTItrkOZHQAqJw6k94DPs+hyTioL4d/Ig=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=h155j6xc5cDWdV6bfIecXus0Znq8M6zidqbVhtVjeQT/UoiHcyIrY8v1abXoVw27R0/39P2bQUH4GyYEjMOV8PSTvlLp8J+kYh4mcI1SSe5ftkudSs2ubZG59uaM4B6xXwz85tEAhPwwNkRLqFlmW7J/wyi3Ynw+ec/ie7a3Ft4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=n7smfE6o; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="n7smfE6o" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206702; x=1791742702; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ogKsGwPqpnqTItrkOZHQAqJw6k94DPs+hyTioL4d/Ig=; + b=n7smfE6oCjv1Z9pv/7dg2JDtqoMwaTw0XnoJhqh6krIk55XD846r100l + CQyKNCviKGlIlQvhs/a27sgH4IgQduwhbRn6XT0KlUibkjI+C8DxLau1W + bQGlFOBkWVF6N/GWfn6y0ss98uylK337lt84xU7aPoM+QWTzjR+VkOrKT + 0bIzxevMwLmEG4vuOleJ69vSQP6G0PZSGpGrTBTnbFEemOJQO4Ufh8Z3S + CBvnKym+IUG+WQx9TQa+cFfFXkPxhSkobYj2dyGq+CWyc4oBsOiaaIfuN + mb6/NAGjVnTGTjlIsC3a7QsDovld1JkhMvVnrniOZGCbMVHv6vrIMp6no + g==; +X-CSE-ConnectionGUID: y8Q0FIVVTeyqh+iA7G7QGw== +X-CSE-MsgGUID: NHnFhDxxRvChXLKbODkIZw== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339652" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339652" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with 
ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:21 -0700 +X-CSE-ConnectionGUID: r3BrcjKDSJONY4pZr3YdUQ== +X-CSE-MsgGUID: 9FSjHRHPTQWyN3aom4KQIA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487189" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:21 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 03/19] sched/fair: Introduce helper functions to enforce LLC migration policy +Date: Sat, 11 Oct 2025 11:24:40 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Cache-aware scheduling aggregates threads onto their preferred LLC, +mainly through load balancing. When the preferred LLC becomes +saturated, more threads are still placed there, increasing latency. +A mechanism is needed to limit aggregation so that the preferred LLC +does not become overloaded. + +Introduce helper functions can_migrate_llc() and +can_migrate_llc_task() to enforce the LLC migration policy: + + 1. Aggregate a task to its preferred LLC if both source and + destination LLCs are not too busy (<50% utilization, tunable), + or if doing so will not leave the preferred LLC much more + imbalanced than the non-preferred one (>20% utilization + difference, tunable, similar to imbalance_pct of the LLC domain). + 2. Allow moving a task from overloaded preferred LLC to a non preferred + LLC if this will not cause the non preferred LLC to become + too imbalanced to cause a later migration back. + 3. If both LLCs are too busy, let the generic load balance to spread + the tasks. + +This hysteresis prevents tasks from being migrated into and out of the +preferred LLC frequently (back and forth): the threshold for migrating +a task out of its preferred LLC is higher than that for migrating it +into the LLC. + +Since aggregation tends to make the preferred LLC busier than others, +the imbalance tolerance is controlled by llc_imb_pct. If set to 0, +tasks may still aggregate to the preferred LLC as long as it is +not more utilized than the source LLC, preserving the preference. 
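+
+A rough worked example with the defaults (llc_overload_pct=50,
+llc_imb_pct=20): a task may be pulled toward its preferred LLC as
+long as that LLC would stay below 50% of its capacity after the move,
+or would not end up more than 20% busier than the source LLC. In the
+other direction, a task is only pushed out of its preferred LLC if
+that LLC stays above 50% utilization and remains more than 20% busier
+than the destination after the move, so it will not simply be pulled
+straight back.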
+ +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 145 +++++++++++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 5 ++ + 3 files changed, 154 insertions(+) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 02e16b70a790..57bb04ebbf96 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -523,6 +523,10 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct); ++ debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct); ++#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1ebb0d99a906..cd080468ddc9 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + #define EPOCH_PERIOD (HZ / 100) /* 10 ms */ + #define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ + ++__read_mostly unsigned int llc_overload_pct = 50; ++__read_mostly unsigned int llc_imb_pct = 20; ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -9560,6 +9563,27 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + } + + #ifdef CONFIG_SCHED_CACHE ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter llc_overload_pct determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * llc_overload_pct) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + llc_imb_pct)) ++ + /* Called from load balancing paths with rcu_read_lock held */ + static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +@@ -9575,6 +9599,127 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + + return true; + } ++ ++/* ++ * Decision matrix according to the LLC utilization. To ++ * decide whether we can do task aggregation across LLC. ++ * ++ * By default, 50% is the threshold to treat the LLC as busy, ++ * and 20% is the utilization imbalance percentage to decide ++ * if the preferred LLC is busier than the non-preferred LLC. ++ * ++ * 1. moving towards the preferred LLC, dst is the preferred ++ * LLC, src is not. ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% Y Y Y N ++ * 40% Y Y Y Y ++ * 50% Y Y G G ++ * 60% Y Y G G ++ * ++ * 2. moving out of the preferred LLC, src is the preferred ++ * LLC, dst is not: ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% N N N N ++ * 40% N N N N ++ * 50% N N G G ++ * 60% Y N G G ++ * ++ * src : src_util ++ * dst : dst_util ++ * Y : Yes, migrate ++ * N : No, do not migrate ++ * G : let the Generic load balance to even the load. ++ * ++ * The intention is that if both LLCs are quite busy, cache aware ++ * load balance should not be performed, and generic load balance ++ * should take effect. 
However, if one is busy and the other is not, ++ * the preferred LLC capacity(50%) and imbalance criteria(20%) should ++ * be considered to determine whether LLC aggregation should be ++ * performed to bias the load towards the preferred LLC. ++ */ ++ ++/* migration decision, 3 states are orthogonal. */ ++enum llc_mig { ++ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ ++ mig_llc, /* Y: Do LLC preference based migration */ ++ mig_unrestricted /* G: Don't restrict generic load balance migration */ ++}; ++ ++static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_unrestricted; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_unrestricted; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * llc_imb_pct is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. ++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_llc; ++} ++ ++/* ++ * Check if task p can migrate from src_cpu to dst_cpu ++ * in terms of cache aware load balance. 
++ */ ++static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ bool to_pref; ++ int cpu; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_unrestricted; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_unrestricted; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ to_pref = true; ++ else if (cpus_share_cache(src_cpu, cpu)) ++ to_pref = false; ++ else ++ return mig_unrestricted; ++ ++ return can_migrate_llc(src_cpu, dst_cpu, ++ task_util(p), to_pref); ++} ++ + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 2ded8d3d0ecc..a52c96064b36 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2797,6 +2797,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int llc_overload_pct; ++extern unsigned int llc_imb_pct; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch b/sys-kernel/gentoo-sources-6.17/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch new file mode 100644 index 0000000..233f3fe --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch @@ -0,0 +1,208 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD9FE27F75F + for ; Sat, 11 Oct 2025 18:18:23 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206705; cv=none; b=toy7mYgrkMShyfYM+pYJVnlk2kT96KNiv5DNY2SPeZNG+C4hUMbzxW+QMLoY5P4G0gxMEqPJZD1oRcx17kku+G6SaznXM9qHf6TbjE3y6E+5eW6mFGs9F7x17MH+po42oQIBeMuQONsrqKSl7XLcK2ag8qWKJC1Xr5w/c8efzqg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206705; c=relaxed/simple; + bh=DdyW/r2KQaOAUhZji+A8n5cKTc9SCv7SgRP3P8o/I+A=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=BrQQpH91F+AYLu9pNsP5vrblllGBIiYSrf9Tqy9EYC4wS0n0udak+gKeFf8J19+3f0P2Q81tPIF74K0DC5ETs6YeanXYBydnXlUojA//lO1O300HBm7E4ONxjKjmsrUvcSI3JT5Le3EHo8kdx7whhv843/P3GIna7MP3njXDV14= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BHqKXCIn; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BHqKXCIn" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206704; x=1791742704; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=DdyW/r2KQaOAUhZji+A8n5cKTc9SCv7SgRP3P8o/I+A=; + b=BHqKXCInpJ9FMs87LCbtbTr8sCx+I94vOdw+YhnA01VGi2y2vrviHuha + 44dYUBEYMQCSqJ0LZTT2V+2kshxkaTOgIYxGLcnue8xZcdvJE+tFA1vNK + e3l/bHsCjqNkzuXBC7xQTcdlcOk0RWIbIkbhlcUaSh6K3yuxlVHUHJcmE + r0xmWO+olPuADPa5P30u0Ohf3HcjIqBXZsxBvV5VI21iprKzNU2fqZx7i + dnB6Mbk+VkrpWYKhn8UVMBHAO40Hwj1qg7dTaTpQfAWXx8+nbbBZeHxKl + 1QcSW4+uLMzTxhbUTINvxL6mxdB/i7FkzCBGLbgZ013YwkDLFD2+4CBnX + w==; +X-CSE-ConnectionGUID: XU0Bp+klQCiSCfmyOaBeOA== +X-CSE-MsgGUID: qUdy5aE4QB+ndas2O3JrjQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339674" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339674" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:22 -0700 +X-CSE-ConnectionGUID: veyEE6PBTGirh+PomEioDQ== +X-CSE-MsgGUID: eht/GZN/S/ekMdaQtDO0ag== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487193" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:22 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 04/19] sched/fair: Introduce a static key to enable cache aware only for multi LLCs +Date: Sat, 11 Oct 2025 11:24:41 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Enable cache-aware load balancing only if at least 1 NUMA node has +more than one LLC. 
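As a rough illustration of the gating described above, the user-space sketch below models the two conditions with plain booleans: a feature switch standing in for sched_feat(SCHED_CACHE), and an "allowed" flag standing in for the static key that topology setup flips when some node contains more than one LLC. The names cache_aware_allowed, build_topology() and the example topologies are invented for the sketch and are not kernel APIs.

    /*
     * Illustrative user-space model (not kernel code) of the gating logic:
     * cache-aware balancing is considered only when the scheduler feature
     * is on AND at least one node has more than one LLC.
     */
    #include <stdbool.h>
    #include <stdio.h>

    static bool sched_feat_cache = true;   /* stand-in for sched_feat(SCHED_CACHE) */
    static bool cache_aware_allowed;       /* stand-in for the static key */

    /* Flip the "allowed" flag while scanning the topology description. */
    static void build_topology(const int llcs_per_node[], int nr_nodes)
    {
        cache_aware_allowed = false;
        for (int n = 0; n < nr_nodes; n++) {
            if (llcs_per_node[n] > 1) {
                cache_aware_allowed = true;
                return;
            }
        }
    }

    /* Equivalent of sched_cache_enabled(): both conditions must hold. */
    static bool cache_aware_enabled(void)
    {
        return sched_feat_cache && cache_aware_allowed;
    }

    int main(void)
    {
        int single_llc[] = { 1, 1 };   /* two nodes, one LLC each        */
        int multi_llc[]  = { 4, 1 };   /* node 0 split into four LLCs    */

        build_topology(single_llc, 2);
        printf("single LLC per node -> %s\n", cache_aware_enabled() ? "enabled" : "disabled");

        build_topology(multi_llc, 2);
        printf("multiple LLCs in a node -> %s\n", cache_aware_enabled() ? "enabled" : "disabled");
        return 0;
    }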
+ +Suggested-by: Libo Chen +Suggested-by: Adam Li +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 15 ++++++++++++--- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 14 ++++++++++++-- + 3 files changed, 25 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cd080468ddc9..3d643449c48c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1208,6 +1208,14 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; + ++DEFINE_STATIC_KEY_FALSE(sched_cache_allowed); ++ ++static inline bool sched_cache_enabled(void) ++{ ++ return sched_feat(SCHED_CACHE) && ++ static_branch_likely(&sched_cache_allowed); ++} ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -1294,7 +1302,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_sched *pcpu_sched; + unsigned long epoch; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_cache_enabled()) + return; + + if (p->sched_class != &fair_sched_class) +@@ -1330,7 +1338,7 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_cache_enabled()) + return; + + if (!mm || !mm->pcpu_sched) +@@ -10716,7 +10724,8 @@ static void record_sg_llc_stats(struct lb_env *env, + struct sched_domain *sd = env->sd->child; + struct sched_domain_shared *sd_share; + +- if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ if (!sched_cache_enabled() || ++ env->idle == CPU_NEWLY_IDLE) + return; + + /* only care about sched domains spanning a LLC */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index a52c96064b36..60f1e51685ec 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2800,6 +2800,7 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern struct static_key_false sched_cache_allowed; + #endif + + #ifdef CONFIG_SCHED_HRTICK +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 6e2f54169e66..2675db980f70 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -2444,6 +2444,7 @@ static int + build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) + { + enum s_alloc alloc_state = sa_none; ++ bool has_multi_llcs = false; + struct sched_domain *sd; + struct s_data d; + struct rq *rq = NULL; +@@ -2530,10 +2531,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + * between LLCs and memory channels. 
+ */ + nr_llcs = sd->span_weight / child->span_weight; +- if (nr_llcs == 1) ++ if (nr_llcs == 1) { + imb = sd->span_weight >> 3; +- else ++ } else { + imb = nr_llcs; ++ has_multi_llcs = true; ++ } + imb = max(1U, imb); + sd->imb_numa_nr = imb; + +@@ -2581,6 +2584,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + ++#ifdef CONFIG_SCHED_CACHE ++ if (has_multi_llcs) { ++ static_branch_enable_cpuslocked(&sched_cache_allowed); ++ pr_info("Cache aware load balance enabled.\n"); ++ } ++#endif ++ + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch b/sys-kernel/gentoo-sources-6.17/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch new file mode 100644 index 0000000..cd2305a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch @@ -0,0 +1,291 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8E929283153 + for ; Sat, 11 Oct 2025 18:18:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206706; cv=none; b=l9o+r3tPneRXt3UimsPhWTyfqr4rcCBrkqPagUsuj236psyVrtVREf1eV9bh9i5x6sqiX/93/2fGTQOd3tDyAfM2x8nQDBG2tniRFTa1AjKlI5Hs36x8WGu+npNUTYaShkti1wSxrqntJys6VhwZ+aL+o6PQ3k1GyXMU2JJL3bw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206706; c=relaxed/simple; + bh=KmODaaWe2UFjj11ibL17qZDBWmMYCsJpeBqEebS+qwU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=MDjhwzZYr3m7pwdhzj9TlyV526H5WJLBGHEilCqY27+WQSI1yxnPWT6k5Mm6bFKl/0I+sfGQBi/7HzzHe1S3ts6bk23EZaJB+w94GLEZKAcc8cSHQMDIbKKzGRMgBrwPnT0sZBkKxiooppSIJhtXCA86kWL70YWS1bZ1PVuSOI8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BzReY9Ll; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BzReY9Ll" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206705; x=1791742705; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=KmODaaWe2UFjj11ibL17qZDBWmMYCsJpeBqEebS+qwU=; + b=BzReY9LlEh9sk7OgZDcp2VjjY3mwnRzW5hp4d8rSX40TSJQm31n7pNsD + pGDX4pGNqIL2dKhB0TWBOakqdMqoEJBGhhFnbP0SML4ddRpmP22b3hhKk + 66OBjK6EOlIiBTx96elcU0fwjNnZqBKTvf/i3IuC2HlilzxwoimPLi7ym + OqUTRkCWmlqgJ5BjvtUEaD2eb97VkiEAs6iUC5FsMQPohIZRE0ZJGIQT2 + rLWb4YevoZUYtWiZQU/yYmcq5sU7eCp84d/YBPYTw8uDxW2au989TrB9t + olL4givIBdX+ieIJw7430Yz/Es1H+8Ji46MflznNqafshDKBuL8HbpSmx + A==; +X-CSE-ConnectionGUID: xTVpDyXiQYmCxiG8vc8uKg== +X-CSE-MsgGUID: 
ouYA76mXSo+MkfJ9ZAYryA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339693" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339693" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:23 -0700 +X-CSE-ConnectionGUID: Vda9/GgFQc2uyKt8dn0epA== +X-CSE-MsgGUID: 2SFdpXMCSGKC8Z5YqgWCow== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487198" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:23 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 05/19] sched/fair: Add LLC index mapping for CPUs +Date: Sat, 11 Oct 2025 11:24:42 -0700 +Message-Id: <7d75af576986cf447a171ce11f5e8a15a692e780.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce an index mapping between CPUs and their LLCs. This provides +a continuous per LLC index needed for cache-aware load balancing in +later patches. + +The existing per_cpu llc_id usually points to the first CPU of the +LLC domain, which is sparse and unsuitable as an array index. Using +llc_id directly would waste memory. + +With the new mapping, CPUs in the same LLC share a continuous index: + + per_cpu(llc_idx, CPU=0...15) = 0 + per_cpu(llc_idx, CPU=16...31) = 1 + per_cpu(llc_idx, CPU=32...47) = 2 + ... + +The maximum number of LLCs is limited by CONFIG_NR_LLCS. If the number +of LLCs available exceeds CONFIG_NR_LLCS, the cache aware load balance +is disabled. To further save memory, this array could be converted to +dynamic allocation in the future, or the LLC index could be made NUMA +node-wide. + +As mentioned by Adam, if there is no domain with SD_SHARE_LLC, the +function update_llc_idx() should not be invoked to update the index; +otherwise, it will generate an invalid index. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/threads.h | 10 +++++++++ + init/Kconfig | 9 ++++++++ + kernel/sched/fair.c | 11 ++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 47 +++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 79 insertions(+) + +diff --git a/include/linux/threads.h b/include/linux/threads.h +index 1674a471b0b4..2c9b1adfe024 100644 +--- a/include/linux/threads.h ++++ b/include/linux/threads.h +@@ -20,6 +20,16 @@ + /* Places which use this should consider cpumask_var_t. 
*/ + #define NR_CPUS CONFIG_NR_CPUS + ++#ifndef CONFIG_NR_LLCS ++#define CONFIG_NR_LLCS 1 ++#endif ++ ++#if CONFIG_NR_LLCS > NR_CPUS ++#define NR_LLCS NR_CPUS ++#else ++#define NR_LLCS CONFIG_NR_LLCS ++#endif ++ + #define MIN_THREADS_LEFT_FOR_ROOT 4 + + /* +diff --git a/init/Kconfig b/init/Kconfig +index 4e625db7920a..6e4c96ccdda0 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -981,6 +981,15 @@ config SCHED_CACHE + resources within the same cache domain, reducing cache misses and + lowering data access latency. + ++config NR_LLCS ++ int "Maximum number of Last Level Caches" ++ range 2 1024 ++ depends on SMP && SCHED_CACHE ++ default 64 ++ help ++ This allows you to specify the maximum number of last level caches ++ this kernel will support for cache aware scheduling. ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3d643449c48c..61c129bde8b6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1224,6 +1224,17 @@ static int llc_id(int cpu) + return per_cpu(sd_llc_id, cpu); + } + ++/* ++ * continuous LLC index, starting from 0. ++ */ ++static inline int llc_idx(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_idx, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 60f1e51685ec..b448ad6dc51d 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2039,6 +2039,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_llc_idx); + DECLARE_PER_CPU(int, sd_share_id); + DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -2047,6 +2048,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 2675db980f70..4bd033060f1d 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -659,6 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_llc_idx); + DEFINE_PER_CPU(int, sd_share_id); + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -668,6 +669,40 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++int max_llcs = -1; ++ ++static void update_llc_idx(int cpu) ++{ ++#ifdef CONFIG_SCHED_CACHE ++ int idx = -1, llc_id = -1; ++ ++ if (max_llcs > NR_LLCS) ++ return; ++ ++ llc_id = per_cpu(sd_llc_id, cpu); ++ idx = per_cpu(sd_llc_idx, llc_id); ++ ++ /* ++ * A new LLC is detected, increase the index ++ * by 1. 
++ */ ++ if (idx < 0) { ++ idx = max_llcs++; ++ ++ if (max_llcs > NR_LLCS) { ++ if (static_branch_unlikely(&sched_cache_allowed)) ++ static_branch_disable_cpuslocked(&sched_cache_allowed); ++ ++ pr_warn_once("CONFIG_NR_LLCS is too small, disable cache aware load balance\n"); ++ return; ++ } ++ ++ per_cpu(sd_llc_idx, llc_id) = idx; ++ } ++ per_cpu(sd_llc_idx, cpu) = idx; ++#endif ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -687,6 +722,10 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + ++ /* only update the llc index for domain with SD_SHARE_LLC */ ++ if (sd) ++ update_llc_idx(cpu); ++ + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); +@@ -2452,6 +2491,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++#ifdef CONFIG_SCHED_CACHE ++ if (max_llcs < 0) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_idx, i) = -1; ++ max_llcs = 0; ++ } ++#endif ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch b/sys-kernel/gentoo-sources-6.17/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch new file mode 100644 index 0000000..33e7efa --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id A93492836B1 + for ; Sat, 11 Oct 2025 18:18:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206706; cv=none; b=S6xTZtgG4gDit+VImk9W2UzS4qpXEGkcWHMUVoYyOSnpNNw4aucqYAXSSje8zYLjl3z3dX3Jt3ztt7bwcuxWrRrv6qxUGactOiUWUNrvSPN2VWKScV6w3ksMM6saX0NH5ZC3WBABiX0+fpwQlzvqkQFNz80/YqP8x3hbG8jBKng= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206706; c=relaxed/simple; + bh=9oov8ViGgsZaxAZzpTlsnaOcdJ/Jv8NLa7EsoSw2oPE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=JsV8TTkODWXWFKIKrzZGo3NxMw8hU5p/OWk4qVG3F1HoqgFqWBsu2TcQGUVWw1R9rnOAFP+1s9fHghtr+g8SHhcTCX8Srq+6rXX7gAPQLfCi2R3P+f6W+h6FG6DDQXFxrgsSAi265RFjsNyqSNVDyYiSw0j1kUou9k2jg/TFWas= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=maHNOTTa; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="maHNOTTa" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206705; x=1791742705; + h=from:to:cc:subject:date:message-id:in-reply-to: + 
references:mime-version:content-transfer-encoding; + bh=9oov8ViGgsZaxAZzpTlsnaOcdJ/Jv8NLa7EsoSw2oPE=; + b=maHNOTTaUom4zOfjF9aQgzk/EHInefpcQXQBpZ407o2A6QAh7rtx4d1V + uIUh04rGM6MxEKMGQGzPbEcwmEUVnQVNQXhq0m60vo8GIlq3nI3UFHh2/ + okHOmrxdhoN3uwbNZN5d2mGAMO3ADHunEGtbLYRsJ5ffyJXYwvK9ZYj6n + ZqWJDYCygmb5LDln/D3icLbLhH8Zm6QWr4yAgVZQ73wl/I3EgDdp+pIYb + aLimiW5HUOhIlD+krR4Rg02sINFyPrZ2h5VJdZ1v01hMqilwa2zgPVcWi + tEJ0OmQs9iwf0mBA0kNnJx5l2NSvLy+2FE84H8lwtH6U/4ySfKAnmdVGc + Q==; +X-CSE-ConnectionGUID: LhZ9XN5ESr6ORNd5zvY9sA== +X-CSE-MsgGUID: UBKHEBpdQNSkGD6fqT87jQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339711" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339711" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:24 -0700 +X-CSE-ConnectionGUID: M/4LVw/6Qg626wVKqENzqw== +X-CSE-MsgGUID: hqk2hnIER+q1aJ8R3vcczQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487203" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:24 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 06/19] sched/fair: Assign preferred LLC ID to processes +Date: Sat, 11 Oct 2025 11:24:43 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +With cache-aware scheduling enabled, each task is assigned a +preferred LLC ID. This allows quick identification of the LLC domain +where the task prefers to run, similar to numa_preferred_nid in +NUMA balancing. 
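The following stand-alone C sketch models how a task could inherit its preferred LLC ID from the process-wide preferred CPU, assuming a fixed cpu-to-LLC table in place of per_cpu(sd_llc_id). struct mm_model, struct task_model and update_preferred_llc() are illustrative stand-ins, not the actual kernel structures.

    #include <stdio.h>

    #define NR_CPUS 8

    /* Stand-in for per_cpu(sd_llc_id, cpu): the first CPU of each LLC domain. */
    static const int llc_id_of_cpu[NR_CPUS] = { 0, 0, 0, 0, 4, 4, 4, 4 };

    struct mm_model   { int mm_sched_cpu; };  /* process-wide preferred CPU, -1 if unknown */
    struct task_model { int preferred_llc; struct mm_model *mm; };

    /* Derive the task's preferred LLC from the process's preferred CPU. */
    static void update_preferred_llc(struct task_model *p)
    {
        int cpu = p->mm ? p->mm->mm_sched_cpu : -1;

        p->preferred_llc = (cpu >= 0 && cpu < NR_CPUS) ? llc_id_of_cpu[cpu] : -1;
    }

    int main(void)
    {
        struct mm_model mm = { .mm_sched_cpu = 5 };
        struct task_model p = { .preferred_llc = -1, .mm = &mm };

        update_preferred_llc(&p);
        printf("preferred_llc = %d\n", p.preferred_llc);  /* 4: the LLC containing CPU 5 */

        mm.mm_sched_cpu = -1;                /* occupancy dropped; preference cleared */
        update_preferred_llc(&p);
        printf("preferred_llc = %d\n", p.preferred_llc);  /* -1 */
        return 0;
    }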
+ +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 7 +++++++ + 3 files changed, 11 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d7ddb7ce6c4b..8a5e4038cd5c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1402,6 +1402,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index e557f622bd90..5fffbe766f57 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -188,6 +188,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 61c129bde8b6..d6167a029c47 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1312,6 +1312,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + if (!sched_cache_enabled()) + return; +@@ -1342,6 +1343,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } ++ ++ if (mm->mm_sched_cpu != -1) ++ mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch b/sys-kernel/gentoo-sources-6.17/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch new file mode 100644 index 0000000..f87fefd --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch @@ -0,0 +1,257 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B7966283FE1 + for ; Sat, 11 Oct 2025 18:18:25 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206707; cv=none; b=Jt9YvY3nM/0EYBih4PVmiKQ2QzO4ZDLh2TKnGqMyWerCIfIM0CWceRhOpjM2iQwiUHzLszpycQZ+UQorhwMqEi3t7Erkuc8eVsgIO7guz2r8zCqiEsDc75hJulbNVOIh4Hf5WtkLCN2FDwtJ+pKaDQzjrmQsv/RTGx24LhvBhds= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206707; c=relaxed/simple; + bh=42DlMZ/oW4HLhFoIJCetdcfblbinqNDtbjQrvZGBme0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=PrDaFPl16+dUYVfNSWRpTD87yz4MK7/HdghB7ILX5xXggJN8vYLmcy4RQj7oE9weOCdcBzd1EZg476MST0VNTm2z3r/YGhIw0/+VWbtq1PKhfCTIEnPZWnJryrgw70ZRp0r4XDiQwz/h8bzHoZp9hMCEYHtSbHfUHW8eNSYr5z8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=GvsjlkoW; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="GvsjlkoW" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206706; x=1791742706; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=42DlMZ/oW4HLhFoIJCetdcfblbinqNDtbjQrvZGBme0=; + b=GvsjlkoWqX+zgP+tTee0MXcNRVBTPQkZKjOLBXZh33p44VICJNCiih6g + bdtLdnWwRkrJ2u2n2AVNyKIqQq+ELwCHQ1bUAIVe5B+Rq8F/WdKivkeVK + qCMdNHmRRRa8ijhdo6AEjjUZeHNS6/1dPU14KFq5zOdeXfuxJL5tGjlxb + ZtqhKFOWrFhhFPJwUw1KWb7C0rBkSGVoUeZH3ORagBu6Ud545g9bPF/M+ + p6sJSBNbnSNsdtDoZzzIKVmezgct+rLH0giyW0IcdjAUJlzYg6VsmVomk + Zm8UHf1s2hBr8fNdeC7UuXGFmty4d2atXckCM+YB8PsOqI0JwqlHCMSZ2 + A==; +X-CSE-ConnectionGUID: uKPzZGMbTiObyQydogOwGQ== +X-CSE-MsgGUID: QbxPW0yzQ4WA7VOf/APdAg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339729" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339729" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:25 -0700 +X-CSE-ConnectionGUID: GxY9AWlwTACW1S97eEsWGg== +X-CSE-MsgGUID: +oNXqS3kSkOTENG/ySm5FA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487208" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:25 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 07/19] sched/fair: Track LLC-preferred tasks per runqueue +Date: Sat, 11 Oct 2025 11:24:44 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +For each runqueue, track the number of tasks with an LLC preference +and how many of them are running on their preferred LLC. This mirrors +nr_numa_running and nr_preferred_running for NUMA balancing, and will +be used by cache-aware load balancing in later patches. 
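A minimal user-space model of the per-runqueue accounting described above is sketched below: one counter for tasks with any LLC preference and one for tasks already sitting on their preferred LLC, updated symmetrically on enqueue and dequeue. The struct and function names are invented for the example.

    #include <stdio.h>

    struct rq_model {
        unsigned int nr_llc_running;       /* tasks with any LLC preference  */
        unsigned int nr_pref_llc_running;  /* tasks already on preferred LLC */
        int llc;                           /* LLC this runqueue belongs to   */
    };

    struct task_model { int preferred_llc; };

    static void account_enqueue(struct rq_model *rq, const struct task_model *p)
    {
        rq->nr_llc_running      += (p->preferred_llc != -1);
        rq->nr_pref_llc_running += (p->preferred_llc == rq->llc);
    }

    static void account_dequeue(struct rq_model *rq, const struct task_model *p)
    {
        rq->nr_llc_running      -= (p->preferred_llc != -1);
        rq->nr_pref_llc_running -= (p->preferred_llc == rq->llc);
    }

    int main(void)
    {
        struct rq_model rq = { .llc = 1 };
        struct task_model on_pref  = { .preferred_llc = 1 };   /* prefers this LLC  */
        struct task_model off_pref = { .preferred_llc = 3 };   /* prefers another   */
        struct task_model no_pref  = { .preferred_llc = -1 };  /* no preference     */

        account_enqueue(&rq, &on_pref);
        account_enqueue(&rq, &off_pref);
        account_enqueue(&rq, &no_pref);
        printf("llc=%u pref=%u\n", rq.nr_llc_running, rq.nr_pref_llc_running);  /* 2 1 */

        account_dequeue(&rq, &off_pref);
        printf("llc=%u pref=%u\n", rq.nr_llc_running, rq.nr_pref_llc_running);  /* 1 1 */
        return 0;
    }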
+ +Signed-off-by: Tim Chen +--- + kernel/sched/core.c | 12 +++++++++++ + kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 +++++++ + 3 files changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 79d15e904d12..5940756e2da3 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -529,6 +529,18 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++#ifdef CONFIG_SMP ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++#else ++int task_llc(const struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d6167a029c47..fd315937c0cf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1235,6 +1235,24 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running += (p->preferred_llc != -1); ++ rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running -= (p->preferred_llc != -1); ++ rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1306,6 +1324,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); + } + ++static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { +@@ -1347,8 +1367,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) ++ /* task not on rq accounted later in account_entity_enqueue() */ ++ if (task_running_on_cpu(rq->cpu, p) && ++ p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1497,6 +1522,15 @@ void init_sched_mm(struct task_struct *p) + work->next = work; + } + ++void reset_llc_stats(struct rq *rq) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running = 0; ++ rq->nr_pref_llc_running = 0; ++} ++ + #else + + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, +@@ -1506,6 +1540,11 @@ void init_sched_mm(struct task_struct *p) { } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} ++ ++void reset_llc_stats(struct rq *rq) {} + #endif + + /* +@@ -3999,6 +4038,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); ++ account_llc_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } + cfs_rq->nr_queued++; +@@ -4010,9 +4050,14 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct 
sched_entity *se) + update_load_sub(&cfs_rq->load, se->load.weight); + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ account_llc_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + cfs_rq->nr_queued--; ++ ++ /* safeguard to clear the cache aware data */ ++ if (!parent_entity(se) && !cfs_rq->nr_queued) ++ reset_llc_stats(rq_of(cfs_rq)); + } + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index b448ad6dc51d..3ab64067acc6 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1098,6 +1098,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +@@ -1952,6 +1956,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++void reset_llc_stats(struct rq *rq); ++int task_llc(const struct task_struct *p); ++ + static inline void + queue_balance_callback(struct rq *rq, + struct balance_callback *head, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch b/sys-kernel/gentoo-sources-6.17/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch new file mode 100644 index 0000000..18dc0f7 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch @@ -0,0 +1,194 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8D1D3284688 + for ; Sat, 11 Oct 2025 18:18:26 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206708; cv=none; b=W6A0Asy9e3NNDRL2ti9BvFY1go+vAlduaKJd1rmOWRr4k4IHRIEpHNJhix4g/v1mdJgDI06CWQ3sQC5YxuLOry9f66mT2W5iUkNoO1AMOa7iJYVMhxygC7dgS1riRk+Xr61GHZrfTq3glOqKoHqMJR1ChGEEIDFSijs9KJo91LU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206708; c=relaxed/simple; + bh=qUQDFYZ38LNpr9WfzaoIX7ySGKszvby265gkxQF4WK0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=uzr/dGrFdG1v5FdOJ/f9StnRIpzjJ5uOjWV+sYvWDeYE/dxtVTZG5FXWR8UqlK4jv7ZYYOlRDJRmdwLszrh1cbzNE43kw7ueGEnBAbSwzUyXo12aLw3ckNHZHHjqr9uTbTYz7GDrN3J5K862edN4cdJHoI9buyHUDzdCkXfIheE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MiTdX6Q6; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MiTdX6Q6" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206707; x=1791742707; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qUQDFYZ38LNpr9WfzaoIX7ySGKszvby265gkxQF4WK0=; + b=MiTdX6Q6R/zAjqSeS2bqz6JnSO+lVjbu/CGoRS4W48TnANXSK7FbeFq8 + HIHNTysTrwhHCzP1gtYr6N2x0eFio/feVeyFBD5UytM6ahWF0SC67agMj + jWOkCg+WyPpJSmb2V4GE3mePGb9vm7kjvgiTp1tcN15ClNGhVOTqusLqF + ueDZKLr7dTfEr95oP3PXRNzKFZfqVSGN5aLDywe826XmjT29nykVCoMh+ + U9I8MAfHqzZxWLRDx+EC8+DhJZRsWw9B7dXqvyz67FsBnLG+HHYrAB479 + +0mKNo9XBbRlGAtlUlqUTEvej+mP00q1dndiGmLH/nY7e+wci1WK/1VQo + g==; +X-CSE-ConnectionGUID: e2RK1jGJT9eTlAZZ8FMWJQ== +X-CSE-MsgGUID: se6P+xZrTfOL+/m4zXf2xg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339748" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339748" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:26 -0700 +X-CSE-ConnectionGUID: Lb/G/3cTR6W6ajd8OWjDtQ== +X-CSE-MsgGUID: f0zaj3jsRd+gLA/rNNvR9A== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487214" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:26 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 08/19] sched/fair: Introduce per runqueue task LLC preference counter +Date: Sat, 11 Oct 2025 11:24:45 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Each runqueue is assigned a static array where each element tracks +the number of tasks preferring a given LLC, indexed from 0 to +NR_LLCS. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3 (indexed from 0 to NR_LLCS + +The load balancer can use this information to identify busy runqueues +and migrate tasks to their preferred LLC domains. 
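The short sketch below models that per-LLC counter array in plain C, assuming a small fixed NR_LLCS, and shows the enqueue/dequeue bookkeeping including the underflow guard used in the diff. The helper names are illustrative only.

    #include <stdio.h>

    #define NR_LLCS 4

    struct rq_model {
        unsigned int nr_pref_llc[NR_LLCS];   /* tasks preferring each LLC index */
    };

    /* Bump the slot for the task's preferred LLC index (-1 means no preference). */
    static void llc_enqueue(struct rq_model *rq, int pref_llc_idx)
    {
        if (pref_llc_idx >= 0 && pref_llc_idx < NR_LLCS)
            rq->nr_pref_llc[pref_llc_idx]++;
    }

    static void llc_dequeue(struct rq_model *rq, int pref_llc_idx)
    {
        /* guard against underflow, as the patch does */
        if (pref_llc_idx >= 0 && pref_llc_idx < NR_LLCS &&
            rq->nr_pref_llc[pref_llc_idx] > 0)
            rq->nr_pref_llc[pref_llc_idx]--;
    }

    int main(void)
    {
        struct rq_model rq = { 0 };

        llc_enqueue(&rq, 3);
        llc_enqueue(&rq, 3);
        llc_enqueue(&rq, 0);
        llc_enqueue(&rq, -1);   /* no preference: nothing counted */

        printf("prefer LLC3: %u, prefer LLC0: %u\n",
               rq.nr_pref_llc[3], rq.nr_pref_llc[0]);           /* 2, 1 */

        llc_dequeue(&rq, 3);
        printf("prefer LLC3 after dequeue: %u\n", rq.nr_pref_llc[3]);   /* 1 */
        return 0;
    }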
+ +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 35 +++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 1 + + 2 files changed, 36 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index fd315937c0cf..b7a68fe7601b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1235,22 +1235,51 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static inline int pref_llc_idx(struct task_struct *p) ++{ ++ return llc_idx(p->preferred_llc); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running += (p->preferred_llc != -1); + rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ ++rq->nr_pref_llc[pref_llc]; + } + + static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running -= (p->preferred_llc != -1); + rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ /* avoid negative counter */ ++ if (rq->nr_pref_llc[pref_llc] > 0) ++ --rq->nr_pref_llc[pref_llc]; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +@@ -1524,10 +1553,16 @@ void init_sched_mm(struct task_struct *p) + + void reset_llc_stats(struct rq *rq) + { ++ int i = 0; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running = 0; ++ ++ for (i = 0; i < max_llcs; ++i) ++ rq->nr_pref_llc[i] = 0; ++ + rq->nr_pref_llc_running = 0; + } + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 3ab64067acc6..b801d32d5fba 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1101,6 +1101,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int nr_pref_llc[NR_LLCS]; + #endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch b/sys-kernel/gentoo-sources-6.17/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch new file mode 100644 index 0000000..caf0c08 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch @@ -0,0 +1,143 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 896A92848A1 + for ; Sat, 11 Oct 2025 18:18:27 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206709; cv=none; b=OEtiMJ0EXsYmk/b2RpkCvrola+Tb5ZlnJVLLgRLqGiICx7t2qJcij9yw0SgiiThPPPTMrbIdFBAm4w8howvUGPAJFc0ItOZDXO+gwbi0GCrU/MRny5Tre78B7YMgEyxZMXkI05Eu0+fODpObrBBk2c09F8OXQKZ4o5hgptBzDK8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206709; c=relaxed/simple; + bh=PKtlvEc8jJzYEmRgIquRSV3KaK94Gb12wRaccvthO/I=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=J6bK9CIrnn+dpoeG8RJW1aH3SE1Yc7QYj7Dgh7cqTjdsd3fsWZdu3E2SAwDjyqT5ptCJzWnqjXDoxnW3sFv/aeRC7QnnQkB9bTzAgmfskcoHsp0hZI6c042fUlYpwgsk0j6PmWc4xM8hZNNktu5sqG8t6W1tVMFc+pGngTuF0j8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=n3R+hIU0; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="n3R+hIU0" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206708; x=1791742708; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=PKtlvEc8jJzYEmRgIquRSV3KaK94Gb12wRaccvthO/I=; + b=n3R+hIU0WDMCAOT74Si47T0DHUQFpP/mOPOr4EFjzfrMTg20mocMFVue + SPJYeD3u+HI/S8DzRBSopnypgjipAk03R2jKWcm5OSqY338iFWIhO44pH + Rkbh2OZ1rpYHNaif/qBdzoG/S0GRuxE4+p6SgnYPob1i1tRz5kFPtKtWI + Em/YtXT8s7M8i1lwEkDGhNlIAeWj5yl5FVsHoShyMoDnOs/ZKpz9fa1vH + yY+/JK9y5B5Rh8CVo9sz+iLl5gL/zxPW+ETtFRKayHPWInq1R4rGuUz8D + OVUSiTUoZeUSI+4YJPz+v9iatJmNEpwFlvZeVYR4+WsdGyv8IT5qlNl3i + g==; +X-CSE-ConnectionGUID: VcC/511LSz6QngP8mD/4Fw== +X-CSE-MsgGUID: cm5ykdK+Tza9czQo0iIcIQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339767" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339767" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:27 -0700 +X-CSE-ConnectionGUID: +fnFCaxeROy1X1/2M3UOCQ== +X-CSE-MsgGUID: cAIBkdx0SvqbyNLUptq1pw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487219" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:27 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 09/19] sched/fair: Count tasks prefering each LLC in a sched group +Date: Sat, 11 Oct 2025 11:24:46 -0700 +Message-Id: <00e5f2cb6eadc3738e33858d3c4563a0775ee1c0.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, tabulate the number of tasks on each runqueue +that prefer a given destination LLC in a sched group. + +For example, consider a system with 4 LLC sched groups (LLC0 to LLC3) +balancing towards LLC3. LLC0 has 3 tasks preferring LLC3, LLC1 has +2, and LLC2 has 1. LLC0, having the most tasks preferring LLC3, is +selected as the busiest source to pick tasks from. 
+ +Within a source LLC, the total number of tasks preferring a destination +LLC is computed by summing counts across all CPUs in that runqueue. For +instance, if LLC0 has CPU0 with 2 tasks and CPU1 with 1 task preferring +LLC3, the total for LLC0 is 3. + +These statistics allow the load balancer to choose tasks from source +sched groups that best match their preferred LLCs. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b7a68fe7601b..cbd1e97bca4b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10399,6 +10399,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc[NR_LLCS]; ++#endif + }; + + /* +@@ -10891,6 +10894,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled()) { ++ int j; ++ ++ for (j = 0; j < max_llcs; ++j) ++ sgs->nr_pref_llc[j] += rq->nr_pref_llc[j]; ++ } ++#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch new file mode 100644 index 0000000..4bcffad --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch @@ -0,0 +1,187 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id F13BA28505C + for ; Sat, 11 Oct 2025 18:18:28 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206711; cv=none; b=EhBerRhJhQXPW7xGyw0P5bxJnRZLdUKLIQ12NKKqVw4ZWFGkcALuZ8VykNWnycAafmMkb5kBWaZT15xr3ZuPia1hqPYipqCAVEd34Wn9NgZ7h0Lqr4/FQP1HOI9Yp9naliJ5jjs5uaj5L1/4fJBsGwV0wle3JatN24KLVnEBxK8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206711; c=relaxed/simple; + bh=QoFubbb8wiPjhz5y3pWF+17tV+P2bIxeqL8wpFi1nfk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=XF3a1nw/8EN0FU+PNi1yIJ/227PxHRBRy24uDZNEkqQuRuIG35Ap7GIvbGG+L1n9ZlEPV0A8eM5UvEqTGNXZktaeA+OJjX4avu9hw9uu6rqowoIWWNlLa6/0iuozmn5jhIZJJqDbWB7j1stg+x51fnwnSbNrDkb2H27S3usCnzQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Loa6o7d1; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Loa6o7d1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206709; x=1791742709; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=QoFubbb8wiPjhz5y3pWF+17tV+P2bIxeqL8wpFi1nfk=; + b=Loa6o7d1Mzs3ouslW83UWTdxmyggGuWTcpizCbNq+GcghqOrvTfXSRIV + 0EP9sedHVH3VdKCqAQHV/ZX3VHfUXCRKy9+NcdVchFLL8bKi/9buFRwhw + ZWmkcnGopsf975TA51MaL7sh2sNrOAvPuHmiA1plKNFBBesobcOlf5xbr + aZ9W/S+Mv3Ykf28JPDwOIYzvtKZi5pCgwvqz5wqJHrujBfUq//kuxX1xD + 44PevqjxkAnPNbnm/C3CdQgNXiNta5xW/ZKmACOzIkYXaOsL8kl9jvdQl + 4VJ6pV7RaGBpMqmBXGMhRqdKmN0HSByZ1kvmH46v45jRNYG2/U+7kgbrO + A==; +X-CSE-ConnectionGUID: 7OsmkTE2T2eIFyDjRKp/ig== +X-CSE-MsgGUID: oqLf97jbSIOB+8Rk4LLqqA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339788" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339788" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:28 -0700 +X-CSE-ConnectionGUID: jHLQbWxOTR2E4C2/k5j7Wg== +X-CSE-MsgGUID: sQhO8wOTQIuj4/5Og2eBgw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487222" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:27 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 10/19] sched/fair: Prioritize tasks preferring destination LLC during balancing +Date: Sat, 11 Oct 2025 11:24:47 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, first check for tasks that prefer the +destination LLC and balance them to it before others. + +Mark source sched groups containing tasks preferring non local LLCs +with the group_llc_balance flag. This ensures the load balancer later +pulls or pushes these tasks toward their preferred LLCs. 
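As a simplified model of that classification step, the sketch below flags a non-local source group for LLC balancing when it holds tasks preferring the destination LLC. dst_has_room() is a placeholder for the capacity and imbalance test that can_migrate_llc() performs in the real patch, and all names here are illustrative.

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_LLCS 4

    struct sg_stats {
        unsigned int nr_pref_llc[NR_LLCS];  /* tasks in the group preferring each LLC */
        bool group_llc_balance;             /* set when LLC-aware balance should run  */
    };

    /* Placeholder for can_migrate_llc(): assume the destination always has room. */
    static bool dst_has_room(int dst_llc)
    {
        (void)dst_llc;
        return true;
    }

    /* Model of the llc_balance() check: flag a non-local source group when it
     * holds tasks that prefer the destination LLC and migration is permitted. */
    static void classify_group(struct sg_stats *sgs, int dst_llc, bool local_group)
    {
        sgs->group_llc_balance = !local_group &&
                                 sgs->nr_pref_llc[dst_llc] > 0 &&
                                 dst_has_room(dst_llc);
    }

    int main(void)
    {
        struct sg_stats remote = { .nr_pref_llc = { 0, 0, 0, 3 } };
        struct sg_stats idle   = { .nr_pref_llc = { 0, 0, 0, 0 } };

        classify_group(&remote, 3, false);
        classify_group(&idle, 3, false);

        printf("remote group llc_balance=%d, idle group llc_balance=%d\n",
               remote.group_llc_balance, idle.group_llc_balance);   /* 1, 0 */
        return 0;
    }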
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 41 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cbd1e97bca4b..af7b578eaa06 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9822,8 +9822,7 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + else + return mig_unrestricted; + +- return can_migrate_llc(src_cpu, dst_cpu, +- task_util(p), to_pref); ++ return can_migrate_llc(src_cpu, dst_cpu, task_util(p), to_pref); + } + + #else +@@ -10394,6 +10393,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10849,11 +10849,45 @@ static void record_sg_llc_stats(struct lb_env *env, + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); + } ++ ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. ++ */ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain *child = env->sd->child; ++ int llc; ++ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ /* only care about task migration among LLCs */ ++ if (child && !(child->flags & SD_SHARE_LLC)) ++ return false; ++ ++ llc = llc_idx(env->dst_cpu); ++ if (sgs->nr_pref_llc[llc] > 0 && ++ can_migrate_llc(env->src_cpu, env->dst_cpu, 0, true) == mig_llc) ++ return true; ++ ++ return false; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + } ++ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} + #endif + + /** +@@ -10954,6 +10988,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + record_sg_llc_stats(env, sgs, group); ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (!local_group && llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch new file mode 100644 index 0000000..ee39ef0 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch @@ -0,0 +1,184 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org 
(Postfix) with ESMTPS id 6E3802857E0 + for ; Sat, 11 Oct 2025 18:18:29 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206711; cv=none; b=t2IkYrrS4OEW0rLnZ4Ph2aLp/ob7UBcUobZQPFlHPmpcJEG5m0pUt/86mOssLKuYpjefjiUDrjFelfxhjAxq8hkNJqtOEMJPbTz+zzT3SsVZRdrqKE8v+5YoRbLqXRQPim2ll3DhWUtUyVjcOo+wuodh/CEa974mbGOLa7mTgCc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206711; c=relaxed/simple; + bh=XiIsNrTg0GfmfpcWJwni6hIdWkEEq9nbQ2y28gcjQcw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=CFlB5zhIcHUsbSOo/sD1pZdSFz7frR0zFFzgb5/20MqZiItU17WC0G8ifB7ANEAoWHl+sZ1UBTS2HXkckShm7SoSJJXvPBbw6XxQCBJK6yrElYIzS1CzXKAx7vBmkFFghPyfHOK4JpsmMAKYxqatpcWaHZwO7N1+tqHPYDwlFpo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Y9YkqrBb; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Y9YkqrBb" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206709; x=1791742709; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=XiIsNrTg0GfmfpcWJwni6hIdWkEEq9nbQ2y28gcjQcw=; + b=Y9YkqrBbsakXirsuA3GK7ppNmtxnJk2cm0iimpzRLvMdIlTwXGPf3Jxq + CO6EwYbc/Esxx5TDgaH0h7SVW6eQY5e38xqt9oEwqeMZQtQ13URaPfC2Q + Mwk/v0qwxo5jXbC8xa2O9JpbH1ZyVCsabZmLtbPS2e8WfQbQS4lgRoeof + RbwLkRXbWC69JnwGxh3aUM7ZF9q8ziMLuIK7nYhL3utheouiHtWkbs+nW + RBMmwNo592e9Wh6g7Ht+Vdc051U+njdgUo7aZRqY6DlKoIGZaJJSG2c0W + jAF73DWLcSoTQT2Ii9M9dPOTvOCcojIDgIVpILvlasXm0wG4u+s+OJFGn + Q==; +X-CSE-ConnectionGUID: bcFBDLOoTw6TYukUkbI3wQ== +X-CSE-MsgGUID: 0WEdTBqUR0WG7HuYHYySDg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339807" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339807" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:29 -0700 +X-CSE-ConnectionGUID: teKUgYrNS8ayzrTmALf01w== +X-CSE-MsgGUID: OBuR3uU9Q8qKO64uzC8h4Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487230" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:28 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 11/19] sched/fair: Identify busiest sched_group for LLC-aware load balancing +Date: Sat, 11 Oct 2025 11:24:48 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +The load balancer selects the busiest sched_group and migrates tasks +to less busy groups to distribute load across CPUs. + +With cache-aware scheduling enabled, the busiest sched_group is +the one with most tasks preferring the destination LLC. If +the group has the llc_balance flag set, cache aware load balancing is +triggered. + +Introduce the helper function update_llc_busiest() to identify the +sched_group with the most tasks preferring the destination LLC. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++++- + 1 file changed, 38 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index af7b578eaa06..8469ec528cb1 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10877,6 +10877,23 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ int idx; ++ ++ /* Only the candidate with llc_balance needs to be taken care of */ ++ if (!sgs->group_llc_balance) ++ return false; ++ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. ++ */ ++ idx = llc_idx(env->dst_cpu); ++ return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx]; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +@@ -10888,6 +10905,13 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + { + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + /** +@@ -11035,6 +11059,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, + sds->local_stat.group_type != group_has_spare)) + return false; + ++ /* deal with prefer LLC load balance, if failed, fall into normal load balance */ ++ if (update_llc_busiest(env, busiest, sgs)) ++ return true; ++ ++ /* ++ * If the busiest group has tasks with LLC preference, ++ * skip normal load balance. ++ */ ++ if (busiest->group_llc_balance) ++ return false; ++ + if (sgs->group_type > busiest->group_type) + return true; + +@@ -11942,9 +11977,11 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. ++ * Also do so if we can move some tasks that prefer the local LLC. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch new file mode 100644 index 0000000..e9edb7a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch @@ -0,0 +1,185 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 99E5F28642E + for ; Sat, 11 Oct 2025 18:18:30 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206712; cv=none; b=CcfwsAyp1OHHqY4mNPYPcN6bUrl09ci4+a/v8FtP9azgYQzfS6lmRwWajeweUonIlhrYSa3k3Uk+3iau8s00TJMHIq9pc69gZThbuJO24GmjHBtcGot6LsPzytIaUPaB8oNg5fj064BJxFXz948iENpfk/rfsglOKxpcJkX9wG0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206712; c=relaxed/simple; + bh=y1sB/ng56N+mvnxojmgS/eclR6zFHdcgY5tqVpcDUNQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ScMEWl2DOAQMR5u9bpXgwKEadirbrSNG1X0vBv1Qm5M7qzeQRW6zyzR/0wZ49Stn9ftQ28uc0NLCvRH6mwbydhKFD3kpg3JgxWk9NBUU+Qnt+t7g3WQ/pDx7wFSEDUiofgdlic68Cqje1J43vJo7n57s1boIMbDvvtchvPGoTXM= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=WEVJOxO1; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="WEVJOxO1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206711; x=1791742711; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=y1sB/ng56N+mvnxojmgS/eclR6zFHdcgY5tqVpcDUNQ=; + b=WEVJOxO1Uy4x+GEHukYgK7cjQhJ+ZPzArevJFx6r0uwjLvVHXCsCVf0d + U5oZ9qGbRNsQ961+swsJygnl0Xp69gaKKJFDcVvaKlw28OYtLWeCcKxy5 + 4DN0Azrktm8AXYGwp3idVSw3VynSmNbW2dqVmCfWn3Np2iYv1w7hTpRfb + SetW2PMNCXc4Fk5w1ve3GEJ9Bax25e3mUvpabN2XIbAEnlZu4rHyR3ovD + 1WzBrpK45tvGmB0FKRXCfsKbMFF1KdXCgjW4lAJ2KU2k2bhxv6SPWDjA8 + 0qVm8erW2mgP7HqJHVa71uZn8ehzzZAPeMVO4wyBDdQns/j8tkr67uAC6 + w==; +X-CSE-ConnectionGUID: osVAgR9XSEi43ydURnxquA== +X-CSE-MsgGUID: sgSrXMaOTSCJRnEynSu6Vg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339827" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339827" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:30 -0700 +X-CSE-ConnectionGUID: 
U/XiMYdrQLyr4smIn6sKwQ== +X-CSE-MsgGUID: iE4re5OqR+eOwHWBOdmfKA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487233" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:29 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 12/19] sched/fair: Add migrate_llc_task migration type for cache-aware balancing +Date: Sat, 11 Oct 2025 11:24:49 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce a new migration type, migrate_llc_task, to support +cache-aware load balancing. + +After identifying the busiest sched_group (having the most tasks +preferring the destination LLC), mark migrations with this type. +During load balancing, each runqueue in the busiest sched_group is +examined, and the runqueue with the highest number of tasks preferring +the destination CPU is selected as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 8469ec528cb1..bec6354d7841 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9504,7 +9504,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10082,6 +10083,10 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11733,6 +11738,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12041,6 +12055,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12149,6 +12167,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_idx(env->dst_cpu); ++ if (!cpus_share_cache(env->dst_cpu, rq->cpu) && ++ busiest_pref_llc < rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + if (busiest_nr < 
nr_running) { + busiest_nr = nr_running; +@@ -12331,6 +12359,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.17/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch new file mode 100644 index 0000000..50e470a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch @@ -0,0 +1,208 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CE0E286D56 + for ; Sat, 11 Oct 2025 18:18:31 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206713; cv=none; b=GHTSZiD43H1BP9udGQWGRTSdycj0dFbwOFNYssvdtvgDyjDEnOhEZuZ3tF7d4Oxq4KjVh/REHJdk8e5qmA0nk91pFvjTrD7ew0sadW9X2+TjejBiKi+Z4u/nZlJeGc29rI3I01ytNZfNGLLusPB2P/4mVx6bLIuv9bhIea7/KOQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206713; c=relaxed/simple; + bh=4nb/OF/m6vG5cWGZuJFatpxTHqyXyCOJlbLckp109KU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Z43NaTPGAIlscL0L7fDhRwzngl1+8YayCbuXKnJJO/leht3IttqnVKWti2tJx4O3Ad4+Bxa7ijhsxQg7lysYNstcyC73l5FTr0P11m80kqmUiNRrC4pt99E80BCBIbFo2SatFJnTKT4Q1ux117UKVwuy6P9Rh922Z1naN6x4Wgc= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=JdkwbeJq; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="JdkwbeJq" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206711; x=1791742711; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=4nb/OF/m6vG5cWGZuJFatpxTHqyXyCOJlbLckp109KU=; + b=JdkwbeJqpvNLxxR/C5J1ZH6Sc5bkBzINB0NUowykgcoSMh+IrKTz9SEs + 3TI4U2WqUZ4fGfcXVpbX1N2vbaAfyQUv4dhr3bMb1WSUcBz4dSrMfVdBf + Gdlpc/LwIyV72Eyt8t+mfF176Y/vv2GuGHN9WuXsK8/fBvzDMB20NsZLB + QBg0I+M7oRSQsaiygrqnGBFHiCS3p2JbXoqghWgigPrv6u1iqo8HXxcYs + HtDa1JUkhRKqPvvWxmzbfQzJYS+Coi/HVD3eewtzP+ILLi56XMzOKLHfR + iZqHJ/1cq2a50rc7YQNpk4EmPQ7vkE0qnNCf9o39KpjsRQh5qnu3HCaul + A==; +X-CSE-ConnectionGUID: VRcX2cnOQSeMAY0e8g4K3w== +X-CSE-MsgGUID: SPoQqM3DQk6EyvXMnqQjmg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339847" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339847" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:31 -0700 +X-CSE-ConnectionGUID: 
pKVZhrKMR8K6LBYqzMOqAA== +X-CSE-MsgGUID: CK8cGt1oRtCxjPN4nS/YdA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487238" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:30 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 13/19] sched/fair: Handle moving single tasks to/from their preferred LLC +Date: Sat, 11 Oct 2025 11:24:50 -0700 +Message-Id: <231864b303906a60491bbb9eb7b2e3f083bff248.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If the busiest runqueue has only one task, active balancing may be +invoked to move it. However, before migration, check whether the task +is running on its preferred LLC. + +Do not move a lone task to another LLC if it would move the task +away from its preferred LLC or cause excessive imbalance between LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 59 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index bec6354d7841..19ba9c1b9a63 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9826,12 +9826,53 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + return can_migrate_llc(src_cpu, dst_cpu, task_util(p), to_pref); + } + ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return false; ++ /* ++ * All tasks prefer to stay on their current CPU. ++ * Do not pull a task from its preferred CPU if: ++ * 1. It is the only task running there; OR ++ * 2. Migrating it away from its preferred LLC would violate ++ * the cache-aware scheduling policy. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { ++ unsigned long util = 0; ++ struct task_struct *cur; ++ ++ if (env->src_rq->nr_running <= 1) ++ return true; ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(env->src_rq->curr); ++ if (cur) ++ util = task_util(cur); ++ rcu_read_unlock(); ++ ++ if (can_migrate_llc(env->src_cpu, env->dst_cpu, ++ util, false) == mig_forbid) ++ return true; ++ } ++ ++ return false; ++} + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) + { + return false; + } ++ ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return false; ++} + #endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
+@@ -12247,6 +12288,9 @@ static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12266,7 +12310,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +@@ -12711,9 +12756,20 @@ static int active_load_balance_cpu_stop(void *data) + goto out_unlock; + + /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; ++ if (busiest_rq->nr_running <= 1) { ++#ifdef CONFIG_SCHED_CACHE ++ int llc = llc_idx(target_cpu); + ++ if (!sched_cache_enabled()) ++ goto out_unlock; ++ ++ if (llc < 0) ++ goto out_unlock; ++ /* don't migrate if no task prefers target */ ++ if (busiest_rq->nr_pref_llc[llc] < 1) ++#endif ++ goto out_unlock; ++ } + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch new file mode 100644 index 0000000..2839724 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch @@ -0,0 +1,201 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 417D42874EA + for ; Sat, 11 Oct 2025 18:18:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206714; cv=none; b=P5dnBcm/QdLKKHwOdHn/8WuPNdfAOl/PRiR2K2uOEI4cNFkN+3QA9gv1poGLydzEv/LcejqEay5DpC4q4pFVQXAYgNISmcWGnnkZt2WJ1RNwtLhNEUFXZhx40ubXDsBOhhphD04ToZpipNp3wabmP7EXcOk+GqqMg1ATyjn68eQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206714; c=relaxed/simple; + bh=7mAc5fCb/Yw4KmiNv5+1hXXuEie+xn7lqzFvEVfM5lI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=K4DMRiSFceKlJzje7FYPpzQtMciS8INZnGsYmfTeHw6oUtErbWyqEJzurxfkaj/0e2BYrqNZ34Rdy0dGMjqeQWLbOVlQosaArztC6x5+Kes0uifkkB7Pj+Ot9ll7+ydHo4UrJOvNc7oKS/beZOgPG9FPfh7UCSuuvvMEgE2IUTo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=H3CAEs3w; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="H3CAEs3w" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206712; x=1791742712; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=7mAc5fCb/Yw4KmiNv5+1hXXuEie+xn7lqzFvEVfM5lI=; + b=H3CAEs3wXo6bis/3Dkhtptw+Q7vtaAFDMqK8g5XXqpoTWnnoOviYRAT9 + w6Ikfty6wJNr1MlZJ1pp/FTRrzxJpmwm8JYX2yaBiDeoJDyx/agfVsZPY + MklgYKNASSHcEaoYoXP3gsqWfSwXldul6nD1Cye5tqr86XkWjK3gJK3C2 + XHWF6ABgRrpsZ6WaBAuzrKten6FRqGkbA1i+aWIRwXqoWsGPVsgAC8AT4 + v51P3tS4APRavdFpCNPn2xNzJPdUZAW7dgqXMB0AkpdRadIZ72DIu+BFu + J9oJpUAr+gFfhWThceV6xrW/Bi4Emncs3GIHURfaahEgiLmzNa/UX2/Km + w==; +X-CSE-ConnectionGUID: L4/6SpgURcKa2MOypuG0Tw== +X-CSE-MsgGUID: s8jp3cejRyWqoo8mO6QU6Q== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339866" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339866" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:32 -0700 +X-CSE-ConnectionGUID: IV2+5+btQ3GmLWn4UVfGIA== +X-CSE-MsgGUID: Ti8qIpzsSjywiCl630piRA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487243" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:31 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 14/19] sched/fair: Consider LLC preference when selecting tasks for load balancing +Date: Sat, 11 Oct 2025 11:24:51 -0700 +Message-Id: <26e7bfa88163e13ba1ebefbb54ecf5f42d84f884.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Currently, task selection from the busiest runqueue ignores LLC +preferences. Reorder tasks in the busiest queue to prioritize selection +as follows: + + 1. Tasks preferring the destination CPU's LLC + 2. Tasks with no LLC preference + 3. Tasks preferring an LLC different from their current one + 4. Tasks preferring the LLC they are currently on + +This improves the likelihood that tasks are migrated to their +preferred LLC. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 19ba9c1b9a63..0fafbfedb21d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10036,6 +10036,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. 
++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_cache_enabled()) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". +@@ -10044,7 +10106,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10063,6 +10125,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch b/sys-kernel/gentoo-sources-6.17/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch new file mode 100644 index 0000000..0a36e52 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 118912877EE + for ; Sat, 11 Oct 2025 18:18:33 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206714; cv=none; b=Sgvo8eIzN/unUNmW2/+OixP9udhyNkmi4AZEZzDVPWK1PLnNoYAhA0isU11HgcQC7ul1i5aP8jgG2uHE7Cy8Asrdz+Y08qynhym2Y4X0S+xgTgNOkVzp41IhyzMl092I4cMjY7ziOvFvK6idsHZ/FR3VwQydRvg8d5aWYp64rpE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206714; c=relaxed/simple; + bh=HpgwI4hDixtTD/XOc2H/Ob5dO6FbxoYzkh6tbeLAGPw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=TyYasSHPSqMZlN51+4bjWq8Z7cAg9IakiA1ZSJzbhlx8KJc6/UktRCAzZaEkZtQ3d+2B5EUSEDoefcCsbcoCPxFRSCAzN4VD9lBw94R0aIvRHbenlFVxgsvkmUCy9pzg5jZh5zHq/4CLUC+EDPmK622ZE8JNMYgUcZgPpxmosck= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Lw7L05el; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Lw7L05el" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206713; x=1791742713; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=HpgwI4hDixtTD/XOc2H/Ob5dO6FbxoYzkh6tbeLAGPw=; + b=Lw7L05elkbwdOCxozPfNxC8qRTe1i2iYshjZC2z6ZaIHRqDa3MmTXW5p + zHG6+auYcjgaRRcY16sdCyIbi7MCQxhd1rhIdaLh0bWrCs4ImE5P1VD8f + E+1GcTkJVgNbzLAR5f6+G7KZsA/sstlz5uIOTmFm5WpAXCY87MaYrAMAn + AO+uoYvLDh1ME4/gSK2T7C+P7K4lX/jQuif20ZGD72jW5wnQNob4g08JW + Z2MLtsd0WXxmCEXIKBfa0mtDIGY2FVs5/FvLd831/0grQYgT8vo1t80Kc + spuxB5OU6NgYwRfX7rKRRiLNfth6YUS68l+iwJeWbASwMAqE6PVWIEmJu + Q==; +X-CSE-ConnectionGUID: eDbtoCrOQHyIZtGmIsjSMQ== +X-CSE-MsgGUID: +ry6w/ChQZGrUwocr7gK9A== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339887" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339887" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:33 -0700 +X-CSE-ConnectionGUID: 1LsFjRblTkmkQu9Zwyc6pQ== +X-CSE-MsgGUID: 7olPURVrSrW53T9U5Kz7mw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487247" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:32 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 15/19] sched/fair: Respect LLC preference in task migration and detach +Date: Sat, 11 Oct 2025 11:24:52 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During the final step of load balancing, can_migrate_task() now +considers a task's LLC preference before moving it out of its +preferred LLC. + +Additionally, add checks in detach_tasks() to prevent selecting tasks +that prefer their current LLC. 
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0fafbfedb21d..65ff7c306a2f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9801,8 +9801,8 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + * Check if task p can migrate from src_cpu to dst_cpu + * in terms of cache aware load balance. + */ +-static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, +- struct task_struct *p) ++static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) + { + struct mm_struct *mm; + bool to_pref; +@@ -9969,6 +9969,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled() && ++ can_migrate_llc_task(env->src_cpu, env->dst_cpu, p) == mig_forbid) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10227,6 +10233,20 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + break; + ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Don't detach more tasks if the remaining tasks want ++ * to stay. We know the remaining tasks all prefer the ++ * current LLC, because after order_tasks_by_llc(), the ++ * tasks that prefer the current LLC are at the tail of ++ * the list. The inhibition of detachment is to avoid too ++ * many tasks being migrated out of the preferred LLC. ++ */ ++ if (sched_cache_enabled() && detached && p->preferred_llc != -1 && ++ llc_id(env->src_cpu) == p->preferred_llc) ++ break; ++#endif ++ + continue; + next: + if (p->sched_task_hot) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.17/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch new file mode 100644 index 0000000..88914b1 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 16F11288C02 + for ; Sat, 11 Oct 2025 18:18:34 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206715; cv=none; b=msFA8TC41v9oEIuXxPkwmaUs9Guya5oz4k0g+kGWjFkx5t6zbq1fE/hqkiyOdPEhHS8cUTNX+aARYrbMu+YFzDRmUGhKnyOYkbiJD/UnEPwa2emEYG8RrqlU6lMxzm4wiDBJLxqnLLfKGSPXyWwXrM560Mia1tgl6K9uKsnEgFE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206715; c=relaxed/simple; + bh=Tofl2LDuzdO5QbpLjDZ3W55iV9tdiYron5fWReifyPw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=pIYwSq6151qmo6KEbEr6KofmYMtBvZvl9VphDwsqPX3hTLP897hu66I6LFuek1xE2EdzY5hJ64po/YPEKcNn99hwknIHDQx8uamJBxPh8I2WV7/JQ8MBTxUclp3YSgTWiAJSRjNR9EBM7PkdUJqtsU69m11ei/HsbibGYzaOOwk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass 
smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=TQVK1fUD; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="TQVK1fUD" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206714; x=1791742714; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Tofl2LDuzdO5QbpLjDZ3W55iV9tdiYron5fWReifyPw=; + b=TQVK1fUDtuFQmuxj0h/H6B3W/u2cJ2GkGOiUH7Lt/dRtHWxu09UqD683 + GE9GznGGwwF/Ima7vRS1ctHwsI6Xpw4SijdVGn66soleS5/ydNjcGaSKg + ygudPZpTfNaQrBfM0sFvdqPmdg50LMShstL+8pxYWf160UzvXjzOECyon + VuIxmxxlfPMnN2wMIOyjbQiDBL/LsnnHbGArR4IFK3zGWts6KMkvPzkiR + EwWOPnHMmqriXFYLM8wcDjSverDfcRP6MlQsXXusYG7bdxJhhuwymEiBB + InFNxWr5/xEksEDfouM5jLx/TVwLUkF4o8vAQ8HbkYgDi57JrvvbuA4Mr + g==; +X-CSE-ConnectionGUID: dN0cE9kLQ3yeKYNXwwT83A== +X-CSE-MsgGUID: KDX51V55RvaEAIpyi8Kcxg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339905" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339905" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:34 -0700 +X-CSE-ConnectionGUID: SHt2rwkJR6+JML7EmRAXVw== +X-CSE-MsgGUID: 7457bVysSBes9Wezrb15EQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487250" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:33 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 16/19] sched/fair: Exclude processes with many threads from cache-aware scheduling +Date: Sat, 11 Oct 2025 11:24:53 -0700 +Message-Id: <637cdb8ab11b1b978d697ed744cc402d32443ecc.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +A performance regression was observed by Prateek when running hackbench +with many threads per process (high fd count). To avoid this, processes +with a large number of active threads are excluded from cache-aware +scheduling. + +With sched_cache enabled, record the number of active threads in each +process during the periodic task_cache_work(). While iterating over +CPUs, if the currently running task belongs to the same process as the +task that launched task_cache_work(), increment the active thread count. + +If the count exceeds the number of CPUs in the process's preferred LLC, +sched_cache will avoid aggregating too many threads into a single LLC +domain. 
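+
+In outline (a sketch of the accounting only; the hunk below is the
+actual change, and cpu_rq()/update_avg() are existing scheduler helpers):
+
+	/* count runqueues whose current task belongs to this mm */
+	nr_running = 0;
+	for_each_cpu(i, cpus) {
+		struct task_struct *cur;
+
+		rcu_read_lock();
+		cur = rcu_dereference(cpu_rq(i)->curr);
+		if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+		    cur->mm == mm)
+			nr_running++;
+		rcu_read_unlock();
+	}
+	update_avg(&mm->nr_running_avg, nr_running);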
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/mm_types.h | 1 + + kernel/sched/fair.c | 14 ++++++++++++-- + 2 files changed, 13 insertions(+), 2 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 3ca557c2f36d..b307f81b2fde 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1031,6 +1031,7 @@ struct mm_struct { + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; ++ u64 nr_running_avg ____cacheline_aligned_in_smp; + #endif + + #ifdef CONFIG_MMU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 65ff7c306a2f..79d109f8a09f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1451,12 +1451,13 @@ static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, + + static void __no_profile task_cache_work(struct callback_head *work) + { +- struct task_struct *p = current; ++ struct task_struct *p = current, *cur; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; + int cpu, m_a_cpu = -1, cache_cpu, +- pref_nid = NUMA_NO_NODE, curr_cpu; ++ pref_nid = NUMA_NO_NODE, curr_cpu, ++ nr_running = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1497,6 +1498,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_occ = occ; + m_cpu = i; + } ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(cpu_rq(i)->curr); ++ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && ++ cur->mm == mm) ++ nr_running++; ++ rcu_read_unlock(); ++ + } + + /* +@@ -1540,6 +1549,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + mm->mm_sched_cpu = m_a_cpu; + } + ++ update_avg(&mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch b/sys-kernel/gentoo-sources-6.17/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch new file mode 100644 index 0000000..0bb796c --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch @@ -0,0 +1,170 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CBF228B400 + for ; Sat, 11 Oct 2025 18:18:35 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206717; cv=none; b=YyEz/CWTR29mSwIUaPFMfMePzkOh+JM5Sy6daDO5bi2qr7vVNV19xi6LQHHFuh3wAPmGhaJZO0psSS/hmmAhEm9YYTN/Jgc2pWxCyI+xWhQCLC7I/PnTVjCiCQif4wqMsrxoWCBWSb2OUxPbQQvBrskdsdNoyUkJX7OfjisrPEo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206717; c=relaxed/simple; + bh=1ZlncHncPiFtSdZrBk62iQ7LoAdWu/umRn9XHDFyiec=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=cDrry+jPMrDILm/r9QUVZNGIrsE561nMMRjz9ay5n5LBA0g4KQ5jFwtQhbKMvroO4a5axJHedJTHbl6aSfvc0uCnQwzJq+eaxxOqXVEOWsoi3zdhUNBrxg97Vqp+GrazIyVFmuyXj145vhjyv4Ug8nfP5dYxkUNSPkfjany2j50= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass 
smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=lrCuBiww; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="lrCuBiww" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206715; x=1791742715; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=1ZlncHncPiFtSdZrBk62iQ7LoAdWu/umRn9XHDFyiec=; + b=lrCuBiwwXTTaUUesVoUKShmqNypNMcjFctaFnNlL8Jy17kFhV1UkeZza + ZuX0GXcNA+d1mgjVrCdwx7TgVROgGBNK4U8k00nbzT6TvTcewZUk7QGtM + ze+FjZ8AcXNEy5AhOAJw/Pg8vbtTnZ1loNcqp57iteVrKQqHWUMDyfSYU + 8P+nCqWidGuZDOqQcaEjQH4wD2Jn2+QsEcLHNMZnZLw6R3C8jci7hl1aG + MGxs8mPuw6pSR4ah1MI8YVoYS5wwLulLaJK/V5D02tGg7pdRILUMNtqsB + x0389trQkin/UccLwrCAMIGVL3znx7/2JW/py3nOY6EKojcOWTOyEIt0N + Q==; +X-CSE-ConnectionGUID: WfwYlMtNQVe279pYYOUBnA== +X-CSE-MsgGUID: AjSkDrsURkOZNf5ZbyXbNQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339923" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339923" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:35 -0700 +X-CSE-ConnectionGUID: ezHUeA30SCiDTeB7wo76Nw== +X-CSE-MsgGUID: YeYwMr00ThmPUWDQc0+YAw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487255" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:34 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 17/19] sched/fair: Disable cache aware scheduling for processes with high thread counts +Date: Sat, 11 Oct 2025 11:24:54 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +If the number of active threads within the process +exceeds the number of Cores(divided by SMTs number) +in the LLC, do not enable cache-aware scheduling. +This is because there is a risk of cache contention +within the preferred LLC when too many threads are +present. 
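+
+As a worked example of the threshold above: with an LLC spanning
+16 CPUs and 2 SMT siblings per core there are 8 cores, so a process
+averaging more than 8 active threads is excluded. A minimal sketch,
+assuming the averaged thread count kept in mm->nr_running_avg
+(hypothetical helper, not part of the hunk below):
+
+	/*
+	 * threads * smt_siblings > llc_cpus  is equivalent to
+	 * threads > cores, e.g. 9 * 2 > 16 on an 8-core/16-CPU LLC.
+	 */
+	static bool too_many_threads(u64 avg_threads, unsigned int smt_nr,
+				     unsigned int llc_cpus)
+	{
+		return avg_threads * smt_nr > llc_cpus;
+	}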
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 27 +++++++++++++++++++++++++-- + 1 file changed, 25 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 79d109f8a09f..6b8eace79eee 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1240,6 +1240,18 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_nr(struct mm_struct *mm, int cpu) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = cpumask_weight(cpu_smt_mask(cpu)); ++#endif ++ ++ return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { + int pref_llc; +@@ -1385,10 +1397,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + + /* + * If this task hasn't hit task_cache_work() for a while, or it +- * has only 1 thread, invalidate its preferred state. ++ * has only 1 thread, or has too many active threads, invalidate ++ * its preferred state. + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || +- get_nr_threads(p) <= 1) { ++ get_nr_threads(p) <= 1 || ++ exceed_llc_nr(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1467,6 +1481,11 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + ++ if (get_nr_threads(p) <= 1) { ++ mm->mm_sched_cpu = -1; ++ return; ++ } ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9826,6 +9845,10 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + ++ /* skip cache aware load balance for single/too many threads */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ return mig_unrestricted; ++ + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch b/sys-kernel/gentoo-sources-6.17/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch new file mode 100644 index 0000000..b614ebc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch @@ -0,0 +1,246 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id EC76C28C03B + for ; Sat, 11 Oct 2025 18:18:35 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206717; cv=none; b=Gsl1htdC3Y7gJ6c3ywcidI/bSse8yUz6irs7/iI8KWV8rK5Ae95mMS6V4kE386ZpRZ64YVuSevPlw/gCCcGexlKVEsnpJGvjAMVnB6E3r26Sb5PQDcAwlJhgczIF0vnORN//ryXKWaGJdpyTLOi1a78IAJp76Mm0Cc1+XjF2rGQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206717; c=relaxed/simple; + bh=RPMOV8sl+NtxUoril7y0k9+l4VyTXFWW/dE0ALAKhro=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=hT0pK7n3dH+PZ5LGb1wwP8mkt2A7mUf1PCIeydCbZfOqNSbSKOwNGkxWRp3xr4aPGGtMx1eK61Xyt7h2YGrFfvdSUCRdLGNS2BunlIUuq8SqGdxHIK829DTsOGKBUbEPWJzj/d6E4FC8xaBfUuz6ugBEq47VdX8vEtuc1XwNFis= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=eyspbvXX; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="eyspbvXX" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206716; x=1791742716; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=RPMOV8sl+NtxUoril7y0k9+l4VyTXFWW/dE0ALAKhro=; + b=eyspbvXX6JZaLuPx9mP9k7AsJvdPNK3nA7Eu1n1ZjnjSeOqzlt2GEvCx + IIbDfmBwRBwDACT7YDm/5WXc6cuJLsO02ejx9sBoouGuZkUHl1/nB7J2O + i/e0/jcb0J2buciIQ3OvuzUhegT0ZaiQoJUm0tinSNJAyHv/2LoJKLT6E + 1wncP9sm103omUQyz2nIdzytwxhPLCdaTXt3R4jfGDM0HbNy1TRA5Ex3O + eiDpNNIsPslVI7J8r5viBVFuJFJIfp1atbqNY5xQ3zDqGyLEqF5FJMEHK + BGBjTx2SYuiM3sv4eOtztesROh9S4vRoc6wieYXXgBwOgrHLMjZB8S3CI + A==; +X-CSE-ConnectionGUID: 15+3n+5PQLG8KotmRvuIMw== +X-CSE-MsgGUID: Dj1GwDBDRtWs7ASTeti8MA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339940" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339940" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:35 -0700 +X-CSE-ConnectionGUID: O+LhKbX0QNyBYwHUAp0ttw== +X-CSE-MsgGUID: PfPvzLkATc2Ca+B9H6Dwng== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487259" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:35 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 18/19] sched/fair: Avoid cache-aware scheduling for memory-heavy processes +Date: Sat, 11 Oct 2025 11:24:55 -0700 +Message-Id: <00da49fd590b95baad0525660bda4c0ba178243d.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Prateek and Tingyin reported that memory-intensive workloads (such as +stream) can saturate memory bandwidth and caches on the preferred LLC +when sched_cache aggregates too many threads. + +To mitigate this, estimate a process's memory footprint by comparing +its RSS (anonymous and shared pages) to the size of the LLC. If RSS +exceeds the LLC size, skip cache-aware scheduling. 
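+
+A minimal sketch of that comparison (RSS taken as the MM_ANONPAGES
+plus MM_SHMEMPAGES counters; the hunk below is the actual check):
+
+	/* sketch: skip aggregation once the RSS covers the whole LLC */
+	static bool rss_exceeds_llc(unsigned long rss_pages, unsigned long llc_bytes)
+	{
+		return rss_pages * PAGE_SIZE >= llc_bytes;
+	}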
+ +Note that RSS is only an approximation of the memory footprint. +By default, the comparison is strict, but a later patch will allow +users to provide a hint to adjust this threshold. + +According to the test from Adam, some systems do not have shared L3 +but with shared L2 as clusters. In this case, the L2 becomes the LLC[1]. + +Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@os.amperecomputing.com/ + +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + include/linux/cacheinfo.h | 21 ++++++++++------ + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++--- + 2 files changed, 61 insertions(+), 11 deletions(-) + +diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h +index c8f4f0a0b874..82d0d59ca0e1 100644 +--- a/include/linux/cacheinfo.h ++++ b/include/linux/cacheinfo.h +@@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu, + + const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); + +-/* +- * Get the cacheinfo structure for the cache associated with @cpu at +- * level @level. +- * cpuhp lock must be held. +- */ +-static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int level) + { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + +- lockdep_assert_cpus_held(); +- + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) { + if (ci->info_list[i].attributes & CACHE_ID) +@@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) + return NULL; + } + ++/* ++ * Get the cacheinfo structure for the cache associated with @cpu at ++ * level @level. ++ * cpuhp lock must be held. ++ */ ++static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++{ ++ lockdep_assert_cpus_held(); ++ ++ return _get_cpu_cacheinfo_level(cpu, level); ++} ++ + /* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6b8eace79eee..46dfcd2a01b3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1240,6 +1240,38 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) ++{ ++ struct cacheinfo *ci; ++ unsigned long rss; ++ unsigned int llc; ++ ++ /* ++ * get_cpu_cacheinfo_level() can not be used ++ * because it requires the cpu_hotplug_lock ++ * to be held. Use _get_cpu_cacheinfo_level() ++ * directly because the 'cpu' can not be ++ * offlined at the moment. ++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 3); ++ if (!ci) { ++ /* ++ * On system without L3 but with shared L2, ++ * L2 becomes the LLC. 
++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 2); ++ if (!ci) ++ return true; ++ } ++ ++ llc = ci->size; ++ ++ rss = get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ ++ return (llc <= (rss * PAGE_SIZE)); ++} ++ + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { + int smt_nr = 1; +@@ -1402,7 +1434,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + get_nr_threads(p) <= 1 || +- exceed_llc_nr(mm, cpu_of(rq))) { ++ exceed_llc_nr(mm, cpu_of(rq)) || ++ exceed_llc_capacity(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1486,6 +1519,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + return; + } + ++ /* ++ * Do not check exceed_llc_nr() because ++ * the active number of threads needs to ++ * been updated anyway. ++ */ ++ if (exceed_llc_capacity(mm, curr_cpu)) ++ return; ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9845,8 +9886,12 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + +- /* skip cache aware load balance for single/too many threads */ +- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ /* ++ * skip cache aware load balance for single/too many threads ++ * or large footprint. ++ */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) || ++ exceed_llc_capacity(mm, dst_cpu)) + return mig_unrestricted; + + if (cpus_share_cache(dst_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.17/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch new file mode 100644 index 0000000..893d5f6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch @@ -0,0 +1,366 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 18E8128D850 + for ; Sat, 11 Oct 2025 18:18:37 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206719; cv=none; b=Xx1TJtOzMlihMYBSPUxuHxJ0Qjx1gDS60TVsBbaW2YAWG207+fLDuebhtY/m9byeKfuUMx/7RVc7mR4xE94pKemXSaF1s6z/Ug1MSbyJDL/f+gYUVN9JWyZVsl4nskC5I36GvI9Reswdcqif7FIqp4+OT03g4Ursen0Zl0KoJs4= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206719; c=relaxed/simple; + bh=6DqahwvJ4pGTYe3R6NSaO1UYdqKyQR0MiqAECtQawPE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=TqNPDsqjikNan+1NtjFEbAg77jx9c3inhDW4V8l0uRiJhbQOXCuc9b1G6bYocgAvzvRSIQ0C9pHEOzGrnitQnTKHR4lM01jV+sq5AGE2Z0YUwNbJ3G2iOFzcz198JhG1QAmKUE7Vocf7AQigiloGd31ZcAGpFcHlx+XOPevHRzQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=iOR0vW8+; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass 
(p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="iOR0vW8+" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206717; x=1791742717; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=6DqahwvJ4pGTYe3R6NSaO1UYdqKyQR0MiqAECtQawPE=; + b=iOR0vW8+BW1BG+CuQKpeekNgIJXVik0HqP3JsArGSk608O/BAqQp2/2V + NevdC5FBoGU0UJqaEBq3eyHXjM8fq6f/t4e0BsD23dpBBveuXe++OVX8Y + Aapb+EWCp+mFsFeSqc6EHn1EKVQFE1axOMUnDuAWrAcUGMdrmUl0Sqt8l + gPm1isDiRNA4VWnGAtuiefQtTbQsCK7LA3hCWV2kYbD78VwasjvY/a8Zs + eIWoDg9eon7/Ajv/YxTCU8u2KHeYWmlazBkEjZ2+x2uGykUr+ha3ebndP + Ilvnp7dapSvlsm6l5tNbjmODs4GBS1SErTGbDlGwNscJODVWeB1whKGtb + g==; +X-CSE-ConnectionGUID: iwkdIGQ9QpepiaCCmITr2A== +X-CSE-MsgGUID: vpqcAnIxSGm05xalZwxCuA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339958" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339958" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:36 -0700 +X-CSE-ConnectionGUID: l0yVaxC3RhO6SKkG+8NgJA== +X-CSE-MsgGUID: KHjGlLwMQh2OAr5o5sZaPw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487263" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:36 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 19/19] sched/fair: Add user control to adjust the tolerance of cache-aware scheduling +Date: Sat, 11 Oct 2025 11:24:56 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +With sched_cache enabled, the scheduler uses a process's RSS as a +proxy for its LLC footprint to determine if aggregating tasks on the +preferred LLC could cause cache contention. If RSS exceeds the LLC +size, aggregation is skipped. Some workloads with large RSS but small +actual memory footprints may still benefit from aggregation. Since +the kernel cannot efficiently track per-task cache usage (resctrl is +user-space only), userspace can provide a more accurate hint. + +Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let +users control how strictly RSS limits aggregation. Values range from +0 to 100: + + - 0: Cache-aware scheduling is disabled. + - 1: Strict; tasks with RSS larger than LLC size are skipped. + - 100: Aggressive; tasks are aggregated regardless of RSS. + +For example, with a 32MB L3 cache: + + - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped. + - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped + (784GB = (1 + (99 - 1) * 256) * 32MB). 
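+
+The implied RSS cutoff for a given tolerance value can be sketched as
+follows (scale factor of 256 per step, per the example above):
+
+	static u64 rss_cutoff_bytes(unsigned int tolerance, u64 llc_size)
+	{
+		if (!tolerance)
+			return 0;		/* cache-aware scheduling disabled */
+		if (tolerance == 100)
+			return U64_MAX;		/* no RSS limit */
+		return (1ULL + (tolerance - 1) * 256ULL) * llc_size;
+	}
+
+For a 32MB L3 this gives 32MB at tolerance 1 and (1 + 98 * 256) * 32MB,
+roughly 784GB, at tolerance 99, matching the figures above.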
+ +Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls +how strictly the number of active threads is considered when doing +cache aware load balance. The number of SMTs is also considered. +High SMT counts reduce the aggregation capacity, preventing excessive +task aggregation on SMT-heavy systems like Power10/Power11. + +For example, with 8 Cores/16 CPUs in a L3: + + - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped. + - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped + 785 = (1 + (99 - 1) * 8). + +Reported-by: K Prateek Nayak +Reported-by: Madadi Vineeth Reddy +Reported-by: Shrikanth Hegde +Reported-by: Tingyin Duan +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 56 ++++++++++++++++++++++++++++++-- + kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++++++++++---- + kernel/sched/sched.h | 3 ++ + 3 files changed, 126 insertions(+), 9 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 57bb04ebbf96..cfcd8b436cc5 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -169,6 +169,50 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_CACHE ++#define SCHED_CACHE_CREATE_CONTROL(name) \ ++static ssize_t sched_cache_write_##name(struct file *filp, \ ++ const char __user *ubuf, \ ++ size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int percent; \ ++ if (cnt > 15) \ ++ cnt = 15; \ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++ if (kstrtouint(buf, 10, &percent)) \ ++ return -EINVAL; \ ++ if (percent > 100) \ ++ return -EINVAL; \ ++ llc_##name = percent; \ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++static int sched_cache_show_##name(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", llc_##name); \ ++ return 0; \ ++} \ ++static int sched_cache_open_##name(struct inode *inode, \ ++ struct file *filp) \ ++{ \ ++ return single_open(filp, sched_cache_show_##name, NULL); \ ++} \ ++static const struct file_operations sched_cache_fops_##name = { \ ++ .open = sched_cache_open_##name, \ ++ .write = sched_cache_write_##name, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++ ++SCHED_CACHE_CREATE_CONTROL(overload_pct); ++SCHED_CACHE_CREATE_CONTROL(imb_pct); ++SCHED_CACHE_CREATE_CONTROL(aggr_tolerance); ++#endif /* SCHED_CACHE */ ++ + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -524,8 +568,16 @@ static __init int sched_init_debug(void) + #endif /* CONFIG_NUMA_BALANCING */ + + #ifdef CONFIG_SCHED_CACHE +- debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct); +- debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct); ++ debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_overload_pct); ++ debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_imb_pct); ++ debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_aggr_tolerance); ++ debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched, ++ &llc_epoch_period); ++ debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched, ++ &llc_epoch_affinity_timeout); + #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 46dfcd2a01b3..f9084e2f9ef2 
100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1207,9 +1207,62 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; ++__read_mostly unsigned int llc_aggr_tolerance = 1; ++__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; ++__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; + + DEFINE_STATIC_KEY_FALSE(sched_cache_allowed); + ++static inline int get_sched_cache_scale(int mul) ++{ ++ if (!llc_aggr_tolerance) ++ return 0; ++ ++ if (llc_aggr_tolerance == 100) ++ return INT_MAX; ++ ++ return (1 + (llc_aggr_tolerance - 1) * mul); ++} ++ ++static inline int get_sched_cache_rss_scale(void) ++{ ++ /* ++ * Suppose the L3 size is 32MB. If the ++ * llc_aggr_tolerance is 1: ++ * When the RSS is larger than 32MB, ++ * the process is regarded as exceeding ++ * the LLC capacity. If the ++ * llc_aggr_tolerance is 99: ++ * When the RSS is larger than 784GB, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 784GB = (1 + (99 - 1) * 256) * 32MB ++ */ ++ return get_sched_cache_scale(256); ++} ++ ++static inline int get_sched_cache_nr_scale(void) ++{ ++ /* ++ * Suppose the number of Cores in LLC is 8. ++ * Every core has 2 SMTs. ++ * If the llc_aggr_tolerance is 1: When the ++ * nr_running is larger than 8, the process ++ * is regarded as exceeding the LLC capacity. ++ * If the llc_aggr_tolerance is 99: ++ * When the nr_running is larger than 785, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 785 = 1 + (99 - 1) * 8 ++ */ ++ return get_sched_cache_scale(1); ++} ++ ++static inline int get_sched_cache_cap_scale(void) ++{ ++ return (llc_overload_pct / cpu_smt_num_threads); ++} ++ + static inline bool sched_cache_enabled(void) + { + return sched_feat(SCHED_CACHE) && +@@ -1245,6 +1298,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + struct cacheinfo *ci; + unsigned long rss; + unsigned int llc; ++ int scale; + + /* + * get_cpu_cacheinfo_level() can not be used +@@ -1269,19 +1323,27 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + rss = get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + +- return (llc <= (rss * PAGE_SIZE)); ++ scale = get_sched_cache_rss_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((llc * scale) <= (rss * PAGE_SIZE)); + } + + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { +- int smt_nr = 1; ++ int smt_nr = 1, scale; + + #ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) + smt_nr = cpumask_weight(cpu_smt_mask(cpu)); + #endif + +- return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++ scale = get_sched_cache_nr_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu))); + } + + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +@@ -1370,9 +1432,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { +- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ n = (delta + llc_epoch_period - 1) / llc_epoch_period; + rq->cpu_epoch += n; +- rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ rq->cpu_epoch_next += n * llc_epoch_period; + __shr_u64(&rq->cpu_runtime, n); + } + +@@ -1432,7 +1494,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * has only 1 thread, or has too many active 
threads, invalidate + * its preferred state. + */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout || + get_nr_threads(p) <= 1 || + exceed_llc_nr(mm, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { +@@ -9749,7 +9811,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + * (default: ~50%) + */ + #define fits_llc_capacity(util, max) \ +- ((util) * 100 < (max) * llc_overload_pct) ++ ((util) * 100 < (max) * get_sched_cache_cap_scale()) + + /* + * The margin used when comparing utilization. +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index b801d32d5fba..97e8558b0530 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2810,6 +2810,9 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern unsigned int llc_aggr_tolerance; ++extern unsigned int llc_epoch_period; ++extern unsigned int llc_epoch_affinity_timeout; + extern struct static_key_false sched_cache_allowed; + #endif + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/0001-amd-pstate.patch b/sys-kernel/gentoo-sources-6.18/0001-amd-pstate.patch new file mode 100644 index 0000000..c0a6d12 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0001-amd-pstate.patch @@ -0,0 +1,120 @@ +From 8716b3a723c94c85da3c28a01ce5c23e46341562 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:40:32 +0100 +Subject: [PATCH 01/11] amd-pstate + +Signed-off-by: Peter Jung +--- + drivers/cpufreq/amd-pstate.c | 33 ++++++++++++++------------------- + 1 file changed, 14 insertions(+), 19 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 602e4fa81d6c..c45bc98721d2 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -65,13 +65,13 @@ static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", +- NULL, + }; ++static_assert(ARRAY_SIZE(amd_pstate_mode_string) == AMD_PSTATE_MAX); + + const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode) + { +- if (mode < 0 || mode >= AMD_PSTATE_MAX) +- return NULL; ++ if (mode < AMD_PSTATE_UNDEFINED || mode >= AMD_PSTATE_MAX) ++ mode = AMD_PSTATE_UNDEFINED; + return amd_pstate_mode_string[mode]; + } + EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string); +@@ -110,6 +110,7 @@ enum energy_perf_value_index { + EPP_INDEX_BALANCE_PERFORMANCE, + EPP_INDEX_BALANCE_POWERSAVE, + EPP_INDEX_POWERSAVE, ++ EPP_INDEX_MAX, + }; + + static const char * const energy_perf_strings[] = { +@@ -118,8 +119,8 @@ static const char * const energy_perf_strings[] = { + [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", + [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", + [EPP_INDEX_POWERSAVE] = "power", +- NULL + }; ++static_assert(ARRAY_SIZE(energy_perf_strings) == EPP_INDEX_MAX); + + static unsigned int epp_values[] = { + [EPP_INDEX_DEFAULT] = 0, +@@ -127,7 +128,8 @@ static unsigned int epp_values[] = { + [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, + [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, + [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, +- }; ++}; ++static_assert(ARRAY_SIZE(epp_values) == EPP_INDEX_MAX); + + typedef int (*cppc_mode_transition_fn)(int); + +@@ -183,7 +185,7 @@ static inline int get_mode_idx_from_str(const char 
*str, size_t size) + { + int i; + +- for (i=0; i < AMD_PSTATE_MAX; i++) { ++ for (i = 0; i < AMD_PSTATE_MAX; i++) { + if (!strncmp(str, amd_pstate_mode_string[i], size)) + return i; + } +@@ -1137,16 +1139,15 @@ static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, + static ssize_t show_energy_performance_available_preferences( + struct cpufreq_policy *policy, char *buf) + { +- int i = 0; +- int offset = 0; ++ int offset = 0, i; + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + return sysfs_emit_at(buf, offset, "%s\n", + energy_perf_strings[EPP_INDEX_PERFORMANCE]); + +- while (energy_perf_strings[i] != NULL) +- offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); ++ for (i = 0; i < ARRAY_SIZE(energy_perf_strings); i++) ++ offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i]); + + offset += sysfs_emit_at(buf, offset, "\n"); + +@@ -1157,15 +1158,10 @@ static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) + { + struct amd_cpudata *cpudata = policy->driver_data; +- char str_preference[21]; + ssize_t ret; + u8 epp; + +- ret = sscanf(buf, "%20s", str_preference); +- if (ret != 1) +- return -EINVAL; +- +- ret = match_string(energy_perf_strings, -1, str_preference); ++ ret = sysfs_match_string(energy_perf_strings, buf); + if (ret < 0) + return -EINVAL; + +@@ -1353,9 +1349,8 @@ int amd_pstate_update_status(const char *buf, size_t size) + return -EINVAL; + + mode_idx = get_mode_idx_from_str(buf, size); +- +- if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) +- return -EINVAL; ++ if (mode_idx < 0) ++ return mode_idx; + + if (mode_state_machine[cppc_state][mode_idx]) { + guard(mutex)(&amd_pstate_driver_lock); +-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.6/0002-glitched-additional-timer-tick-frequencies.patch b/sys-kernel/gentoo-sources-6.18/0002-glitched-additional-timer-tick-frequencies.patch similarity index 100% rename from sys-kernel/gentoo-sources-6.6/0002-glitched-additional-timer-tick-frequencies.patch rename to sys-kernel/gentoo-sources-6.18/0002-glitched-additional-timer-tick-frequencies.patch diff --git a/sys-kernel/gentoo-sources-6.18/0004-bbr3.patch b/sys-kernel/gentoo-sources-6.18/0004-bbr3.patch new file mode 100644 index 0000000..0522c37 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0004-bbr3.patch @@ -0,0 +1,3394 @@ +From f475869a64305975245f8d0f4ab1942bacbabf5a Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:41:09 +0100 +Subject: [PATCH 04/11] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/net/tcp_ecn.h | 6 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2233 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 42 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 17 files changed, 1939 insertions(+), 554 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 20b8c6e21fef..e334b7a7aac2 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -236,7 +236,8 @@ struct tcp_sock { + tcp_usec_ts : 1, /* TSval values in usec */ + is_sack_reneg:1, /* in recovery from loss with SACK 
reneg? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ +- recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_txrx); + + /* RX read-mostly hotpath cache lines */ +@@ -292,7 +293,8 @@ struct tcp_sock { + * 0x5?10 << 16 + snd_wnd in net byte order + */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? */ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index b4b886647607..0dcce6489e56 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -132,8 +132,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index ab20f549b8f9..e3bcdc0be05e 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -403,6 +403,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) +@@ -838,6 +840,15 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -943,6 +954,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1053,9 +1069,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1168,6 +1189,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost 
segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1190,7 +1212,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED BIT(0) + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN BIT(1) +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(2) ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1210,10 +1236,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1224,7 +1253,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1248,8 +1279,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. 
(optional) +@@ -1315,6 +1349,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1334,6 +1376,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1346,6 +1389,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. 
+@@ -2531,7 +2589,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h +index f13e5cd2b1ac..bc5de05260eb 100644 +--- a/include/net/tcp_ecn.h ++++ b/include/net/tcp_ecn.h +@@ -583,10 +583,9 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -604,6 +603,9 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + } +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dce3113787a7..6efba4f74f6f 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -185,6 +185,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ + #define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ ++#define TCPI_OPT_ECN_LOW 256 /* Low-latency ECN enabled at conn init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 12850a277251..3b8b96692fb4 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
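As a usage note for the help text above: with CONFIG_TCP_CONG_BBR built, a single connection can opt in to BBR through the standard TCP_CONGESTION socket option, while system-wide selection goes through the net.ipv4.tcp_congestion_control sysctl. A minimal sketch; use_bbr() is an illustrative wrapper, not an API added by this patch:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Ask the kernel to run this socket with the "bbr" congestion control. */
static int use_bbr(int fd)
{
	static const char name[] = "bbr";

	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name)) < 0) {
		perror("setsockopt(TCP_CONGESTION)");
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || use_bbr(fd) < 0)
		return 1;
	return 0;
}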
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 8a18aeca7ab0..fe4c1b143de1 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3469,6 +3469,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4226,6 +4227,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..9279be755c16 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
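To make the lo/hi operating range described above concrete, here is a standalone sketch that mirrors bbr_max_bw() and bbr_bw() as they appear later in this patch; the struct and function names are illustrative only:

#include <stdio.h>

struct bw_model {
	unsigned int bw_hi[2];	/* robust long-term bound: max recent samples */
	unsigned int bw_lo;	/* conservative short-term lower bound */
};

/* Long-term estimate: max of the two windowed bw_hi samples. */
static unsigned int model_max_bw(const struct bw_model *m)
{
	return m->bw_hi[0] > m->bw_hi[1] ? m->bw_hi[0] : m->bw_hi[1];
}

/* Bandwidth actually used: the long-term max, capped by bw_lo. */
static unsigned int model_bw(const struct bw_model *m)
{
	unsigned int hi = model_max_bw(m);

	return hi < m->bw_lo ? hi : m->bw_lo;
}

int main(void)
{
	/* After recent loss/ECN, bw_lo may sit below max(bw_hi[]): prints 900. */
	struct bw_model m = { .bw_hi = { 1000, 1200 }, .bw_lo = 900 };

	printf("%u\n", model_bw(&m));
	return 0;
}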
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,123 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return tcp_ecn_mode_any(tp) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +384,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +411,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +435,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +458,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +475,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +536,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +549,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +581,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +601,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +672,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +683,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +712,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +741,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +797,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +805,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +851,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +860,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +888,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +925,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +948,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +973,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2362,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2399,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index e4a979b75cc6..a0d7b9586e36 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -358,7 +358,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && +@@ -376,7 +376,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; +@@ -1289,7 +1289,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1654,6 +1659,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3889,7 +3905,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985 + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3906,6 +3923,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3916,6 +3934,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -4042,6 +4065,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4114,7 +4138,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4139,6 +4163,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4164,7 +4189,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5891,13 +5916,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 2ec8c6f1cdcc..3e39a40867b5 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -500,6 +500,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index b94efb3050d2..3f11efcf0e98 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -347,7 +347,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1759,7 +1760,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1834,6 +1835,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2190,13 +2215,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2937,6 +2961,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -3149,6 +3174,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 2dd73a4e8e51..3d35afdbf803 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -702,6 +702,7 @@ void tcp_write_timer_handler(struct sock *sk) + icsk_timeout(icsk)); + return; + } ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.18/0005-block.patch b/sys-kernel/gentoo-sources-6.18/0005-block.patch new file mode 100644 index 0000000..f11165a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0005-block.patch @@ -0,0 +1,214 @@ +From fa484998399ea55d03e50fb401ee0992f4666793 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:56:28 +0100 +Subject: [PATCH 05/11] block + +Signed-off-by: Peter Jung +--- + block/bfq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++------ + block/bfq-iosched.h | 12 +++++++++-- + block/mq-deadline.c | 19 +++++++++++++---- + 3 files changed, 70 insertions(+), 13 deletions(-) + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index 4a8d3d96bfe4..4c0c9e125211 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -460,6 +460,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) + return icq_to_bic(ioc_lookup_icq(q)); + } + ++static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) ++{ ++ if (!current->io_context) ++ return NULL; ++ if (spin_trylock_irq(&q->queue_lock)) { ++ struct bfq_io_cq *icq; ++ ++ icq = icq_to_bic(ioc_lookup_icq(q)); ++ spin_unlock_irq(&q->queue_lock); ++ return icq; ++ } ++ ++ return NULL; ++} ++ + /* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. 
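The bfq and mq-deadline hunks that follow both convert the bio-merge path from an unconditional spin_lock to an opportunistic trylock, for the reason spelled out in the added comments: merging on this path is optional, so under contention it is cheaper to skip the attempt than to wait for the lock. As a rough, self-contained userspace sketch of that pattern (the names below are invented for illustration and are not part of the patch):

#include <pthread.h>
#include <stdbool.h>

struct merge_queue {
	pthread_mutex_t lock;	/* stands in for the scheduler's queue lock */
	/* ... merge bookkeeping would live here ... */
};

/* Optional work: attempt it only if the lock is free, otherwise give up. */
static bool try_optional_merge(struct merge_queue *q)
{
	if (pthread_mutex_trylock(&q->lock) != 0)
		return false;	/* contended: skip, the caller just queues the bio */
	/* ... look for a merge candidate while holding the lock ... */
	pthread_mutex_unlock(&q->lock);
	return true;
}

The same contention-avoidance idea appears again further down, where bfq_dispatch_request() returns early if the BFQ_DISPATCHING bit is already set, so that only one context dispatches at a time.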
+@@ -2448,11 +2463,22 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) + { + struct bfq_data *bfqd = q->elevator->elevator_data; +- struct bfq_io_cq *bic = bfq_bic_lookup(q); ++ struct bfq_io_cq *bic = bfq_bic_try_lookup(q); + struct request *free = NULL; + bool ret; + +- spin_lock_irq(&bfqd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the bfqd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path are a lot slimmer, so skipping an ++ * occasional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock_irq(&bfqd->lock)) ++ return false; + + if (bic) { + /* +@@ -5301,6 +5327,18 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled = false; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. ++ */ ++ if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || ++ test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) ++ return NULL; ++ + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; +@@ -5312,6 +5350,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } + ++ clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); + spin_unlock_irq(&bfqd->lock); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, +@@ -6233,10 +6272,9 @@ static inline void bfq_update_insert_stats(struct request_queue *q, + + static struct bfq_queue *bfq_init_rq(struct request *rq); + +-static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void bfq_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags) + { +- struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + bool idle_timer_disabled = false; +@@ -6298,7 +6336,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- bfq_insert_request(hctx, rq, flags); ++ bfq_insert_request(hctx->queue, rq, flags); + } + } + +@@ -7218,6 +7256,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) + q->elevator = eq; + spin_unlock_irq(&q->queue_lock); + ++ spin_lock_init(&bfqd->lock); ++ + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+ * Grab a permanent reference to it, so that the normal code flow +@@ -7335,8 +7375,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + +- spin_lock_init(&bfqd->lock); +- + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls +diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h +index 34a498e6b2a5..bef03e57b0a5 100644 +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -504,12 +504,22 @@ struct bfq_io_cq { + unsigned int requests; /* Number of requests this process has in flight */ + }; + ++enum { ++ BFQ_DISPATCHING = 0, ++}; ++ + /** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by @lock. + */ + struct bfq_data { ++ struct { ++ spinlock_t lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; ++ + /* device request queue */ + struct request_queue *queue; + /* dispatch queue */ +@@ -795,8 +805,6 @@ struct bfq_data { + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + +- spinlock_t lock; +- + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to +diff --git a/block/mq-deadline.c b/block/mq-deadline.c +index 3e3719093aec..525ce44bd14b 100644 +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -623,7 +623,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + struct request *free = NULL; + bool ret; + +- spin_lock(&dd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path are a lot slimmer, so skipping an ++ * occasional lookup that will most likely not succeed anyway should
++ */ ++ if (!spin_trylock(&dd->lock)) ++ return false; ++ + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + +@@ -636,10 +648,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + /* + * add rq to rbtree and fifo + */ +-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void dd_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free) + { +- struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); +@@ -697,7 +708,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- dd_insert_request(hctx, rq, flags, &free); ++ dd_insert_request(q, rq, flags, &free); + } + spin_unlock(&dd->lock); + +-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.18/0007-crypto.patch b/sys-kernel/gentoo-sources-6.18/0007-crypto.patch new file mode 100644 index 0000000..8cb7cd9 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0007-crypto.patch @@ -0,0 +1,3441 @@ +From 19c062d3d4cd46ac9095f8ef8133c0e3c01a9d4f Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:42:00 +0100 +Subject: [PATCH 07/11] crypto + +Signed-off-by: Peter Jung +--- + arch/x86/crypto/Makefile | 5 +- + arch/x86/crypto/aes-gcm-aesni-x86_64.S | 12 +- + arch/x86/crypto/aes-gcm-vaes-avx2.S | 1150 +++++++++++++++++ + ...m-avx10-x86_64.S => aes-gcm-vaes-avx512.S} | 722 +++++------ + arch/x86/crypto/aesni-intel_glue.c | 264 ++-- + drivers/md/Kconfig | 1 + + drivers/md/dm-verity-fec.c | 21 +- + drivers/md/dm-verity-fec.h | 5 +- + drivers/md/dm-verity-target.c | 203 ++- + drivers/md/dm-verity.h | 52 +- + include/linux/rhashtable.h | 70 +- + 11 files changed, 1921 insertions(+), 584 deletions(-) + create mode 100644 arch/x86/crypto/aes-gcm-vaes-avx2.S + rename arch/x86/crypto/{aes-gcm-avx10-x86_64.S => aes-gcm-vaes-avx512.S} (69%) + +diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile +index 2d30d5d36145..6409e3009524 100644 +--- a/arch/x86/crypto/Makefile ++++ b/arch/x86/crypto/Makefile +@@ -46,8 +46,9 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o + aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o + aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ + aes-gcm-aesni-x86_64.o \ +- aes-xts-avx-x86_64.o \ +- aes-gcm-avx10-x86_64.o ++ aes-gcm-vaes-avx2.o \ ++ aes-gcm-vaes-avx512.o \ ++ aes-xts-avx-x86_64.o + + obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o + ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o +diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S +index 45940e2883a0..7c8a8a32bd3c 100644 +--- a/arch/x86/crypto/aes-gcm-aesni-x86_64.S ++++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S +@@ -61,15 +61,15 @@ + // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems + // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) + // +-// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is ++// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is + // more thoroughly commented. This file has the following notable changes: + // + // - The vector length is fixed at 128-bit, i.e. xmm registers. This means + // there is only one AES block (and GHASH block) per register. 
+ // +-// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of +-// 32. We work around this by being much more careful about using +-// registers, relying heavily on loads to load values as they are needed. ++// - Without AVX512, only 16 SIMD registers are available instead of 32. We ++// work around this by being much more careful about using registers, ++// relying heavily on loads to load values as they are needed. + // + // - Masking is not available either. We work around this by implementing + // partial block loads and stores using overlapping scalar loads and stores +@@ -90,8 +90,8 @@ + // multiplication instead of schoolbook multiplication. This saves one + // pclmulqdq instruction per block, at the cost of one 64-bit load, one + // pshufd, and 0.25 pxors per block. (This is without the three-argument +-// XOR support that would be provided by AVX512 / AVX10, which would be +-// more beneficial to schoolbook than Karatsuba.) ++// XOR support that would be provided by AVX512, which would be more ++// beneficial to schoolbook than Karatsuba.) + // + // As a rough approximation, we can assume that Karatsuba multiplication is + // faster than schoolbook multiplication in this context if one pshufd and +diff --git a/arch/x86/crypto/aes-gcm-vaes-avx2.S b/arch/x86/crypto/aes-gcm-vaes-avx2.S +new file mode 100644 +index 000000000000..5ccbd85383cd +--- /dev/null ++++ b/arch/x86/crypto/aes-gcm-vaes-avx2.S +@@ -0,0 +1,1150 @@ ++/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ ++// ++// AES-GCM implementation for x86_64 CPUs that support the following CPU ++// features: VAES && VPCLMULQDQ && AVX2 ++// ++// Copyright 2025 Google LLC ++// ++// Author: Eric Biggers ++// ++//------------------------------------------------------------------------------ ++// ++// This file is dual-licensed, meaning that you can use it under your choice of ++// either of the following two licenses: ++// ++// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy ++// of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++// ++// or ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions are met: ++// ++// 1. Redistributions of source code must retain the above copyright notice, ++// this list of conditions and the following disclaimer. ++// ++// 2. Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE ++// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++// POSSIBILITY OF SUCH DAMAGE. ++// ++// ----------------------------------------------------------------------------- ++// ++// This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512. ++// This means it can only use 16 vector registers instead of 32, the maximum ++// vector length is 32 bytes, and some instructions such as vpternlogd and ++// masked loads/stores are unavailable. However, it is able to run on CPUs that ++// have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs), ++// various Intel client CPUs such as Alder Lake, and Intel Sierra Forest. ++// ++// This implementation also uses Karatsuba multiplication instead of schoolbook ++// multiplication for GHASH in its main loop. This does not help much on Intel, ++// but it improves performance by ~5% on AMD Zen 3. Other factors weighing ++// slightly in favor of Karatsuba multiplication in this implementation are the ++// lower maximum vector length (which means there are fewer key powers, so we ++// can cache the halves of each key power XOR'd together and still use less ++// memory than the AVX512 implementation), and the unavailability of the ++// vpternlogd instruction (which helped schoolbook a bit more than Karatsuba). ++ ++#include ++ ++.section .rodata ++.p2align 4 ++ ++ // The below three 16-byte values must be in the order that they are, as ++ // they are really two 32-byte tables and a 16-byte value that overlap: ++ // ++ // - The first 32-byte table begins at .Lselect_high_bytes_table. ++ // For 0 <= len <= 16, the 16-byte value at ++ // '.Lselect_high_bytes_table + len' selects the high 'len' bytes of ++ // another 16-byte value when AND'ed with it. ++ // ++ // - The second 32-byte table begins at .Lrshift_and_bswap_table. ++ // For 0 <= len <= 16, the 16-byte value at ++ // '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the ++ // following operation: right-shift by '16 - len' bytes (shifting in ++ // zeroes), then reflect all 16 bytes. ++ // ++ // - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects ++ // all 16 bytes. ++.Lselect_high_bytes_table: ++ .octa 0 ++.Lrshift_and_bswap_table: ++ .octa 0xffffffffffffffffffffffffffffffff ++.Lbswap_mask: ++ .octa 0x000102030405060708090a0b0c0d0e0f ++ ++ // Sixteen 0x0f bytes. By XOR'ing an entry of .Lrshift_and_bswap_table ++ // with this, we get a mask that left-shifts by '16 - len' bytes. ++.Lfifteens: ++ .octa 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f ++ ++ // This is the GHASH reducing polynomial without its constant term, i.e. ++ // x^128 + x^7 + x^2 + x, represented using the backwards mapping ++ // between bits and polynomial coefficients. ++ // ++ // Alternatively, it can be interpreted as the naturally-ordered ++ // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the ++ // "reversed" GHASH reducing polynomial without its x^128 term. ++.Lgfpoly: ++ .octa 0xc2000000000000000000000000000001 ++ ++ // Same as above, but with the (1 << 64) bit set. 
++.Lgfpoly_and_internal_carrybit: ++ .octa 0xc2000000000000010000000000000001 ++ ++ // Values needed to prepare the initial vector of counter blocks. ++.Lctr_pattern: ++ .octa 0 ++ .octa 1 ++ ++ // The number of AES blocks per vector, as a 128-bit value. ++.Linc_2blocks: ++ .octa 2 ++ ++// Offsets in struct aes_gcm_key_vaes_avx2 ++#define OFFSETOF_AESKEYLEN 480 ++#define OFFSETOF_H_POWERS 512 ++#define NUM_H_POWERS 8 ++#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) ++#define OFFSETOF_H_POWERS_XORED OFFSETOFEND_H_POWERS ++ ++.text ++ ++// Do one step of GHASH-multiplying the 128-bit lanes of \a by the 128-bit lanes ++// of \b and storing the reduced products in \dst. Uses schoolbook ++// multiplication. ++.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 ++.if \i == 0 ++ vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L ++ vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H ++.elseif \i == 1 ++ vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L ++.elseif \i == 2 ++ vpxor \t2, \t1, \t1 // MI = MI_0 + MI_1 ++.elseif \i == 3 ++ vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) ++.elseif \i == 4 ++ vpshufd $0x4e, \t0, \t0 // Swap halves of LO ++.elseif \i == 5 ++ vpxor \t0, \t1, \t1 // Fold LO into MI (part 1) ++ vpxor \t2, \t1, \t1 // Fold LO into MI (part 2) ++.elseif \i == 6 ++ vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H ++.elseif \i == 7 ++ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) ++.elseif \i == 8 ++ vpshufd $0x4e, \t1, \t1 // Swap halves of MI ++.elseif \i == 9 ++ vpxor \t1, \dst, \dst // Fold MI into HI (part 1) ++ vpxor \t0, \dst, \dst // Fold MI into HI (part 2) ++.endif ++.endm ++ ++// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store ++// the reduced products in \dst. See _ghash_mul_step for full explanation. ++.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 ++.endr ++.endm ++ ++// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the ++// *unreduced* products to \lo, \mi, and \hi. ++.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0 ++ vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L ++ vpxor \t0, \lo, \lo ++ vpclmulqdq $0x01, \a, \b, \t0 // a_L * b_H ++ vpxor \t0, \mi, \mi ++ vpclmulqdq $0x10, \a, \b, \t0 // a_H * b_L ++ vpxor \t0, \mi, \mi ++ vpclmulqdq $0x11, \a, \b, \t0 // a_H * b_H ++ vpxor \t0, \hi, \hi ++.endm ++ ++// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit ++// reduced products in \hi. See _ghash_mul_step for explanation of reduction. ++.macro _ghash_reduce lo, mi, hi, gfpoly, t0 ++ vpclmulqdq $0x01, \lo, \gfpoly, \t0 ++ vpshufd $0x4e, \lo, \lo ++ vpxor \lo, \mi, \mi ++ vpxor \t0, \mi, \mi ++ vpclmulqdq $0x01, \mi, \gfpoly, \t0 ++ vpshufd $0x4e, \mi, \mi ++ vpxor \mi, \hi, \hi ++ vpxor \t0, \hi, \hi ++.endm ++ ++// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it ++// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. 
++.macro _ghash_square a, dst, gfpoly, t0, t1 ++ vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L ++ vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H ++ vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) ++ vpshufd $0x4e, \t0, \t0 // Swap halves of LO ++ vpxor \t0, \t1, \t1 // Fold LO into MI ++ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) ++ vpshufd $0x4e, \t1, \t1 // Swap halves of MI ++ vpxor \t1, \dst, \dst // Fold MI into HI (part 1) ++ vpxor \t0, \dst, \dst // Fold MI into HI (part 2) ++.endm ++ ++// void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); ++// ++// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and ++// initialize |key->h_powers| and |key->h_powers_xored|. ++// ++// We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to ++// store the 64-bit halves of the key powers XOR'd together (for Karatsuba ++// multiplication) in the order 8,6,7,5,4,2,3,1. ++SYM_FUNC_START(aes_gcm_precompute_vaes_avx2) ++ ++ // Function arguments ++ .set KEY, %rdi ++ ++ // Additional local variables ++ .set POWERS_PTR, %rsi ++ .set RNDKEYLAST_PTR, %rdx ++ .set TMP0, %ymm0 ++ .set TMP0_XMM, %xmm0 ++ .set TMP1, %ymm1 ++ .set TMP1_XMM, %xmm1 ++ .set TMP2, %ymm2 ++ .set TMP2_XMM, %xmm2 ++ .set H_CUR, %ymm3 ++ .set H_CUR_XMM, %xmm3 ++ .set H_CUR2, %ymm4 ++ .set H_CUR2_XMM, %xmm4 ++ .set H_INC, %ymm5 ++ .set H_INC_XMM, %xmm5 ++ .set GFPOLY, %ymm6 ++ .set GFPOLY_XMM, %xmm6 ++ ++ // Encrypt an all-zeroes block to get the raw hash subkey. ++ movl OFFSETOF_AESKEYLEN(KEY), %eax ++ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR ++ vmovdqu (KEY), H_CUR_XMM // Zero-th round key XOR all-zeroes block ++ lea 16(KEY), %rax ++1: ++ vaesenc (%rax), H_CUR_XMM, H_CUR_XMM ++ add $16, %rax ++ cmp %rax, RNDKEYLAST_PTR ++ jne 1b ++ vaesenclast (RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM ++ ++ // Reflect the bytes of the raw hash subkey. ++ vpshufb .Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM ++ ++ // Finish preprocessing the byte-reflected hash subkey by multiplying it ++ // by x^-1 ("standard" interpretation of polynomial coefficients) or ++ // equivalently x^1 (natural interpretation). This gets the key into a ++ // format that avoids having to bit-reflect the data blocks later. ++ vpshufd $0xd3, H_CUR_XMM, TMP0_XMM ++ vpsrad $31, TMP0_XMM, TMP0_XMM ++ vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM ++ vpand .Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM ++ vpxor TMP0_XMM, H_CUR_XMM, H_CUR_XMM ++ ++ // Load the gfpoly constant. ++ vbroadcasti128 .Lgfpoly(%rip), GFPOLY ++ ++ // Square H^1 to get H^2. ++ _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM ++ ++ // Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. ++ vinserti128 $1, H_CUR_XMM, H_INC, H_CUR ++ vinserti128 $1, H_INC_XMM, H_INC, H_INC ++ ++ // Compute H_CUR2 = [H^4, H^3]. ++ _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 ++ ++ // Store [H^2, H^1] and [H^4, H^3]. ++ vmovdqu H_CUR, OFFSETOF_H_POWERS+3*32(KEY) ++ vmovdqu H_CUR2, OFFSETOF_H_POWERS+2*32(KEY) ++ ++ // For Karatsuba multiplication: compute and store the two 64-bit halves ++ // of each key power XOR'd together. Order is 4,2,3,1. ++ vpunpcklqdq H_CUR, H_CUR2, TMP0 ++ vpunpckhqdq H_CUR, H_CUR2, TMP1 ++ vpxor TMP1, TMP0, TMP0 ++ vmovdqu TMP0, OFFSETOF_H_POWERS_XORED+32(KEY) ++ ++ // Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. 
++ _ghash_mul H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2 ++ _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 ++ vmovdqu H_CUR, OFFSETOF_H_POWERS+1*32(KEY) ++ vmovdqu H_CUR2, OFFSETOF_H_POWERS+0*32(KEY) ++ ++ // Again, compute and store the two 64-bit halves of each key power ++ // XOR'd together. Order is 8,6,7,5. ++ vpunpcklqdq H_CUR, H_CUR2, TMP0 ++ vpunpckhqdq H_CUR, H_CUR2, TMP1 ++ vpxor TMP1, TMP0, TMP0 ++ vmovdqu TMP0, OFFSETOF_H_POWERS_XORED(KEY) ++ ++ vzeroupper ++ RET ++SYM_FUNC_END(aes_gcm_precompute_vaes_avx2) ++ ++// Do one step of the GHASH update of four vectors of data blocks. ++// \i: the step to do, 0 through 9 ++// \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) ++// KEY: pointer to struct aes_gcm_key_vaes_avx2 ++// BSWAP_MASK: mask for reflecting the bytes of blocks ++// H_POW[2-1]_XORED: cached values from KEY->h_powers_xored ++// TMP[0-2]: temporary registers. TMP[1-2] must be preserved across steps. ++// LO, MI: working state for this macro that must be preserved across steps ++// GHASH_ACC: the GHASH accumulator (input/output) ++.macro _ghash_step_4x i, ghashdata_ptr ++ .set HI, GHASH_ACC # alias ++ .set HI_XMM, GHASH_ACC_XMM ++.if \i == 0 ++ // First vector ++ vmovdqu 0*32(\ghashdata_ptr), TMP1 ++ vpshufb BSWAP_MASK, TMP1, TMP1 ++ vmovdqu OFFSETOF_H_POWERS+0*32(KEY), TMP2 ++ vpxor GHASH_ACC, TMP1, TMP1 ++ vpclmulqdq $0x00, TMP2, TMP1, LO ++ vpclmulqdq $0x11, TMP2, TMP1, HI ++ vpunpckhqdq TMP1, TMP1, TMP0 ++ vpxor TMP1, TMP0, TMP0 ++ vpclmulqdq $0x00, H_POW2_XORED, TMP0, MI ++.elseif \i == 1 ++.elseif \i == 2 ++ // Second vector ++ vmovdqu 1*32(\ghashdata_ptr), TMP1 ++ vpshufb BSWAP_MASK, TMP1, TMP1 ++ vmovdqu OFFSETOF_H_POWERS+1*32(KEY), TMP2 ++ vpclmulqdq $0x00, TMP2, TMP1, TMP0 ++ vpxor TMP0, LO, LO ++ vpclmulqdq $0x11, TMP2, TMP1, TMP0 ++ vpxor TMP0, HI, HI ++ vpunpckhqdq TMP1, TMP1, TMP0 ++ vpxor TMP1, TMP0, TMP0 ++ vpclmulqdq $0x10, H_POW2_XORED, TMP0, TMP0 ++ vpxor TMP0, MI, MI ++.elseif \i == 3 ++ // Third vector ++ vmovdqu 2*32(\ghashdata_ptr), TMP1 ++ vpshufb BSWAP_MASK, TMP1, TMP1 ++ vmovdqu OFFSETOF_H_POWERS+2*32(KEY), TMP2 ++.elseif \i == 4 ++ vpclmulqdq $0x00, TMP2, TMP1, TMP0 ++ vpxor TMP0, LO, LO ++ vpclmulqdq $0x11, TMP2, TMP1, TMP0 ++ vpxor TMP0, HI, HI ++.elseif \i == 5 ++ vpunpckhqdq TMP1, TMP1, TMP0 ++ vpxor TMP1, TMP0, TMP0 ++ vpclmulqdq $0x00, H_POW1_XORED, TMP0, TMP0 ++ vpxor TMP0, MI, MI ++ ++ // Fourth vector ++ vmovdqu 3*32(\ghashdata_ptr), TMP1 ++ vpshufb BSWAP_MASK, TMP1, TMP1 ++.elseif \i == 6 ++ vmovdqu OFFSETOF_H_POWERS+3*32(KEY), TMP2 ++ vpclmulqdq $0x00, TMP2, TMP1, TMP0 ++ vpxor TMP0, LO, LO ++ vpclmulqdq $0x11, TMP2, TMP1, TMP0 ++ vpxor TMP0, HI, HI ++ vpunpckhqdq TMP1, TMP1, TMP0 ++ vpxor TMP1, TMP0, TMP0 ++ vpclmulqdq $0x10, H_POW1_XORED, TMP0, TMP0 ++ vpxor TMP0, MI, MI ++.elseif \i == 7 ++ // Finalize 'mi' following Karatsuba multiplication. ++ vpxor LO, MI, MI ++ vpxor HI, MI, MI ++ ++ // Fold lo into mi. ++ vbroadcasti128 .Lgfpoly(%rip), TMP2 ++ vpclmulqdq $0x01, LO, TMP2, TMP0 ++ vpshufd $0x4e, LO, LO ++ vpxor LO, MI, MI ++ vpxor TMP0, MI, MI ++.elseif \i == 8 ++ // Fold mi into hi. ++ vpclmulqdq $0x01, MI, TMP2, TMP0 ++ vpshufd $0x4e, MI, MI ++ vpxor MI, HI, HI ++ vpxor TMP0, HI, HI ++.elseif \i == 9 ++ vextracti128 $1, HI, TMP0_XMM ++ vpxor TMP0_XMM, HI_XMM, GHASH_ACC_XMM ++.endif ++.endm ++ ++// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full ++// explanation. 
++.macro _ghash_4x ghashdata_ptr ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_step_4x \i, \ghashdata_ptr ++.endr ++.endm ++ ++// Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst ++// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. ++.macro _load_partial_block src, dst, tmp64, tmp32 ++ sub $8, %ecx // LEN - 8 ++ jle .Lle8\@ ++ ++ // Load 9 <= LEN <= 16 bytes. ++ vmovq (\src), \dst // Load first 8 bytes ++ mov (\src, %rcx), %rax // Load last 8 bytes ++ neg %ecx ++ shl $3, %ecx ++ shr %cl, %rax // Discard overlapping bytes ++ vpinsrq $1, %rax, \dst, \dst ++ jmp .Ldone\@ ++ ++.Lle8\@: ++ add $4, %ecx // LEN - 4 ++ jl .Llt4\@ ++ ++ // Load 4 <= LEN <= 8 bytes. ++ mov (\src), %eax // Load first 4 bytes ++ mov (\src, %rcx), \tmp32 // Load last 4 bytes ++ jmp .Lcombine\@ ++ ++.Llt4\@: ++ // Load 1 <= LEN <= 3 bytes. ++ add $2, %ecx // LEN - 2 ++ movzbl (\src), %eax // Load first byte ++ jl .Lmovq\@ ++ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes ++.Lcombine\@: ++ shl $3, %ecx ++ shl %cl, \tmp64 ++ or \tmp64, %rax // Combine the two parts ++.Lmovq\@: ++ vmovq %rax, \dst ++.Ldone\@: ++.endm ++ ++// Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst. ++// Clobbers %rax, %rcx, and \tmp{64,32}. ++.macro _store_partial_block src, dst, tmp64, tmp32 ++ sub $8, %ecx // LEN - 8 ++ jl .Llt8\@ ++ ++ // Store 8 <= LEN <= 16 bytes. ++ vpextrq $1, \src, %rax ++ mov %ecx, \tmp32 ++ shl $3, %ecx ++ ror %cl, %rax ++ mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes ++ vmovq \src, (\dst) // Store first 8 bytes ++ jmp .Ldone\@ ++ ++.Llt8\@: ++ add $4, %ecx // LEN - 4 ++ jl .Llt4\@ ++ ++ // Store 4 <= LEN <= 7 bytes. ++ vpextrd $1, \src, %eax ++ mov %ecx, \tmp32 ++ shl $3, %ecx ++ ror %cl, %eax ++ mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes ++ vmovd \src, (\dst) // Store first 4 bytes ++ jmp .Ldone\@ ++ ++.Llt4\@: ++ // Store 1 <= LEN <= 3 bytes. ++ vpextrb $0, \src, 0(\dst) ++ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? ++ jl .Ldone\@ ++ vpextrb $1, \src, 1(\dst) ++ je .Ldone\@ ++ vpextrb $2, \src, 2(\dst) ++.Ldone\@: ++.endm ++ ++// void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++// u8 ghash_acc[16], ++// const u8 *aad, int aadlen); ++// ++// This function processes the AAD (Additional Authenticated Data) in GCM. ++// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the ++// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all ++// zeroes. |aadlen| must be a multiple of 16, except on the last call where it ++// can be any length. The caller must do any buffering needed to ensure this. ++// ++// This handles large amounts of AAD efficiently, while also keeping overhead ++// low for small amounts which is the common case. TLS and IPsec use less than ++// one block of AAD, but (uncommonly) other use cases may use much more. ++SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2) ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set GHASH_ACC_PTR, %rsi ++ .set AAD, %rdx ++ .set AADLEN, %ecx // Must be %ecx for _load_partial_block ++ .set AADLEN64, %rcx // Zero-extend AADLEN before using! ++ ++ // Additional local variables. ++ // %rax and %r8 are used as temporary registers. 
++ .set TMP0, %ymm0 ++ .set TMP0_XMM, %xmm0 ++ .set TMP1, %ymm1 ++ .set TMP1_XMM, %xmm1 ++ .set TMP2, %ymm2 ++ .set TMP2_XMM, %xmm2 ++ .set LO, %ymm3 ++ .set LO_XMM, %xmm3 ++ .set MI, %ymm4 ++ .set MI_XMM, %xmm4 ++ .set GHASH_ACC, %ymm5 ++ .set GHASH_ACC_XMM, %xmm5 ++ .set BSWAP_MASK, %ymm6 ++ .set BSWAP_MASK_XMM, %xmm6 ++ .set GFPOLY, %ymm7 ++ .set GFPOLY_XMM, %xmm7 ++ .set H_POW2_XORED, %ymm8 ++ .set H_POW1_XORED, %ymm9 ++ ++ // Load the bswap_mask and gfpoly constants. Since AADLEN is usually ++ // small, usually only 128-bit vectors will be used. So as an ++ // optimization, don't broadcast these constants to both 128-bit lanes ++ // quite yet. ++ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM ++ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM ++ ++ // Load the GHASH accumulator. ++ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM ++ ++ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. ++ test AADLEN, AADLEN ++ jz .Laad_done ++ cmp $16, AADLEN ++ jle .Laad_lastblock ++ ++ // AADLEN > 16, so we'll operate on full vectors. Broadcast bswap_mask ++ // and gfpoly to both 128-bit lanes. ++ vinserti128 $1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK ++ vinserti128 $1, GFPOLY_XMM, GFPOLY, GFPOLY ++ ++ // If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time. ++ add $-128, AADLEN // 128 is 4 bytes, -128 is 1 byte ++ jl .Laad_loop_4x_done ++ vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED ++ vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED ++.Laad_loop_4x: ++ _ghash_4x AAD ++ sub $-128, AAD ++ add $-128, AADLEN ++ jge .Laad_loop_4x ++.Laad_loop_4x_done: ++ ++ // If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time. ++ add $96, AADLEN ++ jl .Laad_loop_1x_done ++.Laad_loop_1x: ++ vmovdqu (AAD), TMP0 ++ vpshufb BSWAP_MASK, TMP0, TMP0 ++ vpxor TMP0, GHASH_ACC, GHASH_ACC ++ vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 ++ _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO ++ vextracti128 $1, GHASH_ACC, TMP0_XMM ++ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ add $32, AAD ++ sub $32, AADLEN ++ jge .Laad_loop_1x ++.Laad_loop_1x_done: ++ add $32, AADLEN ++ // Now 0 <= AADLEN < 32. ++ ++ jz .Laad_done ++ cmp $16, AADLEN ++ jle .Laad_lastblock ++ ++.Laad_last2blocks: ++ // Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD. ++ mov AADLEN, AADLEN // Zero-extend AADLEN to AADLEN64. ++ vmovdqu (AAD), TMP0_XMM ++ vmovdqu -16(AAD, AADLEN64), TMP1_XMM ++ vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM ++ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ lea .Lrshift_and_bswap_table(%rip), %rax ++ vpshufb -16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM ++ vinserti128 $1, TMP1_XMM, GHASH_ACC, GHASH_ACC ++ vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 ++ _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO ++ vextracti128 $1, GHASH_ACC, TMP0_XMM ++ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ jmp .Laad_done ++ ++.Laad_lastblock: ++ // Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD. ++ _load_partial_block AAD, TMP0_XMM, %r8, %r8d ++ vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM ++ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM ++ _ghash_mul TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ ++ TMP1_XMM, TMP2_XMM, LO_XMM ++ ++.Laad_done: ++ // Store the updated GHASH accumulator back to memory. 
++ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) ++ ++ vzeroupper ++ RET ++SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2) ++ ++// Do one non-last round of AES encryption on the blocks in the given AESDATA ++// vectors using the round key that has been broadcast to all 128-bit lanes of ++// \round_key. ++.macro _vaesenc round_key, vecs:vararg ++.irp i, \vecs ++ vaesenc \round_key, AESDATA\i, AESDATA\i ++.endr ++.endm ++ ++// Generate counter blocks in the given AESDATA vectors, then do the zero-th AES ++// round on them. Clobbers TMP0. ++.macro _ctr_begin vecs:vararg ++ vbroadcasti128 .Linc_2blocks(%rip), TMP0 ++.irp i, \vecs ++ vpshufb BSWAP_MASK, LE_CTR, AESDATA\i ++ vpaddd TMP0, LE_CTR, LE_CTR ++.endr ++.irp i, \vecs ++ vpxor RNDKEY0, AESDATA\i, AESDATA\i ++.endr ++.endm ++ ++// Generate and encrypt counter blocks in the given AESDATA vectors, excluding ++// the last AES round. Clobbers TMP0. ++.macro _aesenc_loop vecs:vararg ++ _ctr_begin \vecs ++ lea 16(KEY), %rax ++.Laesenc_loop\@: ++ vbroadcasti128 (%rax), TMP0 ++ _vaesenc TMP0, \vecs ++ add $16, %rax ++ cmp %rax, RNDKEYLAST_PTR ++ jne .Laesenc_loop\@ ++.endm ++ ++// Finalize the keystream blocks in the given AESDATA vectors by doing the last ++// AES round, then XOR those keystream blocks with the corresponding data. ++// Reduce latency by doing the XOR before the vaesenclast, utilizing the ++// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). Clobbers TMP0. ++.macro _aesenclast_and_xor vecs:vararg ++.irp i, \vecs ++ vpxor \i*32(SRC), RNDKEYLAST, TMP0 ++ vaesenclast TMP0, AESDATA\i, AESDATA\i ++.endr ++.irp i, \vecs ++ vmovdqu AESDATA\i, \i*32(DST) ++.endr ++.endm ++ ++// void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// const u8 *src, u8 *dst, int datalen); ++// ++// This macro generates a GCM encryption or decryption update function with the ++// above prototype (with \enc selecting which one). The function computes the ++// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, ++// and writes the resulting encrypted or decrypted data to |dst|. It also ++// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext ++// bytes. ++// ++// |datalen| must be a multiple of 16, except on the last call where it can be ++// any length. The caller must do any buffering needed to ensure this. Both ++// in-place and out-of-place en/decryption are supported. ++// ++// |le_ctr| must give the current counter in little-endian format. This ++// function loads the counter from |le_ctr| and increments the loaded counter as ++// needed, but it does *not* store the updated counter back to |le_ctr|. The ++// caller must update |le_ctr| if any more data segments follow. Internally, ++// only the low 32-bit word of the counter is incremented, following the GCM ++// standard. ++.macro _aes_gcm_update enc ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set LE_CTR_PTR, %rsi ++ .set LE_CTR_PTR32, %esi ++ .set GHASH_ACC_PTR, %rdx ++ .set SRC, %rcx // Assumed to be %rcx. ++ // See .Ltail_xor_and_ghash_partial_vec ++ .set DST, %r8 ++ .set DATALEN, %r9d ++ .set DATALEN64, %r9 // Zero-extend DATALEN before using! ++ ++ // Additional local variables ++ ++ // %rax is used as a temporary register. LE_CTR_PTR is also available ++ // as a temporary register after the counter is loaded. 
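The _aesenclast_and_xor macro relies on the identity vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), which holds because the last AES round is SubBytes and ShiftRows followed by a plain XOR with the round key. A small user-space check of that property using AES-NI intrinsics, for illustration only (compile with -maes):

#include <stdio.h>
#include <string.h>
#include <immintrin.h>  /* AES-NI intrinsics */

int main(void)
{
        __m128i state = _mm_set_epi32(0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff);
        __m128i rk    = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
        __m128i data  = _mm_set_epi32(0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210);

        /* Conventional order: finish the last AES round, then XOR the data. */
        __m128i a = _mm_xor_si128(_mm_aesenclast_si128(state, rk), data);

        /* Order used by the assembly: fold the data into the round key first,
         * so the final XOR is hidden inside the vaesenclast itself. */
        __m128i b = _mm_aesenclast_si128(state, _mm_xor_si128(rk, data));

        printf("identity holds: %d\n", !memcmp(&a, &b, sizeof(a)));
        return 0;
}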
++ ++ // AES key length in bytes ++ .set AESKEYLEN, %r10d ++ .set AESKEYLEN64, %r10 ++ ++ // Pointer to the last AES round key for the chosen AES variant ++ .set RNDKEYLAST_PTR, %r11 ++ ++ // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values ++ // using vpshufb, copied to all 128-bit lanes. ++ .set BSWAP_MASK, %ymm0 ++ .set BSWAP_MASK_XMM, %xmm0 ++ ++ // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, ++ // only the lowest 128-bit lane can be nonzero. When not fully reduced, ++ // more than one lane may be used, and they need to be XOR'd together. ++ .set GHASH_ACC, %ymm1 ++ .set GHASH_ACC_XMM, %xmm1 ++ ++ // TMP[0-2] are temporary registers. ++ .set TMP0, %ymm2 ++ .set TMP0_XMM, %xmm2 ++ .set TMP1, %ymm3 ++ .set TMP1_XMM, %xmm3 ++ .set TMP2, %ymm4 ++ .set TMP2_XMM, %xmm4 ++ ++ // LO and MI are used to accumulate unreduced GHASH products. ++ .set LO, %ymm5 ++ .set LO_XMM, %xmm5 ++ .set MI, %ymm6 ++ .set MI_XMM, %xmm6 ++ ++ // H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored. The ++ // descending numbering reflects the order of the key powers. ++ .set H_POW2_XORED, %ymm7 ++ .set H_POW2_XORED_XMM, %xmm7 ++ .set H_POW1_XORED, %ymm8 ++ .set H_POW1_XORED_XMM, %xmm8 ++ ++ // RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. ++ .set RNDKEY0, %ymm9 ++ .set RNDKEYLAST, %ymm10 ++ ++ // LE_CTR contains the next set of little-endian counter blocks. ++ .set LE_CTR, %ymm11 ++ ++ // AESDATA[0-3] hold the counter blocks that are being encrypted by AES. ++ .set AESDATA0, %ymm12 ++ .set AESDATA0_XMM, %xmm12 ++ .set AESDATA1, %ymm13 ++ .set AESDATA1_XMM, %xmm13 ++ .set AESDATA2, %ymm14 ++ .set AESDATA2_XMM, %xmm14 ++ .set AESDATA3, %ymm15 ++ .set AESDATA3_XMM, %xmm15 ++ ++.if \enc ++ .set GHASHDATA_PTR, DST ++.else ++ .set GHASHDATA_PTR, SRC ++.endif ++ ++ vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK ++ ++ // Load the GHASH accumulator and the starting counter. ++ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM ++ vbroadcasti128 (LE_CTR_PTR), LE_CTR ++ ++ // Load the AES key length in bytes. ++ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN ++ ++ // Make RNDKEYLAST_PTR point to the last AES round key. This is the ++ // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 ++ // respectively. Then load the zero-th and last round keys. ++ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR ++ vbroadcasti128 (KEY), RNDKEY0 ++ vbroadcasti128 (RNDKEYLAST_PTR), RNDKEYLAST ++ ++ // Finish initializing LE_CTR by adding 1 to the second block. ++ vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR ++ ++ // If there are at least 128 bytes of data, then continue into the loop ++ // that processes 128 bytes of data at a time. Otherwise skip it. ++ add $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte ++ jl .Lcrypt_loop_4x_done\@ ++ ++ vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED ++ vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED ++ ++ // Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. ++ ++.if \enc ++ // Encrypt the first 4 vectors of plaintext blocks. ++ _aesenc_loop 0,1,2,3 ++ _aesenclast_and_xor 0,1,2,3 ++ sub $-128, SRC // 128 is 4 bytes, -128 is 1 byte ++ add $-128, DATALEN ++ jl .Lghash_last_ciphertext_4x\@ ++.endif ++ ++.align 16 ++.Lcrypt_loop_4x\@: ++ ++ // Start the AES encryption of the counter blocks. ++ _ctr_begin 0,1,2,3 ++ cmp $24, AESKEYLEN ++ jl 128f // AES-128? ++ je 192f // AES-192? 
++ // AES-256 ++ vbroadcasti128 -13*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++ vbroadcasti128 -12*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++192: ++ vbroadcasti128 -11*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++ vbroadcasti128 -10*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++128: ++ ++ // Finish the AES encryption of the counter blocks in AESDATA[0-3], ++ // interleaved with the GHASH update of the ciphertext blocks. ++.irp i, 9,8,7,6,5,4,3,2,1 ++ _ghash_step_4x (9 - \i), GHASHDATA_PTR ++ vbroadcasti128 -\i*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++.endr ++ _ghash_step_4x 9, GHASHDATA_PTR ++.if \enc ++ sub $-128, DST // 128 is 4 bytes, -128 is 1 byte ++.endif ++ _aesenclast_and_xor 0,1,2,3 ++ sub $-128, SRC ++.if !\enc ++ sub $-128, DST ++.endif ++ add $-128, DATALEN ++ jge .Lcrypt_loop_4x\@ ++ ++.if \enc ++.Lghash_last_ciphertext_4x\@: ++ // Update GHASH with the last set of ciphertext blocks. ++ _ghash_4x DST ++ sub $-128, DST ++.endif ++ ++.Lcrypt_loop_4x_done\@: ++ ++ // Undo the extra subtraction by 128 and check whether data remains. ++ sub $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte ++ jz .Ldone\@ ++ ++ // The data length isn't a multiple of 128 bytes. Process the remaining ++ // data of length 1 <= DATALEN < 128. ++ // ++ // Since there are enough key powers available for all remaining data, ++ // there is no need to do a GHASH reduction after each iteration. ++ // Instead, multiply each remaining block by its own key power, and only ++ // do a GHASH reduction at the very end. ++ ++ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N ++ // is the number of blocks that remain. ++ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. ++ .set POWERS_PTR32, LE_CTR_PTR32 ++ mov DATALEN, %eax ++ neg %rax ++ and $~15, %rax // -round_up(DATALEN, 16) ++ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR ++ ++ // Start collecting the unreduced GHASH intermediate value LO, MI, HI. ++ .set HI, H_POW2_XORED // H_POW2_XORED is free to be reused. ++ .set HI_XMM, H_POW2_XORED_XMM ++ vpxor LO_XMM, LO_XMM, LO_XMM ++ vpxor MI_XMM, MI_XMM, MI_XMM ++ vpxor HI_XMM, HI_XMM, HI_XMM ++ ++ // 1 <= DATALEN < 128. Generate 2 or 4 more vectors of keystream blocks ++ // excluding the last AES round, depending on the remaining DATALEN. ++ cmp $64, DATALEN ++ jg .Ltail_gen_4_keystream_vecs\@ ++ _aesenc_loop 0,1 ++ cmp $32, DATALEN ++ jge .Ltail_xor_and_ghash_full_vec_loop\@ ++ jmp .Ltail_xor_and_ghash_partial_vec\@ ++.Ltail_gen_4_keystream_vecs\@: ++ _aesenc_loop 0,1,2,3 ++ ++ // XOR the remaining data and accumulate the unreduced GHASH products ++ // for DATALEN >= 32, starting with one full 32-byte vector at a time. ++.Ltail_xor_and_ghash_full_vec_loop\@: ++.if \enc ++ _aesenclast_and_xor 0 ++ vpshufb BSWAP_MASK, AESDATA0, AESDATA0 ++.else ++ vmovdqu (SRC), TMP1 ++ vpxor TMP1, RNDKEYLAST, TMP0 ++ vaesenclast TMP0, AESDATA0, AESDATA0 ++ vmovdqu AESDATA0, (DST) ++ vpshufb BSWAP_MASK, TMP1, AESDATA0 ++.endif ++ // The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0. 
++ vpxor GHASH_ACC, AESDATA0, AESDATA0 ++ vmovdqu (POWERS_PTR), TMP2 ++ _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 ++ vmovdqa AESDATA1, AESDATA0 ++ vmovdqa AESDATA2, AESDATA1 ++ vmovdqa AESDATA3, AESDATA2 ++ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ add $32, SRC ++ add $32, DST ++ add $32, POWERS_PTR ++ sub $32, DATALEN ++ cmp $32, DATALEN ++ jge .Ltail_xor_and_ghash_full_vec_loop\@ ++ test DATALEN, DATALEN ++ jz .Ltail_ghash_reduce\@ ++ ++.Ltail_xor_and_ghash_partial_vec\@: ++ // XOR the remaining data and accumulate the unreduced GHASH products, ++ // for 1 <= DATALEN < 32. ++ vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 ++ cmp $16, DATALEN ++ jle .Ltail_xor_and_ghash_1to16bytes\@ ++ ++ // Handle 17 <= DATALEN < 32. ++ ++ // Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes ++ // (shifting in zeroes), then reflect all 16 bytes. ++ lea .Lrshift_and_bswap_table(%rip), %rax ++ vmovdqu -16(%rax, DATALEN64), TMP2_XMM ++ ++ // Move the second keystream block to its own register and left-align it ++ vextracti128 $1, AESDATA0, AESDATA1_XMM ++ vpxor .Lfifteens(%rip), TMP2_XMM, TMP0_XMM ++ vpshufb TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM ++ ++ // Using overlapping loads and stores, XOR the source data with the ++ // keystream and write the destination data. Then prepare the GHASH ++ // input data: the full ciphertext block and the zero-padded partial ++ // ciphertext block, both byte-reflected, in AESDATA0. ++.if \enc ++ vpxor -16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM ++ vpxor (SRC), AESDATA0_XMM, AESDATA0_XMM ++ vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) ++ vmovdqu AESDATA0_XMM, (DST) ++ vpshufb TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM ++ vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM ++.else ++ vmovdqu -16(SRC, DATALEN64), TMP1_XMM ++ vmovdqu (SRC), TMP0_XMM ++ vpxor TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM ++ vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM ++ vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) ++ vmovdqu AESDATA0_XMM, (DST) ++ vpshufb TMP2_XMM, TMP1_XMM, AESDATA1_XMM ++ vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM ++.endif ++ vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM ++ vinserti128 $1, AESDATA1_XMM, AESDATA0, AESDATA0 ++ vmovdqu (POWERS_PTR), TMP2 ++ jmp .Ltail_ghash_last_vec\@ ++ ++.Ltail_xor_and_ghash_1to16bytes\@: ++ // Handle 1 <= DATALEN <= 16. Carefully load and store the ++ // possibly-partial block, which we mustn't access out of bounds. ++ vmovdqu (POWERS_PTR), TMP2_XMM ++ mov SRC, KEY // Free up %rcx, assuming SRC == %rcx ++ mov DATALEN, %ecx ++ _load_partial_block KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32 ++ vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM ++ mov DATALEN, %ecx ++ _store_partial_block AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32 ++.if \enc ++ lea .Lselect_high_bytes_table(%rip), %rax ++ vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM ++ vpand (%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM ++.else ++ vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM ++.endif ++ vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM ++ ++.Ltail_ghash_last_vec\@: ++ // Accumulate the unreduced GHASH products for the last 1-2 blocks. The ++ // GHASH input data is in AESDATA0. If only one block remains, then the ++ // second block in AESDATA0 is zero and does not affect the result. ++ _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 ++ ++.Ltail_ghash_reduce\@: ++ // Finally, do the GHASH reduction. 
++ vbroadcasti128 .Lgfpoly(%rip), TMP0 ++ _ghash_reduce LO, MI, HI, TMP0, TMP1 ++ vextracti128 $1, HI, GHASH_ACC_XMM ++ vpxor HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ ++.Ldone\@: ++ // Store the updated GHASH accumulator back to memory. ++ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) ++ ++ vzeroupper ++ RET ++.endm ++ ++// void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen); ++// bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++// const u32 le_ctr[4], const u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen, ++// const u8 tag[16], int taglen); ++// ++// This macro generates one of the above two functions (with \enc selecting ++// which one). Both functions finish computing the GCM authentication tag by ++// updating GHASH with the lengths block and encrypting the GHASH accumulator. ++// |total_aadlen| and |total_datalen| must be the total length of the additional ++// authenticated data and the en/decrypted data in bytes, respectively. ++// ++// The encryption function then stores the full-length (16-byte) computed ++// authentication tag to |ghash_acc|. The decryption function instead loads the ++// expected authentication tag (the one that was transmitted) from the 16-byte ++// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the ++// computed tag in constant time, and returns true if and only if they match. ++.macro _aes_gcm_final enc ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set LE_CTR_PTR, %rsi ++ .set GHASH_ACC_PTR, %rdx ++ .set TOTAL_AADLEN, %rcx ++ .set TOTAL_DATALEN, %r8 ++ .set TAG, %r9 ++ .set TAGLEN, %r10d // Originally at 8(%rsp) ++ .set TAGLEN64, %r10 ++ ++ // Additional local variables. ++ // %rax and %xmm0-%xmm3 are used as temporary registers. ++ .set AESKEYLEN, %r11d ++ .set AESKEYLEN64, %r11 ++ .set GFPOLY, %xmm4 ++ .set BSWAP_MASK, %xmm5 ++ .set LE_CTR, %xmm6 ++ .set GHASH_ACC, %xmm7 ++ .set H_POW1, %xmm8 ++ ++ // Load some constants. ++ vmovdqa .Lgfpoly(%rip), GFPOLY ++ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK ++ ++ // Load the AES key length in bytes. ++ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN ++ ++ // Set up a counter block with 1 in the low 32-bit word. This is the ++ // counter that produces the ciphertext needed to encrypt the auth tag. ++ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. ++ vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR ++ ++ // Build the lengths block and XOR it with the GHASH accumulator. ++ // Although the lengths block is defined as the AAD length followed by ++ // the en/decrypted data length, both in big-endian byte order, a byte ++ // reflection of the full block is needed because of the way we compute ++ // GHASH (see _ghash_mul_step). By using little-endian values in the ++ // opposite order, we avoid having to reflect any bytes here. ++ vmovq TOTAL_DATALEN, %xmm0 ++ vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 ++ vpsllq $3, %xmm0, %xmm0 // Bytes to bits ++ vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC ++ ++ // Load the first hash key power (H^1), which is stored last. ++ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1 ++ ++ // Load TAGLEN if decrypting. ++.if !\enc ++ movl 8(%rsp), TAGLEN ++.endif ++ ++ // Make %rax point to the last AES round key for the chosen AES variant. ++ lea 6*16(KEY,AESKEYLEN64,4), %rax ++ ++ // Start the AES encryption of the counter block by swapping the counter ++ // block to big-endian and XOR-ing it with the zero-th AES round key. 
++ vpshufb BSWAP_MASK, LE_CTR, %xmm0 ++ vpxor (KEY), %xmm0, %xmm0 ++ ++ // Complete the AES encryption and multiply GHASH_ACC by H^1. ++ // Interleave the AES and GHASH instructions to improve performance. ++ cmp $24, AESKEYLEN ++ jl 128f // AES-128? ++ je 192f // AES-192? ++ // AES-256 ++ vaesenc -13*16(%rax), %xmm0, %xmm0 ++ vaesenc -12*16(%rax), %xmm0, %xmm0 ++192: ++ vaesenc -11*16(%rax), %xmm0, %xmm0 ++ vaesenc -10*16(%rax), %xmm0, %xmm0 ++128: ++.irp i, 0,1,2,3,4,5,6,7,8 ++ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ %xmm1, %xmm2, %xmm3 ++ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 ++.endr ++ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ %xmm1, %xmm2, %xmm3 ++ ++ // Undo the byte reflection of the GHASH accumulator. ++ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC ++ ++ // Do the last AES round and XOR the resulting keystream block with the ++ // GHASH accumulator to produce the full computed authentication tag. ++ // ++ // Reduce latency by taking advantage of the property vaesenclast(key, ++ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last ++ // round key, instead of XOR'ing the final AES output with GHASH_ACC. ++ // ++ // enc_final then returns the computed auth tag, while dec_final ++ // compares it with the transmitted one and returns a bool. To compare ++ // the tags, dec_final XORs them together and uses vptest to check ++ // whether the result is all-zeroes. This should be constant-time. ++ // dec_final applies the vaesenclast optimization to this additional ++ // value XOR'd too. ++.if \enc ++ vpxor (%rax), GHASH_ACC, %xmm1 ++ vaesenclast %xmm1, %xmm0, GHASH_ACC ++ vmovdqu GHASH_ACC, (GHASH_ACC_PTR) ++.else ++ vpxor (TAG), GHASH_ACC, GHASH_ACC ++ vpxor (%rax), GHASH_ACC, GHASH_ACC ++ vaesenclast GHASH_ACC, %xmm0, %xmm0 ++ lea .Lselect_high_bytes_table(%rip), %rax ++ vmovdqu (%rax, TAGLEN64), %xmm1 ++ vpshufb BSWAP_MASK, %xmm1, %xmm1 // select low bytes, not high ++ vptest %xmm1, %xmm0 ++ sete %al ++.endif ++ // No need for vzeroupper here, since only used xmm registers were used. ++ RET ++.endm ++ ++SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2) ++ _aes_gcm_update 1 ++SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2) ++SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2) ++ _aes_gcm_update 0 ++SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2) ++ ++SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2) ++ _aes_gcm_final 1 ++SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2) ++SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2) ++ _aes_gcm_final 0 ++SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2) +diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S +similarity index 69% +rename from arch/x86/crypto/aes-gcm-avx10-x86_64.S +rename to arch/x86/crypto/aes-gcm-vaes-avx512.S +index 02ee11083d4f..06b71314d65c 100644 +--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S ++++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ + // +-// VAES and VPCLMULQDQ optimized AES-GCM for x86_64 ++// AES-GCM implementation for x86_64 CPUs that support the following CPU ++// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 + // + // Copyright 2024 Google LLC + // +@@ -45,41 +46,6 @@ + // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + // POSSIBILITY OF SUCH DAMAGE. 
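For the tag check in dec_final, the key point is that the comparison of the first taglen bytes is branch-free: the two tags are XOR'd, masked to taglen bytes, and tested for all-zeroes with vptest. The same idea in plain C, similar in spirit to the kernel's crypto_memneq(); the helper name here is illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Compare the first taglen bytes of the computed and transmitted tags
 * without branching on secret data: accumulate the differences with OR,
 * then test the accumulator once. */
static int gcm_tag_equal(const uint8_t *computed, const uint8_t *transmitted,
                         size_t taglen)
{
        uint8_t diff = 0;
        size_t i;

        for (i = 0; i < taglen; i++)
                diff |= computed[i] ^ transmitted[i];
        return diff == 0;
}

int main(void)
{
        uint8_t want[16] = { 0xde, 0xad, 0xbe, 0xef };
        uint8_t got[16]  = { 0xde, 0xad, 0xbe, 0xef };

        printf("match: %d\n", gcm_tag_equal(got, want, 4));
        got[3] ^= 1;
        printf("match after corruption: %d\n", gcm_tag_equal(got, want, 4));
        return 0;
}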
+-// +-//------------------------------------------------------------------------------ +-// +-// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +-// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and +-// either AVX512 or AVX10. Some of the functions, notably the encryption and +-// decryption update functions which are the most performance-critical, are +-// provided in two variants generated from a macro: one using 256-bit vectors +-// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The +-// other, "shared" functions (vaes_avx10) use at most 256-bit vectors. +-// +-// The functions that use 512-bit vectors are intended for CPUs that support +-// 512-bit vectors *and* where using them doesn't cause significant +-// downclocking. They require the following CPU features: +-// +-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) +-// +-// The other functions require the following CPU features: +-// +-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) +-// +-// All functions use the "System V" ABI. The Windows ABI is not supported. +-// +-// Note that we use "avx10" in the names of the functions as a shorthand to +-// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's +-// introduction of AVX512 and then its replacement by AVX10, there doesn't seem +-// to be a simple way to name things that makes sense on all CPUs. +-// +-// Note that the macros that support both 256-bit and 512-bit vectors could +-// fairly easily be changed to support 128-bit too. However, this would *not* +-// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, +-// because the code heavily uses several features of these extensions other than +-// the vector length: the increase in the number of SIMD registers from 16 to +-// 32, masking support, and new instructions such as vpternlogd (which can do a +-// three-argument XOR). These features are very useful for AES-GCM. + + #include + +@@ -104,16 +70,14 @@ + .Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + +- // The below constants are used for incrementing the counter blocks. +- // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. +- // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and +- // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. ++ // Values needed to prepare the initial vector of counter blocks. + .Lctr_pattern: + .octa 0 + .octa 1 +-.Linc_2blocks: + .octa 2 + .octa 3 ++ ++ // The number of AES blocks per vector, as a 128-bit value. + .Linc_4blocks: + .octa 4 + +@@ -130,29 +94,13 @@ + // Offset to end of hash key powers array in the key struct. + // + // This is immediately followed by three zeroized padding blocks, which are +-// included so that partial vectors can be handled more easily. E.g. if VL=64 +-// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most +-// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. ++// included so that partial vectors can be handled more easily. E.g. if two ++// blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most padding ++// blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. + #define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) + + .text + +-// Set the vector length in bytes. This sets the VL variable and defines +-// register aliases V0-V31 that map to the ymm or zmm registers. 
+-.macro _set_veclen vl +- .set VL, \vl +-.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ +- 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +-.if VL == 32 +- .set V\i, %ymm\i +-.elseif VL == 64 +- .set V\i, %zmm\i +-.else +- .error "Unsupported vector length" +-.endif +-.endr +-.endm +- + // The _ghash_mul_step macro does one step of GHASH multiplication of the + // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the + // reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the +@@ -312,39 +260,44 @@ + vpternlogd $0x96, \t0, \mi, \hi + .endm + +-// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); +-// +-// Given the expanded AES key |key->aes_key|, this function derives the GHASH +-// subkey and initializes |key->ghash_key_powers| with powers of it. +-// +-// The number of key powers initialized is NUM_H_POWERS, and they are stored in +-// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key +-// powers themselves are also initialized. ++// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it ++// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. ++.macro _ghash_square a, dst, gfpoly, t0, t1 ++ vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L ++ vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H ++ vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) ++ vpshufd $0x4e, \t0, \t0 // Swap halves of LO ++ vpxord \t0, \t1, \t1 // Fold LO into MI ++ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) ++ vpshufd $0x4e, \t1, \t1 // Swap halves of MI ++ vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI ++.endm ++ ++// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); + // +-// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked +-// with the desired length. In the VL=32 case, the function computes twice as +-// many key powers than are actually used by the VL=32 GCM update functions. +-// This is done to keep the key format the same regardless of vector length. +-.macro _aes_gcm_precompute ++// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and ++// initialize |key->h_powers| and |key->padding|. ++SYM_FUNC_START(aes_gcm_precompute_vaes_avx512) + + // Function arguments + .set KEY, %rdi + +- // Additional local variables. V0-V2 and %rax are used as temporaries. ++ // Additional local variables. ++ // %zmm[0-2] and %rax are used as temporaries. + .set POWERS_PTR, %rsi + .set RNDKEYLAST_PTR, %rdx +- .set H_CUR, V3 ++ .set H_CUR, %zmm3 + .set H_CUR_YMM, %ymm3 + .set H_CUR_XMM, %xmm3 +- .set H_INC, V4 ++ .set H_INC, %zmm4 + .set H_INC_YMM, %ymm4 + .set H_INC_XMM, %xmm4 +- .set GFPOLY, V5 ++ .set GFPOLY, %zmm5 + .set GFPOLY_YMM, %ymm5 + .set GFPOLY_XMM, %xmm5 + + // Get pointer to lowest set of key powers (located at end of array). +- lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR ++ lea OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax +@@ -363,8 +316,8 @@ + + // Zeroize the padding blocks. + vpxor %xmm0, %xmm0, %xmm0 +- vmovdqu %ymm0, VL(POWERS_PTR) +- vmovdqu %xmm0, VL+2*16(POWERS_PTR) ++ vmovdqu %ymm0, 64(POWERS_PTR) ++ vmovdqu %xmm0, 64+2*16(POWERS_PTR) + + // Finish preprocessing the first key power, H^1. 
Since this GHASH + // implementation operates directly on values with the backwards bit +@@ -397,54 +350,44 @@ + // special needs to be done to make this happen, though: H^1 * H^1 would + // end up with two factors of x^-1, but the multiplication consumes one. + // So the product H^2 ends up with the desired one factor of x^-1. +- _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ +- %xmm0, %xmm1, %xmm2 ++ _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1 + + // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. + vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM + vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM + +-.if VL == 64 + // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. + _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ + %ymm0, %ymm1, %ymm2 + vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR + vshufi64x2 $0, H_INC, H_INC, H_INC +-.endif + + // Store the lowest set of key powers. + vmovdqu8 H_CUR, (POWERS_PTR) + +- // Compute and store the remaining key powers. With VL=32, repeatedly +- // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. +- // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by ++ // Compute and store the remaining key powers. ++ // Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by + // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. +- mov $(NUM_H_POWERS*16/VL) - 1, %eax +-.Lprecompute_next\@: +- sub $VL, POWERS_PTR +- _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 ++ mov $3, %eax ++.Lprecompute_next: ++ sub $64, POWERS_PTR ++ _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2 + vmovdqu8 H_CUR, (POWERS_PTR) + dec %eax +- jnz .Lprecompute_next\@ ++ jnz .Lprecompute_next + + vzeroupper // This is needed after using ymm or zmm registers. + RET +-.endm ++SYM_FUNC_END(aes_gcm_precompute_vaes_avx512) + + // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store + // the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. + .macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm + vextracti32x4 $1, \src, \t0_xmm +-.if VL == 32 +- vpxord \t0_xmm, \src_xmm, \dst_xmm +-.elseif VL == 64 + vextracti32x4 $2, \src, \t1_xmm + vextracti32x4 $3, \src, \t2_xmm + vpxord \t0_xmm, \src_xmm, \dst_xmm + vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm +-.else +- .error "Unsupported vector length" +-.endif + .endm + + // Do one step of the GHASH update of the data blocks given in the vector +@@ -458,25 +401,21 @@ + // + // The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + + // H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the +-// operations are vectorized operations on vectors of 16-byte blocks. E.g., +-// with VL=32 there are 2 blocks per vector and the vectorized terms correspond +-// to the following non-vectorized terms: +-// +-// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) +-// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 +-// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 +-// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 ++// operations are vectorized operations on 512-bit vectors of 128-bit blocks. ++// The vectorized terms correspond to the following non-vectorized terms: + // +-// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. 
++// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM), ++// H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0) ++// H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7 ++// H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11 ++// H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15 + // + // More concretely, this code does: + // - Do vectorized "schoolbook" multiplications to compute the intermediate + // 256-bit product of each block and its corresponding hash key power. +-// There are 4*VL/16 of these intermediate products. +-// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves +-// VL/16 256-bit intermediate values. ++// - Sum (XOR) the intermediate 256-bit products across vectors. + // - Do a vectorized reduction of these 256-bit intermediate values to +-// 128-bits each. This leaves VL/16 128-bit intermediate values. ++// 128-bits each. + // - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. + // + // See _ghash_mul_step for the full explanation of the operations performed for +@@ -532,85 +471,224 @@ + .endif + .endm + +-// Do one non-last round of AES encryption on the counter blocks in V0-V3 using +-// the round key that has been broadcast to all 128-bit lanes of \round_key. ++// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full ++// explanation. ++.macro _ghash_4x ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_step_4x \i ++.endr ++.endm ++ ++// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++// u8 ghash_acc[16], ++// const u8 *aad, int aadlen); ++// ++// This function processes the AAD (Additional Authenticated Data) in GCM. ++// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the ++// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all ++// zeroes. |aadlen| must be a multiple of 16, except on the last call where it ++// can be any length. The caller must do any buffering needed to ensure this. ++// ++// This handles large amounts of AAD efficiently, while also keeping overhead ++// low for small amounts which is the common case. TLS and IPsec use less than ++// one block of AAD, but (uncommonly) other use cases may use much more. ++SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512) ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set GHASH_ACC_PTR, %rsi ++ .set AAD, %rdx ++ .set AADLEN, %ecx ++ .set AADLEN64, %rcx // Zero-extend AADLEN before using! ++ ++ // Additional local variables. ++ // %rax and %k1 are used as temporary registers. ++ .set GHASHDATA0, %zmm0 ++ .set GHASHDATA0_XMM, %xmm0 ++ .set GHASHDATA1, %zmm1 ++ .set GHASHDATA1_XMM, %xmm1 ++ .set GHASHDATA2, %zmm2 ++ .set GHASHDATA2_XMM, %xmm2 ++ .set GHASHDATA3, %zmm3 ++ .set BSWAP_MASK, %zmm4 ++ .set BSWAP_MASK_XMM, %xmm4 ++ .set GHASH_ACC, %zmm5 ++ .set GHASH_ACC_XMM, %xmm5 ++ .set H_POW4, %zmm6 ++ .set H_POW3, %zmm7 ++ .set H_POW2, %zmm8 ++ .set H_POW1, %zmm9 ++ .set H_POW1_XMM, %xmm9 ++ .set GFPOLY, %zmm10 ++ .set GFPOLY_XMM, %xmm10 ++ .set GHASHTMP0, %zmm11 ++ .set GHASHTMP1, %zmm12 ++ .set GHASHTMP2, %zmm13 ++ ++ // Load the GHASH accumulator. ++ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM ++ ++ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. ++ cmp $16, AADLEN ++ jg .Laad_more_than_16bytes ++ test AADLEN, AADLEN ++ jz .Laad_done ++ ++ // Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD. 
++ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM ++ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM ++ mov $-1, %eax ++ bzhi AADLEN, %eax, %eax ++ kmovd %eax, %k1 ++ vmovdqu8 (AAD), GHASHDATA0_XMM{%k1}{z} ++ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM ++ vpshufb BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM ++ vpxor GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ _ghash_mul H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ ++ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM ++ jmp .Laad_done ++ ++.Laad_more_than_16bytes: ++ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK ++ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY ++ ++ // If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time. ++ sub $256, AADLEN ++ jl .Laad_loop_4x_done ++ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 ++ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 ++ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 ++ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 ++.Laad_loop_4x: ++ vmovdqu8 0*64(AAD), GHASHDATA0 ++ vmovdqu8 1*64(AAD), GHASHDATA1 ++ vmovdqu8 2*64(AAD), GHASHDATA2 ++ vmovdqu8 3*64(AAD), GHASHDATA3 ++ _ghash_4x ++ add $256, AAD ++ sub $256, AADLEN ++ jge .Laad_loop_4x ++.Laad_loop_4x_done: ++ ++ // If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time. ++ add $192, AADLEN ++ jl .Laad_loop_1x_done ++ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 ++.Laad_loop_1x: ++ vmovdqu8 (AAD), GHASHDATA0 ++ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 ++ vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC ++ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ GHASHDATA0, GHASHDATA1, GHASHDATA2 ++ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ ++ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM ++ add $64, AAD ++ sub $64, AADLEN ++ jge .Laad_loop_1x ++.Laad_loop_1x_done: ++ ++ // Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD. ++ add $64, AADLEN ++ jz .Laad_done ++ mov $-1, %rax ++ bzhi AADLEN64, %rax, %rax ++ kmovq %rax, %k1 ++ vmovdqu8 (AAD), GHASHDATA0{%k1}{z} ++ neg AADLEN64 ++ and $~15, AADLEN64 // -round_up(AADLEN, 16) ++ vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 ++ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 ++ vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC ++ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ GHASHDATA0, GHASHDATA1, GHASHDATA2 ++ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ ++ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM ++ ++.Laad_done: ++ // Store the updated GHASH accumulator back to memory. ++ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) ++ ++ vzeroupper // This is needed after using ymm or zmm registers. ++ RET ++SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512) ++ ++// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the ++// round key that has been broadcast to all 128-bit lanes of \round_key. + .macro _vaesenc_4x round_key +- vaesenc \round_key, V0, V0 +- vaesenc \round_key, V1, V1 +- vaesenc \round_key, V2, V2 +- vaesenc \round_key, V3, V3 ++ vaesenc \round_key, %zmm0, %zmm0 ++ vaesenc \round_key, %zmm1, %zmm1 ++ vaesenc \round_key, %zmm2, %zmm2 ++ vaesenc \round_key, %zmm3, %zmm3 + .endm + + // Start the AES encryption of four vectors of counter blocks. + .macro _ctr_begin_4x + + // Increment LE_CTR four times to generate four vectors of little-endian +- // counter blocks, swap each to big-endian, and store them in V0-V3. +- vpshufb BSWAP_MASK, LE_CTR, V0 ++ // counter blocks, swap each to big-endian, and store them in %zmm[0-3]. 
++ vpshufb BSWAP_MASK, LE_CTR, %zmm0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR +- vpshufb BSWAP_MASK, LE_CTR, V1 ++ vpshufb BSWAP_MASK, LE_CTR, %zmm1 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR +- vpshufb BSWAP_MASK, LE_CTR, V2 ++ vpshufb BSWAP_MASK, LE_CTR, %zmm2 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR +- vpshufb BSWAP_MASK, LE_CTR, V3 ++ vpshufb BSWAP_MASK, LE_CTR, %zmm3 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + + // AES "round zero": XOR in the zero-th round key. +- vpxord RNDKEY0, V0, V0 +- vpxord RNDKEY0, V1, V1 +- vpxord RNDKEY0, V2, V2 +- vpxord RNDKEY0, V3, V3 ++ vpxord RNDKEY0, %zmm0, %zmm0 ++ vpxord RNDKEY0, %zmm1, %zmm1 ++ vpxord RNDKEY0, %zmm2, %zmm2 ++ vpxord RNDKEY0, %zmm3, %zmm3 + .endm + +-// Do the last AES round for four vectors of counter blocks V0-V3, XOR source +-// data with the resulting keystream, and write the result to DST and ++// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR ++// source data with the resulting keystream, and write the result to DST and + // GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) + .macro _aesenclast_and_xor_4x + // XOR the source data with the last round key, saving the result in + // GHASHDATA[0-3]. This reduces latency by taking advantage of the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +- vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 +- vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 +- vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 +- vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 ++ vpxord 0*64(SRC), RNDKEYLAST, GHASHDATA0 ++ vpxord 1*64(SRC), RNDKEYLAST, GHASHDATA1 ++ vpxord 2*64(SRC), RNDKEYLAST, GHASHDATA2 ++ vpxord 3*64(SRC), RNDKEYLAST, GHASHDATA3 + + // Do the last AES round. This handles the XOR with the source data + // too, as per the optimization described above. +- vaesenclast GHASHDATA0, V0, GHASHDATA0 +- vaesenclast GHASHDATA1, V1, GHASHDATA1 +- vaesenclast GHASHDATA2, V2, GHASHDATA2 +- vaesenclast GHASHDATA3, V3, GHASHDATA3 ++ vaesenclast GHASHDATA0, %zmm0, GHASHDATA0 ++ vaesenclast GHASHDATA1, %zmm1, GHASHDATA1 ++ vaesenclast GHASHDATA2, %zmm2, GHASHDATA2 ++ vaesenclast GHASHDATA3, %zmm3, GHASHDATA3 + + // Store the en/decrypted data to DST. +- vmovdqu8 GHASHDATA0, 0*VL(DST) +- vmovdqu8 GHASHDATA1, 1*VL(DST) +- vmovdqu8 GHASHDATA2, 2*VL(DST) +- vmovdqu8 GHASHDATA3, 3*VL(DST) ++ vmovdqu8 GHASHDATA0, 0*64(DST) ++ vmovdqu8 GHASHDATA1, 1*64(DST) ++ vmovdqu8 GHASHDATA2, 2*64(DST) ++ vmovdqu8 GHASHDATA3, 3*64(DST) + .endm + +-// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, +-// const u32 le_ctr[4], u8 ghash_acc[16], +-// const u8 *src, u8 *dst, int datalen); ++// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// const u8 *src, u8 *dst, int datalen); + // + // This macro generates a GCM encryption or decryption update function with the +-// above prototype (with \enc selecting which one). This macro supports both +-// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. +-// +-// This function computes the next portion of the CTR keystream, XOR's it with +-// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +-// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +-// next |datalen| ciphertext bytes. ++// above prototype (with \enc selecting which one). 
The function computes the ++// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, ++// and writes the resulting encrypted or decrypted data to |dst|. It also ++// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext ++// bytes. + // + // |datalen| must be a multiple of 16, except on the last call where it can be + // any length. The caller must do any buffering needed to ensure this. Both + // in-place and out-of-place en/decryption are supported. + // +-// |le_ctr| must give the current counter in little-endian format. For a new +-// message, the low word of the counter must be 2. This function loads the +-// counter from |le_ctr| and increments the loaded counter as needed, but it +-// does *not* store the updated counter back to |le_ctr|. The caller must +-// update |le_ctr| if any more data segments follow. Internally, only the low +-// 32-bit word of the counter is incremented, following the GCM standard. ++// |le_ctr| must give the current counter in little-endian format. This ++// function loads the counter from |le_ctr| and increments the loaded counter as ++// needed, but it does *not* store the updated counter back to |le_ctr|. The ++// caller must update |le_ctr| if any more data segments follow. Internally, ++// only the low 32-bit word of the counter is incremented, following the GCM ++// standard. + .macro _aes_gcm_update enc + + // Function arguments +@@ -634,69 +712,69 @@ + // Pointer to the last AES round key for the chosen AES variant + .set RNDKEYLAST_PTR, %r11 + +- // In the main loop, V0-V3 are used as AES input and output. Elsewhere +- // they are used as temporary registers. ++ // In the main loop, %zmm[0-3] are used as AES input and output. ++ // Elsewhere they are used as temporary registers. + + // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. +- .set GHASHDATA0, V4 ++ .set GHASHDATA0, %zmm4 + .set GHASHDATA0_XMM, %xmm4 +- .set GHASHDATA1, V5 ++ .set GHASHDATA1, %zmm5 + .set GHASHDATA1_XMM, %xmm5 +- .set GHASHDATA2, V6 ++ .set GHASHDATA2, %zmm6 + .set GHASHDATA2_XMM, %xmm6 +- .set GHASHDATA3, V7 ++ .set GHASHDATA3, %zmm7 + + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + // using vpshufb, copied to all 128-bit lanes. +- .set BSWAP_MASK, V8 ++ .set BSWAP_MASK, %zmm8 + + // RNDKEY temporarily holds the next AES round key. +- .set RNDKEY, V9 ++ .set RNDKEY, %zmm9 + + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + // only the lowest 128-bit lane can be nonzero. When not fully reduced, + // more than one lane may be used, and they need to be XOR'd together. +- .set GHASH_ACC, V10 ++ .set GHASH_ACC, %zmm10 + .set GHASH_ACC_XMM, %xmm10 + + // LE_CTR_INC is the vector of 32-bit words that need to be added to a + // vector of little-endian counter blocks to advance it forwards. +- .set LE_CTR_INC, V11 ++ .set LE_CTR_INC, %zmm11 + + // LE_CTR contains the next set of little-endian counter blocks. +- .set LE_CTR, V12 ++ .set LE_CTR, %zmm12 + + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. 
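The last-round-key pointer is derived from the key length alone (see the lea 6*16(KEY,AESKEYLEN64,4) in the AVX2 update function above). A quick check of that arithmetic, shown here only as an aside:

#include <stdio.h>

/* AES-128/192/256 expand to 10/12/14 rounds, and 6*16 + 4*keylen equals
 * 16 * nrounds for keylen = 16/24/32, so a single lea of that form lands
 * on the last round key for every supported variant. */
int main(void)
{
        int keylen;

        for (keylen = 16; keylen <= 32; keylen += 8) {
                int nrounds = keylen / 4 + 6;

                printf("keylen=%2d rounds=%2d offset=%3d = 16*%d\n",
                       keylen, nrounds, 6 * 16 + 4 * keylen, nrounds);
        }
        return 0;
}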
+- .set RNDKEY0, V13 +- .set RNDKEYLAST, V14 +- .set RNDKEY_M9, V15 +- .set RNDKEY_M8, V16 +- .set RNDKEY_M7, V17 +- .set RNDKEY_M6, V18 +- .set RNDKEY_M5, V19 +- .set RNDKEY_M4, V20 +- .set RNDKEY_M3, V21 +- .set RNDKEY_M2, V22 +- .set RNDKEY_M1, V23 ++ .set RNDKEY0, %zmm13 ++ .set RNDKEYLAST, %zmm14 ++ .set RNDKEY_M9, %zmm15 ++ .set RNDKEY_M8, %zmm16 ++ .set RNDKEY_M7, %zmm17 ++ .set RNDKEY_M6, %zmm18 ++ .set RNDKEY_M5, %zmm19 ++ .set RNDKEY_M4, %zmm20 ++ .set RNDKEY_M3, %zmm21 ++ .set RNDKEY_M2, %zmm22 ++ .set RNDKEY_M1, %zmm23 + + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These + // cannot coincide with anything used for AES encryption, since for + // performance reasons GHASH and AES encryption are interleaved. +- .set GHASHTMP0, V24 +- .set GHASHTMP1, V25 +- .set GHASHTMP2, V26 ++ .set GHASHTMP0, %zmm24 ++ .set GHASHTMP1, %zmm25 ++ .set GHASHTMP2, %zmm26 + +- // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The ++ // H_POW[4-1] contain the powers of the hash key H^16...H^1. The + // descending numbering reflects the order of the key powers. +- .set H_POW4, V27 +- .set H_POW3, V28 +- .set H_POW2, V29 +- .set H_POW1, V30 ++ .set H_POW4, %zmm27 ++ .set H_POW3, %zmm28 ++ .set H_POW2, %zmm29 ++ .set H_POW1, %zmm30 + + // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. +- .set GFPOLY, V31 ++ .set GFPOLY, %zmm31 + + // Load some constants. + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK +@@ -719,29 +797,23 @@ + // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR + +- // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. +-.if VL == 32 +- vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC +-.elseif VL == 64 ++ // Load 4 into all 128-bit lanes of LE_CTR_INC. + vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC +-.else +- .error "Unsupported vector length" +-.endif + +- // If there are at least 4*VL bytes of data, then continue into the loop +- // that processes 4*VL bytes of data at a time. Otherwise skip it. ++ // If there are at least 256 bytes of data, then continue into the loop ++ // that processes 256 bytes of data at a time. Otherwise skip it. + // +- // Pre-subtracting 4*VL from DATALEN saves an instruction from the main ++ // Pre-subtracting 256 from DATALEN saves an instruction from the main + // loop and also ensures that at least one write always occurs to + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. +- add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 ++ sub $256, DATALEN + jl .Lcrypt_loop_4x_done\@ + + // Load powers of the hash key. +- vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 +- vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 +- vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 +- vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 ++ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 ++ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 ++ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 ++ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 + + // Main loop: en/decrypt and hash 4 vectors at a time. + // +@@ -770,9 +842,9 @@ + cmp %rax, RNDKEYLAST_PTR + jne 1b + _aesenclast_and_xor_4x +- sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 +- sub $-4*VL, DST +- add $-4*VL, DATALEN ++ add $256, SRC ++ add $256, DST ++ sub $256, DATALEN + jl .Lghash_last_ciphertext_4x\@ + .endif + +@@ -786,10 +858,10 @@ + // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. 
If + // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. + .if !\enc +- vmovdqu8 0*VL(SRC), GHASHDATA0 +- vmovdqu8 1*VL(SRC), GHASHDATA1 +- vmovdqu8 2*VL(SRC), GHASHDATA2 +- vmovdqu8 3*VL(SRC), GHASHDATA3 ++ vmovdqu8 0*64(SRC), GHASHDATA0 ++ vmovdqu8 1*64(SRC), GHASHDATA1 ++ vmovdqu8 2*64(SRC), GHASHDATA2 ++ vmovdqu8 3*64(SRC), GHASHDATA3 + .endif + + // Start the AES encryption of the counter blocks. +@@ -809,44 +881,44 @@ + _vaesenc_4x RNDKEY + 128: + +- // Finish the AES encryption of the counter blocks in V0-V3, interleaved +- // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. ++ // Finish the AES encryption of the counter blocks in %zmm[0-3], ++ // interleaved with the GHASH update of the ciphertext blocks in ++ // GHASHDATA[0-3]. + .irp i, 9,8,7,6,5,4,3,2,1 + _ghash_step_4x (9 - \i) + _vaesenc_4x RNDKEY_M\i + .endr + _ghash_step_4x 9 + _aesenclast_and_xor_4x +- sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 +- sub $-4*VL, DST +- add $-4*VL, DATALEN ++ add $256, SRC ++ add $256, DST ++ sub $256, DATALEN + jge .Lcrypt_loop_4x\@ + + .if \enc + .Lghash_last_ciphertext_4x\@: + // Update GHASH with the last set of ciphertext blocks. +-.irp i, 0,1,2,3,4,5,6,7,8,9 +- _ghash_step_4x \i +-.endr ++ _ghash_4x + .endif + + .Lcrypt_loop_4x_done\@: + +- // Undo the extra subtraction by 4*VL and check whether data remains. +- sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 ++ // Undo the extra subtraction by 256 and check whether data remains. ++ add $256, DATALEN + jz .Ldone\@ + +- // The data length isn't a multiple of 4*VL. Process the remaining data +- // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. +- // Going one vector at a time may seem inefficient compared to having +- // separate code paths for each possible number of vectors remaining. +- // However, using a loop keeps the code size down, and it performs +- // surprising well; modern CPUs will start executing the next iteration +- // before the previous one finishes and also predict the number of loop +- // iterations. For a similar reason, we roll up the AES rounds. ++ // The data length isn't a multiple of 256 bytes. Process the remaining ++ // data of length 1 <= DATALEN < 256, up to one 64-byte vector at a ++ // time. Going one vector at a time may seem inefficient compared to ++ // having separate code paths for each possible number of vectors ++ // remaining. However, using a loop keeps the code size down, and it ++ // performs surprising well; modern CPUs will start executing the next ++ // iteration before the previous one finishes and also predict the ++ // number of loop iterations. For a similar reason, we roll up the AES ++ // rounds. + // +- // On the last iteration, the remaining length may be less than VL. +- // Handle this using masking. ++ // On the last iteration, the remaining length may be less than 64 ++ // bytes. Handle this using masking. + // + // Since there are enough key powers available for all remaining data, + // there is no need to do a GHASH reduction after each iteration. +@@ -875,65 +947,60 @@ + .Lcrypt_loop_1x\@: + + // Select the appropriate mask for this iteration: all 1's if +- // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the ++ // DATALEN >= 64, otherwise DATALEN 1's. Do this branchlessly using the + // bzhi instruction from BMI2. (This relies on DATALEN <= 255.) 
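bzhi takes its bit index from the low 8 bits of the operand and leaves the source untouched for indices of 64 and above, so bzhi(-1, DATALEN) yields DATALEN one bits when DATALEN < 64 and an all-ones mask otherwise, which is why the comment above requires DATALEN <= 255. A small demonstration using the BMI2 intrinsic (compile with -mbmi2):

#include <stdio.h>
#include <immintrin.h>  /* _bzhi_u64 (BMI2) */

int main(void)
{
        /* The resulting value is the byte mask fed to the masked vmovdqu8
         * loads and stores: "process min(len, 64) bytes". */
        unsigned int lens[] = { 1, 13, 48, 63, 64, 200 };
        unsigned int i;

        for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
                printf("len=%3u mask=%016llx\n", lens[i],
                       (unsigned long long)_bzhi_u64(~0ULL, lens[i]));
        return 0;
}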
+-.if VL < 64 +- mov $-1, %eax +- bzhi DATALEN, %eax, %eax +- kmovd %eax, %k1 +-.else + mov $-1, %rax + bzhi DATALEN64, %rax, %rax + kmovq %rax, %k1 +-.endif + + // Encrypt a vector of counter blocks. This does not need to be masked. +- vpshufb BSWAP_MASK, LE_CTR, V0 ++ vpshufb BSWAP_MASK, LE_CTR, %zmm0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR +- vpxord RNDKEY0, V0, V0 ++ vpxord RNDKEY0, %zmm0, %zmm0 + lea 16(KEY), %rax + 1: + vbroadcasti32x4 (%rax), RNDKEY +- vaesenc RNDKEY, V0, V0 ++ vaesenc RNDKEY, %zmm0, %zmm0 + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b +- vaesenclast RNDKEYLAST, V0, V0 ++ vaesenclast RNDKEYLAST, %zmm0, %zmm0 + + // XOR the data with the appropriate number of keystream bytes. +- vmovdqu8 (SRC), V1{%k1}{z} +- vpxord V1, V0, V0 +- vmovdqu8 V0, (DST){%k1} ++ vmovdqu8 (SRC), %zmm1{%k1}{z} ++ vpxord %zmm1, %zmm0, %zmm0 ++ vmovdqu8 %zmm0, (DST){%k1} + + // Update GHASH with the ciphertext block(s), without reducing. + // +- // In the case of DATALEN < VL, the ciphertext is zero-padded to VL. +- // (If decrypting, it's done by the above masked load. If encrypting, +- // it's done by the below masked register-to-register move.) Note that +- // if DATALEN <= VL - 16, there will be additional padding beyond the +- // padding of the last block specified by GHASH itself; i.e., there may +- // be whole block(s) that get processed by the GHASH multiplication and +- // reduction instructions but should not actually be included in the ++ // In the case of DATALEN < 64, the ciphertext is zero-padded to 64 ++ // bytes. (If decrypting, it's done by the above masked load. If ++ // encrypting, it's done by the below masked register-to-register move.) ++ // Note that if DATALEN <= 48, there will be additional padding beyond ++ // the padding of the last block specified by GHASH itself; i.e., there ++ // may be whole block(s) that get processed by the GHASH multiplication ++ // and reduction instructions but should not actually be included in the + // GHASH. However, any such blocks are all-zeroes, and the values that + // they're multiplied with are also all-zeroes. Therefore they just add + // 0 * 0 = 0 to the final GHASH result, which makes no difference. + vmovdqu8 (POWERS_PTR), H_POW1 + .if \enc +- vmovdqu8 V0, V1{%k1}{z} ++ vmovdqu8 %zmm0, %zmm1{%k1}{z} + .endif +- vpshufb BSWAP_MASK, V1, V0 +- vpxord GHASH_ACC, V0, V0 +- _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 ++ vpshufb BSWAP_MASK, %zmm1, %zmm0 ++ vpxord GHASH_ACC, %zmm0, %zmm0 ++ _ghash_mul_noreduce H_POW1, %zmm0, LO, MI, HI, \ ++ GHASHDATA3, %zmm1, %zmm2, %zmm3 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + +- add $VL, POWERS_PTR +- add $VL, SRC +- add $VL, DST +- sub $VL, DATALEN ++ add $64, POWERS_PTR ++ add $64, SRC ++ add $64, DST ++ sub $64, DATALEN + jg .Lcrypt_loop_1x\@ + + // Finally, do the GHASH reduction. 
+- _ghash_reduce LO, MI, HI, GFPOLY, V0 ++ _ghash_reduce LO, MI, HI, GFPOLY, %zmm0 + _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 + + .Ldone\@: +@@ -944,14 +1011,14 @@ + RET + .endm + +-// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +-// const u32 le_ctr[4], u8 ghash_acc[16], +-// u64 total_aadlen, u64 total_datalen); +-// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +-// const u32 le_ctr[4], +-// const u8 ghash_acc[16], +-// u64 total_aadlen, u64 total_datalen, +-// const u8 tag[16], int taglen); ++// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen); ++// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++// const u32 le_ctr[4], ++// const u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen, ++// const u8 tag[16], int taglen); + // + // This macro generates one of the above two functions (with \enc selecting + // which one). Both functions finish computing the GCM authentication tag by +@@ -1081,119 +1148,16 @@ + RET + .endm + +-_set_veclen 32 +-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) +- _aes_gcm_precompute +-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) +-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) +- _aes_gcm_update 1 +-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) +-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) +- _aes_gcm_update 0 +-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) +- +-_set_veclen 64 +-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) +- _aes_gcm_precompute +-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) +-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) ++SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512) + _aes_gcm_update 1 +-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) +-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) ++SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512) ++SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512) + _aes_gcm_update 0 +-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) +- +-// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, +-// u8 ghash_acc[16], +-// const u8 *aad, int aadlen); +-// +-// This function processes the AAD (Additional Authenticated Data) in GCM. +-// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +-// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been +-// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| +-// must be a multiple of 16, except on the last call where it can be any length. +-// The caller must do any buffering needed to ensure this. +-// +-// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. +-// Therefore, for AAD processing we currently only provide this implementation +-// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This +-// keeps the code size down, and it enables some micro-optimizations, e.g. using +-// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. +-// To optimize for large amounts of AAD, we could implement a 4x-wide loop and +-// provide a version using 512-bit vectors, but that doesn't seem to be useful. +-SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) +- +- // Function arguments +- .set KEY, %rdi +- .set GHASH_ACC_PTR, %rsi +- .set AAD, %rdx +- .set AADLEN, %ecx +- .set AADLEN64, %rcx // Zero-extend AADLEN before using! +- +- // Additional local variables. 
+- // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. +- .set BSWAP_MASK, %ymm4 +- .set GFPOLY, %ymm5 +- .set GHASH_ACC, %ymm6 +- .set GHASH_ACC_XMM, %xmm6 +- .set H_POW1, %ymm7 +- +- // Load some constants. +- vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK +- vbroadcasti128 .Lgfpoly(%rip), GFPOLY +- +- // Load the GHASH accumulator. +- vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM +- +- // Update GHASH with 32 bytes of AAD at a time. +- // +- // Pre-subtracting 32 from AADLEN saves an instruction from the loop and +- // also ensures that at least one write always occurs to AADLEN, +- // zero-extending it and allowing AADLEN64 to be used later. +- sub $32, AADLEN +- jl .Laad_loop_1x_done +- vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] +-.Laad_loop_1x: +- vmovdqu (AAD), %ymm0 +- vpshufb BSWAP_MASK, %ymm0, %ymm0 +- vpxor %ymm0, GHASH_ACC, GHASH_ACC +- _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ +- %ymm0, %ymm1, %ymm2 +- vextracti128 $1, GHASH_ACC, %xmm0 +- vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM +- add $32, AAD +- sub $32, AADLEN +- jge .Laad_loop_1x +-.Laad_loop_1x_done: +- add $32, AADLEN +- jz .Laad_done +- +- // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. +- mov $-1, %eax +- bzhi AADLEN, %eax, %eax +- kmovd %eax, %k1 +- vmovdqu8 (AAD), %ymm0{%k1}{z} +- neg AADLEN64 +- and $~15, AADLEN64 // -round_up(AADLEN, 16) +- vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 +- vpshufb BSWAP_MASK, %ymm0, %ymm0 +- vpxor %ymm0, GHASH_ACC, GHASH_ACC +- _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ +- %ymm0, %ymm1, %ymm2 +- vextracti128 $1, GHASH_ACC, %xmm0 +- vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM +- +-.Laad_done: +- // Store the updated GHASH accumulator back to memory. +- vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) +- +- vzeroupper // This is needed after using ymm or zmm registers. +- RET +-SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) ++SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512) + +-SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) ++SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512) + _aes_gcm_final 1 +-SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) +-SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) ++SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512) ++SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512) + _aes_gcm_final 0 +-SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) ++SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512) +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index d953ac470aae..bb6e2c47ffc6 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -874,8 +874,38 @@ struct aes_gcm_key_aesni { + #define AES_GCM_KEY_AESNI_SIZE \ + (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) + +-/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ +-struct aes_gcm_key_avx10 { ++/* Key struct used by the VAES + AVX2 implementation of AES-GCM */ ++struct aes_gcm_key_vaes_avx2 { ++ /* ++ * Common part of the key. The assembly code prefers 16-byte alignment ++ * for the round keys; we get this by them being located at the start of ++ * the struct and the whole struct being 32-byte aligned. ++ */ ++ struct aes_gcm_key base; ++ ++ /* ++ * Powers of the hash key H^8 through H^1. These are 128-bit values. ++ * They all have an extra factor of x^-1 and are byte-reversed. ++ * The assembly code prefers 32-byte alignment for this. 
++ */ ++ u64 h_powers[8][2] __aligned(32); ++ ++ /* ++ * Each entry in this array contains the two halves of an entry of ++ * h_powers XOR'd together, in the following order: ++ * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7. ++ * This is used for Karatsuba multiplication. ++ */ ++ u64 h_powers_xored[8]; ++}; ++ ++#define AES_GCM_KEY_VAES_AVX2(key) \ ++ container_of((key), struct aes_gcm_key_vaes_avx2, base) ++#define AES_GCM_KEY_VAES_AVX2_SIZE \ ++ (sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1))) ++ ++/* Key struct used by the VAES + AVX512 implementation of AES-GCM */ ++struct aes_gcm_key_vaes_avx512 { + /* + * Common part of the key. The assembly code prefers 16-byte alignment + * for the round keys; we get this by them being located at the start of +@@ -895,10 +925,10 @@ struct aes_gcm_key_avx10 { + /* Three padding blocks required by the assembly code */ + u64 padding[3][2]; + }; +-#define AES_GCM_KEY_AVX10(key) \ +- container_of((key), struct aes_gcm_key_avx10, base) +-#define AES_GCM_KEY_AVX10_SIZE \ +- (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) ++#define AES_GCM_KEY_VAES_AVX512(key) \ ++ container_of((key), struct aes_gcm_key_vaes_avx512, base) ++#define AES_GCM_KEY_VAES_AVX512_SIZE \ ++ (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1))) + + /* + * These flags are passed to the AES-GCM helper functions to specify the +@@ -910,14 +940,16 @@ struct aes_gcm_key_avx10 { + #define FLAG_RFC4106 BIT(0) + #define FLAG_ENC BIT(1) + #define FLAG_AVX BIT(2) +-#define FLAG_AVX10_256 BIT(3) +-#define FLAG_AVX10_512 BIT(4) ++#define FLAG_VAES_AVX2 BIT(3) ++#define FLAG_VAES_AVX512 BIT(4) + + static inline struct aes_gcm_key * + aes_gcm_key_get(struct crypto_aead *tfm, int flags) + { +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) ++ if (flags & FLAG_VAES_AVX512) + return PTR_ALIGN(crypto_aead_ctx(tfm), 64); ++ else if (flags & FLAG_VAES_AVX2) ++ return PTR_ALIGN(crypto_aead_ctx(tfm), 32); + else + return PTR_ALIGN(crypto_aead_ctx(tfm), 16); + } +@@ -927,26 +959,16 @@ aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); + asmlinkage void + aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); + asmlinkage void +-aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); ++aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); + asmlinkage void +-aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); ++aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); + + static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) + { +- /* +- * To make things a bit easier on the assembly side, the AVX10 +- * implementations use the same key format. Therefore, a single +- * function using 256-bit vectors would suffice here. However, it's +- * straightforward to provide a 512-bit one because of how the assembly +- * code is structured, and it works nicely because the total size of the +- * key powers is a multiple of 512 bits. So we take advantage of that. +- * +- * A similar situation applies to the AES-NI implementations. 
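
The "Karatsuba multiplication" mentioned in the h_powers_xored comment above is the usual three-multiply split of a 128-bit carry-less product. Writing the operands as 64-bit halves A = A1:A0 and B = B1:B0 (illustrative names, not the registers used by the assembly), and with ^ denoting XOR, i.e. addition in GF(2):

	A*B = (A1*B1) << 128
	    ^ ((A1 ^ A0) * (B1 ^ B0) ^ A1*B1 ^ A0*B0) << 64
	    ^ (A0*B0)

Only the middle term needs the XOR of each operand's two halves, so precomputing H_hi ^ H_lo for every stored key power (the h_powers_xored array) avoids recomputing it for each block in the main loop.
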
+- */ +- if (flags & FLAG_AVX10_512) +- aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); +- else if (flags & FLAG_AVX10_256) +- aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key)); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key)); + else if (flags & FLAG_AVX) + aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); + else +@@ -960,15 +982,21 @@ asmlinkage void + aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); + asmlinkage void +-aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, +- u8 ghash_acc[16], const u8 *aad, int aadlen); ++aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ u8 ghash_acc[16], const u8 *aad, int aadlen); ++asmlinkage void ++aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ u8 ghash_acc[16], const u8 *aad, int aadlen); + + static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], + const u8 *aad, int aadlen, int flags) + { +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) +- aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, +- aad, aadlen); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ ghash_acc, aad, aadlen); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ ghash_acc, aad, aadlen); + else if (flags & FLAG_AVX) + aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); +@@ -986,13 +1014,13 @@ aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + asmlinkage void +-aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- const u8 *src, u8 *dst, int datalen); ++aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); + asmlinkage void +-aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- const u8 *src, u8 *dst, int datalen); ++aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); + + asmlinkage void + aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, +@@ -1003,13 +1031,13 @@ aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + asmlinkage void +-aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- const u8 *src, u8 *dst, int datalen); ++aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); + asmlinkage void +-aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- const u8 *src, u8 *dst, int datalen); ++aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); + + /* __always_inline to optimize out the branches based on @flags */ + static __always_inline void +@@ -1018,14 +1046,14 @@ 
aes_gcm_update(const struct aes_gcm_key *key, + const u8 *src, u8 *dst, int datalen, int flags) + { + if (flags & FLAG_ENC) { +- if (flags & FLAG_AVX10_512) +- aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- src, dst, datalen); +- else if (flags & FLAG_AVX10_256) +- aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- src, dst, datalen); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, +@@ -1034,14 +1062,14 @@ aes_gcm_update(const struct aes_gcm_key *key, + aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, + ghash_acc, src, dst, datalen); + } else { +- if (flags & FLAG_AVX10_512) +- aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- src, dst, datalen); +- else if (flags & FLAG_AVX10_256) +- aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- src, dst, datalen); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, +@@ -1062,9 +1090,13 @@ aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); + asmlinkage void +-aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- u64 total_aadlen, u64 total_datalen); ++aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen); ++asmlinkage void ++aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen); + + /* __always_inline to optimize out the branches based on @flags */ + static __always_inline void +@@ -1072,10 +1104,14 @@ aes_gcm_enc_final(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, int flags) + { +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) +- aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- total_aadlen, total_datalen); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, +@@ -1097,10 +1133,15 @@ aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); + asmlinkage bool __must_check +-aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], const u8 ghash_acc[16], +- u64 total_aadlen, u64 total_datalen, +- const u8 tag[16], int taglen); 
++aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ const u32 le_ctr[4], const u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen, ++ const u8 tag[16], int taglen); ++asmlinkage bool __must_check ++aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ const u32 le_ctr[4], const u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen, ++ const u8 tag[16], int taglen); + + /* __always_inline to optimize out the branches based on @flags */ + static __always_inline bool __must_check +@@ -1108,11 +1149,16 @@ aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], + u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, + u8 tag[16], int taglen, int flags) + { +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) +- return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- total_aadlen, total_datalen, +- tag, taglen); ++ if (flags & FLAG_VAES_AVX512) ++ return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen, ++ tag, taglen); ++ else if (flags & FLAG_VAES_AVX2) ++ return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen, ++ tag, taglen); + else if (flags & FLAG_AVX) + return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, +@@ -1195,10 +1241,14 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); +- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); +- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); +- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); +- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768); + + if (likely(crypto_simd_usable())) { + err = aes_check_keylen(keylen); +@@ -1231,8 +1281,9 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + gf128mul_lle(&h, (const be128 *)x_to_the_minus1); + + /* Compute the needed key powers */ +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { +- struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); ++ if (flags & FLAG_VAES_AVX512) { ++ struct aes_gcm_key_vaes_avx512 *k = ++ AES_GCM_KEY_VAES_AVX512(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); +@@ -1240,6 +1291,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + gf128mul_lle(&h, &h1); + } + memset(k->padding, 0, sizeof(k->padding)); ++ } else if (flags & FLAG_VAES_AVX2) { ++ struct aes_gcm_key_vaes_avx2 *k = ++ AES_GCM_KEY_VAES_AVX2(key); ++ static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 }; ++ ++ for (i = 
ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { ++ k->h_powers[i][0] = be64_to_cpu(h.b); ++ k->h_powers[i][1] = be64_to_cpu(h.a); ++ gf128mul_lle(&h, &h1); ++ } ++ for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) { ++ int j = indices[i]; ++ ++ k->h_powers_xored[i] = k->h_powers[j][0] ^ ++ k->h_powers[j][1]; ++ } + } else { + struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); + +@@ -1508,15 +1575,15 @@ DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, + "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", + AES_GCM_KEY_AESNI_SIZE, 500); + +-/* aes_gcm_algs_vaes_avx10_256 */ +-DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, +- "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", +- AES_GCM_KEY_AVX10_SIZE, 700); ++/* aes_gcm_algs_vaes_avx2 */ ++DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2, ++ "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2", ++ AES_GCM_KEY_VAES_AVX2_SIZE, 600); + +-/* aes_gcm_algs_vaes_avx10_512 */ +-DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, +- "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", +- AES_GCM_KEY_AVX10_SIZE, 800); ++/* aes_gcm_algs_vaes_avx512 */ ++DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512, ++ "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512", ++ AES_GCM_KEY_VAES_AVX512_SIZE, 800); + + static int __init register_avx_algs(void) + { +@@ -1548,6 +1615,10 @@ static int __init register_avx_algs(void) + ARRAY_SIZE(skcipher_algs_vaes_avx2)); + if (err) + return err; ++ err = crypto_register_aeads(aes_gcm_algs_vaes_avx2, ++ ARRAY_SIZE(aes_gcm_algs_vaes_avx2)); ++ if (err) ++ return err; + + if (!boot_cpu_has(X86_FEATURE_AVX512BW) || + !boot_cpu_has(X86_FEATURE_AVX512VL) || +@@ -1556,26 +1627,21 @@ static int __init register_avx_algs(void) + XFEATURE_MASK_AVX512, NULL)) + return 0; + +- err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256, +- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256)); +- if (err) +- return err; +- + if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { + int i; + + for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) + skcipher_algs_vaes_avx512[i].base.cra_priority = 1; +- for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) +- aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; ++ for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++) ++ aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1; + } + + err = crypto_register_skciphers(skcipher_algs_vaes_avx512, + ARRAY_SIZE(skcipher_algs_vaes_avx512)); + if (err) + return err; +- err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512, +- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512)); ++ err = crypto_register_aeads(aes_gcm_algs_vaes_avx512, ++ ARRAY_SIZE(aes_gcm_algs_vaes_avx512)); + if (err) + return err; + +@@ -1595,8 +1661,8 @@ static void unregister_avx_algs(void) + unregister_aeads(aes_gcm_algs_aesni_avx); + unregister_skciphers(skcipher_algs_vaes_avx2); + unregister_skciphers(skcipher_algs_vaes_avx512); +- unregister_aeads(aes_gcm_algs_vaes_avx10_256); +- unregister_aeads(aes_gcm_algs_vaes_avx10_512); ++ unregister_aeads(aes_gcm_algs_vaes_avx2); ++ unregister_aeads(aes_gcm_algs_vaes_avx512); + } + #else /* CONFIG_X86_64 */ + static struct aead_alg aes_gcm_algs_aesni[0]; +diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig +index 104aa5355090..cac4926fc340 100644 +--- a/drivers/md/Kconfig ++++ b/drivers/md/Kconfig +@@ -546,6 +546,7 @@ config DM_VERITY + depends on BLK_DEV_DM + select CRYPTO + select CRYPTO_HASH ++ select CRYPTO_LIB_SHA256 + select DM_BUFIO + help + This device-mapper target creates a read-only device that +diff --git a/drivers/md/dm-verity-fec.c 
b/drivers/md/dm-verity-fec.c +index 72047b47a7a0..0c858b9ee06b 100644 +--- a/drivers/md/dm-verity-fec.c ++++ b/drivers/md/dm-verity-fec.c +@@ -188,14 +188,13 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, + * Locate data block erasures using verity hashes. + */ + static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, +- u8 *want_digest, u8 *data) ++ const u8 *want_digest, const u8 *data) + { + if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, +- verity_io_real_digest(v, io)))) ++ io->tmp_digest))) + return 0; + +- return memcmp(verity_io_real_digest(v, io), want_digest, +- v->digest_size) != 0; ++ return memcmp(io->tmp_digest, want_digest, v->digest_size) != 0; + } + + /* +@@ -362,7 +361,7 @@ static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) + */ + static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 rsb, u64 offset, +- bool use_erasures) ++ const u8 *want_digest, bool use_erasures) + { + int r, neras = 0; + unsigned int pos; +@@ -388,12 +387,11 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + + /* Always re-validate the corrected block against the expected hash */ + r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, +- verity_io_real_digest(v, io)); ++ io->tmp_digest); + if (unlikely(r < 0)) + return r; + +- if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), +- v->digest_size)) { ++ if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { + DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", + v->data_dev->name, (unsigned long long)rsb, neras); + return -EILSEQ; +@@ -404,7 +402,8 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + + /* Correct errors in a block. Copies corrected block to dest. */ + int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, +- enum verity_block_type type, sector_t block, u8 *dest) ++ enum verity_block_type type, const u8 *want_digest, ++ sector_t block, u8 *dest) + { + int r; + struct dm_verity_fec_io *fio = fec_io(io); +@@ -447,9 +446,9 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + * them first. Do a second attempt with erasures if the corruption is + * bad enough. 
+ */ +- r = fec_decode_rsb(v, io, fio, rsb, offset, false); ++ r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, false); + if (r < 0) { +- r = fec_decode_rsb(v, io, fio, rsb, offset, true); ++ r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, true); + if (r < 0) + goto done; + } +diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h +index 09123a612953..a6689cdc489d 100644 +--- a/drivers/md/dm-verity-fec.h ++++ b/drivers/md/dm-verity-fec.h +@@ -68,8 +68,8 @@ struct dm_verity_fec_io { + extern bool verity_fec_is_enabled(struct dm_verity *v); + + extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, +- enum verity_block_type type, sector_t block, +- u8 *dest); ++ enum verity_block_type type, const u8 *want_digest, ++ sector_t block, u8 *dest); + + extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, + char *result, unsigned int maxlen); +@@ -99,6 +99,7 @@ static inline bool verity_fec_is_enabled(struct dm_verity *v) + static inline int verity_fec_decode(struct dm_verity *v, + struct dm_verity_io *io, + enum verity_block_type type, ++ const u8 *want_digest, + sector_t block, u8 *dest) + { + return -EOPNOTSUPP; +diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c +index 66a00a8ccb39..bf0aee73b074 100644 +--- a/drivers/md/dm-verity-target.c ++++ b/drivers/md/dm-verity-target.c +@@ -117,11 +117,25 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, + int verity_hash(struct dm_verity *v, struct dm_verity_io *io, + const u8 *data, size_t len, u8 *digest) + { +- struct shash_desc *desc = &io->hash_desc; ++ struct shash_desc *desc; + int r; + ++ if (likely(v->use_sha256_lib)) { ++ struct sha256_ctx *ctx = &io->hash_ctx.sha256; ++ ++ /* ++ * Fast path using SHA-256 library. This is enabled only for ++ * verity version 1, where the salt is at the beginning. ++ */ ++ *ctx = *v->initial_hashstate.sha256; ++ sha256_update(ctx, data, len); ++ sha256_final(ctx, digest); ++ return 0; ++ } ++ ++ desc = &io->hash_ctx.shash; + desc->tfm = v->shash_tfm; +- if (unlikely(v->initial_hashstate == NULL)) { ++ if (unlikely(v->initial_hashstate.shash == NULL)) { + /* Version 0: salt at end */ + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, data, len) ?: +@@ -129,7 +143,7 @@ int verity_hash(struct dm_verity *v, struct dm_verity_io *io, + crypto_shash_final(desc, digest); + } else { + /* Version 1: salt at beginning */ +- r = crypto_shash_import(desc, v->initial_hashstate) ?: ++ r = crypto_shash_import(desc, v->initial_hashstate.shash) ?: + crypto_shash_finup(desc, data, len, digest); + } + if (unlikely(r)) +@@ -215,12 +229,12 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type, + * Verify hash of a metadata block pertaining to the specified data block + * ("block" argument) at a specified level ("level" argument). + * +- * On successful return, verity_io_want_digest(v, io) contains the hash value +- * for a lower tree level or for the data block (if we're at the lowest level). ++ * On successful return, want_digest contains the hash value for a lower tree ++ * level or for the data block (if we're at the lowest level). + * + * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. + * If "skip_unverified" is false, unverified buffer is hashed and verified +- * against current value of verity_io_want_digest(v, io). ++ * against current value of want_digest. 
+ */ + static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, int level, bool skip_unverified, +@@ -259,7 +273,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + if (IS_ERR(data)) + return r; + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, +- hash_block, data) == 0) { ++ want_digest, hash_block, data) == 0) { + aux = dm_bufio_get_aux_data(buf); + aux->hash_verified = 1; + goto release_ok; +@@ -279,11 +293,11 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + } + + r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits, +- verity_io_real_digest(v, io)); ++ io->tmp_digest); + if (unlikely(r < 0)) + goto release_ret_r; + +- if (likely(memcmp(verity_io_real_digest(v, io), want_digest, ++ if (likely(memcmp(io->tmp_digest, want_digest, + v->digest_size) == 0)) + aux->hash_verified = 1; + else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { +@@ -294,7 +308,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + r = -EAGAIN; + goto release_ret_r; + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, +- hash_block, data) == 0) ++ want_digest, hash_block, data) == 0) + aux->hash_verified = 1; + else if (verity_handle_err(v, + DM_VERITY_BLOCK_TYPE_METADATA, +@@ -358,7 +372,8 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + } + + static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, +- sector_t cur_block, u8 *dest) ++ const u8 *want_digest, sector_t cur_block, ++ u8 *dest) + { + struct page *page; + void *buffer; +@@ -382,12 +397,11 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, + goto free_ret; + + r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, +- verity_io_real_digest(v, io)); ++ io->tmp_digest); + if (unlikely(r)) + goto free_ret; + +- if (memcmp(verity_io_real_digest(v, io), +- verity_io_want_digest(v, io), v->digest_size)) { ++ if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { + r = -EIO; + goto free_ret; + } +@@ -402,9 +416,13 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, + + static int verity_handle_data_hash_mismatch(struct dm_verity *v, + struct dm_verity_io *io, +- struct bio *bio, sector_t blkno, +- u8 *data) ++ struct bio *bio, ++ struct pending_block *block) + { ++ const u8 *want_digest = block->want_digest; ++ sector_t blkno = block->blkno; ++ u8 *data = block->data; ++ + if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { + /* + * Error handling code (FEC included) cannot be run in the +@@ -412,14 +430,14 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, + */ + return -EAGAIN; + } +- if (verity_recheck(v, io, blkno, data) == 0) { ++ if (verity_recheck(v, io, want_digest, blkno, data) == 0) { + if (v->validated_blocks) + set_bit(blkno, v->validated_blocks); + return 0; + } + #if defined(CONFIG_DM_VERITY_FEC) +- if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno, +- data) == 0) ++ if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, want_digest, ++ blkno, data) == 0) + return 0; + #endif + if (bio->bi_status) +@@ -433,6 +451,58 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, + return 0; + } + ++static void verity_clear_pending_blocks(struct dm_verity_io *io) ++{ ++ int i; ++ ++ for (i = io->num_pending - 1; i >= 0; i--) { ++ kunmap_local(io->pending_blocks[i].data); ++ io->pending_blocks[i].data = NULL; 
++ } ++ io->num_pending = 0; ++} ++ ++static int verity_verify_pending_blocks(struct dm_verity *v, ++ struct dm_verity_io *io, ++ struct bio *bio) ++{ ++ const unsigned int block_size = 1 << v->data_dev_block_bits; ++ int i, r; ++ ++ if (io->num_pending == 2) { ++ /* num_pending == 2 implies that the algorithm is SHA-256 */ ++ sha256_finup_2x(v->initial_hashstate.sha256, ++ io->pending_blocks[0].data, ++ io->pending_blocks[1].data, block_size, ++ io->pending_blocks[0].real_digest, ++ io->pending_blocks[1].real_digest); ++ } else { ++ for (i = 0; i < io->num_pending; i++) { ++ r = verity_hash(v, io, io->pending_blocks[i].data, ++ block_size, ++ io->pending_blocks[i].real_digest); ++ if (unlikely(r)) ++ return r; ++ } ++ } ++ ++ for (i = 0; i < io->num_pending; i++) { ++ struct pending_block *block = &io->pending_blocks[i]; ++ ++ if (likely(memcmp(block->real_digest, block->want_digest, ++ v->digest_size) == 0)) { ++ if (v->validated_blocks) ++ set_bit(block->blkno, v->validated_blocks); ++ } else { ++ r = verity_handle_data_hash_mismatch(v, io, bio, block); ++ if (unlikely(r)) ++ return r; ++ } ++ } ++ verity_clear_pending_blocks(io); ++ return 0; ++} ++ + /* + * Verify one "dm_verity_io" structure. + */ +@@ -440,10 +510,14 @@ static int verity_verify_io(struct dm_verity_io *io) + { + struct dm_verity *v = io->v; + const unsigned int block_size = 1 << v->data_dev_block_bits; ++ const int max_pending = v->use_sha256_finup_2x ? 2 : 1; + struct bvec_iter iter_copy; + struct bvec_iter *iter; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + unsigned int b; ++ int r; ++ ++ io->num_pending = 0; + + if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { + /* +@@ -457,21 +531,22 @@ static int verity_verify_io(struct dm_verity_io *io) + + for (b = 0; b < io->n_blocks; + b++, bio_advance_iter(bio, iter, block_size)) { +- int r; +- sector_t cur_block = io->block + b; ++ sector_t blkno = io->block + b; ++ struct pending_block *block; + bool is_zero; + struct bio_vec bv; + void *data; + + if (v->validated_blocks && bio->bi_status == BLK_STS_OK && +- likely(test_bit(cur_block, v->validated_blocks))) ++ likely(test_bit(blkno, v->validated_blocks))) + continue; + +- r = verity_hash_for_block(v, io, cur_block, +- verity_io_want_digest(v, io), ++ block = &io->pending_blocks[io->num_pending]; ++ ++ r = verity_hash_for_block(v, io, blkno, block->want_digest, + &is_zero); + if (unlikely(r < 0)) +- return r; ++ goto error; + + bv = bio_iter_iovec(bio, *iter); + if (unlikely(bv.bv_len < block_size)) { +@@ -482,7 +557,8 @@ static int verity_verify_io(struct dm_verity_io *io) + * data block size to be greater than PAGE_SIZE. 
+ */ + DMERR_LIMIT("unaligned io (data block spans pages)"); +- return -EIO; ++ r = -EIO; ++ goto error; + } + + data = bvec_kmap_local(&bv); +@@ -496,29 +572,26 @@ static int verity_verify_io(struct dm_verity_io *io) + kunmap_local(data); + continue; + } +- +- r = verity_hash(v, io, data, block_size, +- verity_io_real_digest(v, io)); +- if (unlikely(r < 0)) { +- kunmap_local(data); +- return r; ++ block->data = data; ++ block->blkno = blkno; ++ if (++io->num_pending == max_pending) { ++ r = verity_verify_pending_blocks(v, io, bio); ++ if (unlikely(r)) ++ goto error; + } ++ } + +- if (likely(memcmp(verity_io_real_digest(v, io), +- verity_io_want_digest(v, io), v->digest_size) == 0)) { +- if (v->validated_blocks) +- set_bit(cur_block, v->validated_blocks); +- kunmap_local(data); +- continue; +- } +- r = verity_handle_data_hash_mismatch(v, io, bio, cur_block, +- data); +- kunmap_local(data); ++ if (io->num_pending) { ++ r = verity_verify_pending_blocks(v, io, bio); + if (unlikely(r)) +- return r; ++ goto error; + } + + return 0; ++ ++error: ++ verity_clear_pending_blocks(io); ++ return r; + } + + /* +@@ -1004,7 +1077,7 @@ static void verity_dtr(struct dm_target *ti) + + kvfree(v->validated_blocks); + kfree(v->salt); +- kfree(v->initial_hashstate); ++ kfree(v->initial_hashstate.shash); + kfree(v->root_digest); + kfree(v->zero_digest); + verity_free_sig(v); +@@ -1069,8 +1142,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) + if (!v->zero_digest) + return r; + +- io = kmalloc(sizeof(*io) + crypto_shash_descsize(v->shash_tfm), +- GFP_KERNEL); ++ io = kmalloc(v->ti->per_io_data_size, GFP_KERNEL); + + if (!io) + return r; /* verity_dtr will free zero_digest */ +@@ -1252,11 +1324,26 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) + } + v->shash_tfm = shash; + v->digest_size = crypto_shash_digestsize(shash); +- DMINFO("%s using \"%s\"", alg_name, crypto_shash_driver_name(shash)); + if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { + ti->error = "Digest size too big"; + return -EINVAL; + } ++ if (likely(v->version && strcmp(alg_name, "sha256") == 0)) { ++ /* ++ * Fast path: use the library API for reduced overhead and ++ * interleaved hashing support. ++ */ ++ v->use_sha256_lib = true; ++ if (sha256_finup_2x_is_optimized()) ++ v->use_sha256_finup_2x = true; ++ ti->per_io_data_size = ++ offsetofend(struct dm_verity_io, hash_ctx.sha256); ++ } else { ++ /* Fallback case: use the generic crypto API. */ ++ ti->per_io_data_size = ++ offsetofend(struct dm_verity_io, hash_ctx.shash) + ++ crypto_shash_descsize(shash); ++ } + return 0; + } + +@@ -1277,7 +1364,18 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) + return -EINVAL; + } + } +- if (v->version) { /* Version 1: salt at beginning */ ++ if (likely(v->use_sha256_lib)) { ++ /* Implies version 1: salt at beginning */ ++ v->initial_hashstate.sha256 = ++ kmalloc(sizeof(struct sha256_ctx), GFP_KERNEL); ++ if (!v->initial_hashstate.sha256) { ++ ti->error = "Cannot allocate initial hash state"; ++ return -ENOMEM; ++ } ++ sha256_init(v->initial_hashstate.sha256); ++ sha256_update(v->initial_hashstate.sha256, ++ v->salt, v->salt_size); ++ } else if (v->version) { /* Version 1: salt at beginning */ + SHASH_DESC_ON_STACK(desc, v->shash_tfm); + int r; + +@@ -1285,16 +1383,16 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) + * Compute the pre-salted hash state that can be passed to + * crypto_shash_import() for each block later. 
+ */ +- v->initial_hashstate = kmalloc( ++ v->initial_hashstate.shash = kmalloc( + crypto_shash_statesize(v->shash_tfm), GFP_KERNEL); +- if (!v->initial_hashstate) { ++ if (!v->initial_hashstate.shash) { + ti->error = "Cannot allocate initial hash state"; + return -ENOMEM; + } + desc->tfm = v->shash_tfm; + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, v->salt, v->salt_size) ?: +- crypto_shash_export(desc, v->initial_hashstate); ++ crypto_shash_export(desc, v->initial_hashstate.shash); + if (r) { + ti->error = "Cannot set up initial hash state"; + return r; +@@ -1556,9 +1654,6 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) + goto bad; + } + +- ti->per_io_data_size = sizeof(struct dm_verity_io) + +- crypto_shash_descsize(v->shash_tfm); +- + r = verity_fec_ctr(v); + if (r) + goto bad; +diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h +index 6d141abd965c..f975a9e5c5d6 100644 +--- a/drivers/md/dm-verity.h ++++ b/drivers/md/dm-verity.h +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #define DM_VERITY_MAX_LEVELS 63 + +@@ -42,7 +43,10 @@ struct dm_verity { + struct crypto_shash *shash_tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ +- u8 *initial_hashstate; /* salted initial state, if version >= 1 */ ++ union { ++ struct sha256_ctx *sha256; /* for use_sha256_lib=1 */ ++ u8 *shash; /* for use_sha256_lib=0 */ ++ } initial_hashstate; /* salted initial state, if version >= 1 */ + u8 *zero_digest; /* digest for a zero block */ + #ifdef CONFIG_SECURITY + u8 *root_digest_sig; /* signature of the root digest */ +@@ -59,6 +63,8 @@ struct dm_verity { + unsigned char version; + bool hash_failed:1; /* set if hash of any block failed */ + bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ ++ bool use_sha256_lib:1; /* use SHA-256 library instead of generic crypto API */ ++ bool use_sha256_finup_2x:1; /* use interleaved hashing optimization */ + unsigned int digest_size; /* digest size for the current hash algorithm */ + enum verity_mode mode; /* mode for handling verification errors */ + enum verity_mode error_mode;/* mode for handling I/O errors */ +@@ -78,6 +84,13 @@ struct dm_verity { + mempool_t recheck_pool; + }; + ++struct pending_block { ++ void *data; ++ sector_t blkno; ++ u8 want_digest[HASH_MAX_DIGESTSIZE]; ++ u8 real_digest[HASH_MAX_DIGESTSIZE]; ++}; ++ + struct dm_verity_io { + struct dm_verity *v; + +@@ -94,28 +107,29 @@ struct dm_verity_io { + struct work_struct work; + struct work_struct bh_work; + +- u8 real_digest[HASH_MAX_DIGESTSIZE]; +- u8 want_digest[HASH_MAX_DIGESTSIZE]; ++ u8 tmp_digest[HASH_MAX_DIGESTSIZE]; + + /* +- * Temporary space for hashing. This is variable-length and must be at +- * the end of the struct. struct shash_desc is just the fixed part; +- * it's followed by a context of size crypto_shash_descsize(shash_tfm). ++ * This is the queue of data blocks that are pending verification. When ++ * the crypto layer supports interleaved hashing, we allow multiple ++ * blocks to be queued up in order to utilize it. This can improve ++ * performance significantly vs. sequential hashing of each block. 
+ */ +- struct shash_desc hash_desc; +-}; ++ int num_pending; ++ struct pending_block pending_blocks[2]; + +-static inline u8 *verity_io_real_digest(struct dm_verity *v, +- struct dm_verity_io *io) +-{ +- return io->real_digest; +-} +- +-static inline u8 *verity_io_want_digest(struct dm_verity *v, +- struct dm_verity_io *io) +-{ +- return io->want_digest; +-} ++ /* ++ * Temporary space for hashing. Either sha256 or shash is used, ++ * depending on the value of use_sha256_lib. If shash is used, ++ * then this field is variable-length, with total size ++ * sizeof(struct shash_desc) + crypto_shash_descsize(shash_tfm). ++ * For this reason, this field must be the end of the struct. ++ */ ++ union { ++ struct sha256_ctx sha256; ++ struct shash_desc shash; ++ } hash_ctx; ++}; + + extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, + const u8 *data, size_t len, u8 *digest); +diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h +index 05a221ce79a6..b87a768b955c 100644 +--- a/include/linux/rhashtable.h ++++ b/include/linux/rhashtable.h +@@ -355,12 +355,25 @@ static inline void rht_unlock(struct bucket_table *tbl, + local_irq_restore(flags); + } + +-static inline struct rhash_head *__rht_ptr( +- struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) ++enum rht_lookup_freq { ++ RHT_LOOKUP_NORMAL, ++ RHT_LOOKUP_LIKELY, ++}; ++ ++static __always_inline struct rhash_head *__rht_ptr( ++ struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt, ++ const enum rht_lookup_freq freq) + { +- return (struct rhash_head *) +- ((unsigned long)p & ~BIT(0) ?: +- (unsigned long)RHT_NULLS_MARKER(bkt)); ++ unsigned long p_val = (unsigned long)p & ~BIT(0); ++ ++ BUILD_BUG_ON(!__builtin_constant_p(freq)); ++ ++ if (freq == RHT_LOOKUP_LIKELY) ++ return (struct rhash_head *) ++ (likely(p_val) ? p_val : (unsigned long)RHT_NULLS_MARKER(bkt)); ++ else ++ return (struct rhash_head *) ++ (p_val ?: (unsigned long)RHT_NULLS_MARKER(bkt)); + } + + /* +@@ -370,10 +383,17 @@ static inline struct rhash_head *__rht_ptr( + * rht_ptr_exclusive() dereferences in a context where exclusive + * access is guaranteed, such as when destroying the table. + */ ++static __always_inline struct rhash_head *__rht_ptr_rcu( ++ struct rhash_lock_head __rcu *const *bkt, ++ const enum rht_lookup_freq freq) ++{ ++ return __rht_ptr(rcu_dereference(*bkt), bkt, freq); ++} ++ + static inline struct rhash_head *rht_ptr_rcu( + struct rhash_lock_head __rcu *const *bkt) + { +- return __rht_ptr(rcu_dereference_all(*bkt), bkt); ++ return __rht_ptr_rcu(bkt, RHT_LOOKUP_NORMAL); + } + + static inline struct rhash_head *rht_ptr( +@@ -381,13 +401,15 @@ static inline struct rhash_head *rht_ptr( + struct bucket_table *tbl, + unsigned int hash) + { +- return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); ++ return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt, ++ RHT_LOOKUP_NORMAL); + } + + static inline struct rhash_head *rht_ptr_exclusive( + struct rhash_lock_head __rcu *const *bkt) + { +- return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); ++ return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt, ++ RHT_LOOKUP_NORMAL); + } + + static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, +@@ -588,7 +610,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, + /* Internal function, do not use. 
*/ + static __always_inline struct rhash_head *__rhashtable_lookup( + struct rhashtable *ht, const void *key, +- const struct rhashtable_params params) ++ const struct rhashtable_params params, ++ const enum rht_lookup_freq freq) + { + struct rhashtable_compare_arg arg = { + .ht = ht, +@@ -599,12 +622,13 @@ static __always_inline struct rhash_head *__rhashtable_lookup( + struct rhash_head *he; + unsigned int hash; + ++ BUILD_BUG_ON(!__builtin_constant_p(freq)); + tbl = rht_dereference_rcu(ht->tbl, ht); + restart: + hash = rht_key_hashfn(ht, tbl, key, params); + bkt = rht_bucket(tbl, hash); + do { +- rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { ++ rht_for_each_rcu_from(he, __rht_ptr_rcu(bkt, freq), tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) +@@ -643,11 +667,22 @@ static __always_inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) + { +- struct rhash_head *he = __rhashtable_lookup(ht, key, params); ++ struct rhash_head *he = __rhashtable_lookup(ht, key, params, ++ RHT_LOOKUP_NORMAL); + + return he ? rht_obj(ht, he) : NULL; + } + ++static __always_inline void *rhashtable_lookup_likely( ++ struct rhashtable *ht, const void *key, ++ const struct rhashtable_params params) ++{ ++ struct rhash_head *he = __rhashtable_lookup(ht, key, params, ++ RHT_LOOKUP_LIKELY); ++ ++ return likely(he) ? rht_obj(ht, he) : NULL; ++} ++ + /** + * rhashtable_lookup_fast - search hash table, without RCU read lock + * @ht: hash table +@@ -693,11 +728,22 @@ static __always_inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) + { +- struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); ++ struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, ++ RHT_LOOKUP_NORMAL); + + return he ? container_of(he, struct rhlist_head, rhead) : NULL; + } + ++static __always_inline struct rhlist_head *rhltable_lookup_likely( ++ struct rhltable *hlt, const void *key, ++ const struct rhashtable_params params) ++{ ++ struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, ++ RHT_LOOKUP_LIKELY); ++ ++ return likely(he) ? container_of(he, struct rhlist_head, rhead) : NULL; ++} ++ + /* Internal function, please use rhashtable_insert_fast() instead. This + * function returns the existing element already in hashes if there is a clash, + * otherwise it returns an error via ERR_PTR(). 
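
The rhashtable.h hunks above add the biased-lookup helpers without converting any caller in this patch. A caller-side sketch of how rhashtable_lookup_likely() would be used, with made-up names (my_table, my_params, key, struct my_obj), on a path where a hit is the overwhelmingly common case:

	struct my_obj *obj;

	rcu_read_lock();
	obj = rhashtable_lookup_likely(&my_table, &key, my_params);
	if (obj)
		obj->hits++;	/* hot path stays on the straight-line code */
	rcu_read_unlock();

As with rhashtable_lookup(), the caller must hold the RCU read lock; the only difference is that the lookup is annotated as expected to succeed, so the empty-bucket and not-found paths are laid out as the unlikely branches.
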
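
Stepping further back, to the dm-verity portion of this patch: the pending_blocks queue added to struct dm_verity_io exists so that, when the SHA-256 library reports an optimized two-message implementation, two data blocks are hashed in one interleaved pass (see verity_verify_pending_blocks() above) instead of back to back. A minimal usage sketch of that library call; salt, salt_size, block_a, block_b and block_size are placeholders, not names taken from the patch:

	struct sha256_ctx salted;
	u8 digest_a[SHA256_DIGEST_SIZE], digest_b[SHA256_DIGEST_SIZE];

	sha256_init(&salted);
	sha256_update(&salted, salt, salt_size);	/* pre-salted state */

	/*
	 * One context and one length cover both messages, exactly as in the
	 * dm-verity caller; dm-verity only batches two blocks when
	 * sha256_finup_2x_is_optimized() reported an interleaved
	 * implementation at construction time.
	 */
	sha256_finup_2x(&salted, block_a, block_b, block_size,
			digest_a, digest_b);
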
+-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.18/0010-sched-ext.patch b/sys-kernel/gentoo-sources-6.18/0010-sched-ext.patch new file mode 100644 index 0000000..b0aaec7 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0010-sched-ext.patch @@ -0,0 +1,708 @@ +From 9d35fa170b23d0aa9e7724629d55f8c2c6e38e99 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:42:35 +0100 +Subject: [PATCH 10/11] sched-ext + +Signed-off-by: Peter Jung +--- + include/linux/sched/ext.h | 1 + + kernel/sched/ext.c | 69 ++++- + tools/sched_ext/include/scx/common.bpf.h | 1 + + tools/sched_ext/include/scx/compat.bpf.h | 18 ++ + tools/testing/selftests/sched_ext/Makefile | 1 + + .../selftests/sched_ext/peek_dsq.bpf.c | 251 ++++++++++++++++++ + tools/testing/selftests/sched_ext/peek_dsq.c | 224 ++++++++++++++++ + 7 files changed, 561 insertions(+), 4 deletions(-) + create mode 100644 tools/testing/selftests/sched_ext/peek_dsq.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/peek_dsq.c + +diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h +index d82b7a9b0658..81478d4ae782 100644 +--- a/include/linux/sched/ext.h ++++ b/include/linux/sched/ext.h +@@ -58,6 +58,7 @@ enum scx_dsq_id_flags { + */ + struct scx_dispatch_q { + raw_spinlock_t lock; ++ struct task_struct __rcu *first_task; /* lockless peek at head */ + struct list_head list; /* tasks in dispatch order */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ + u32 nr; +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 979484dab2d3..9acc660c350c 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -965,8 +965,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, + container_of(rbp, struct task_struct, + scx.dsq_priq); + list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); ++ /* first task unchanged - no update needed */ + } else { + list_add(&p->scx.dsq_list.node, &dsq->list); ++ /* not builtin and new task is at head - use fastpath */ ++ rcu_assign_pointer(dsq->first_task, p); + } + } else { + /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ +@@ -974,10 +977,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, + scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + +- if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) ++ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { + list_add(&p->scx.dsq_list.node, &dsq->list); +- else ++ /* new task inserted at head - use fastpath */ ++ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) ++ rcu_assign_pointer(dsq->first_task, p); ++ } else { ++ bool was_empty; ++ ++ was_empty = list_empty(&dsq->list); + list_add_tail(&p->scx.dsq_list.node, &dsq->list); ++ if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) ++ rcu_assign_pointer(dsq->first_task, p); ++ } + } + + /* seq records the order tasks are queued, used by BPF DSQ iterator */ +@@ -1034,6 +1046,13 @@ static void task_unlink_from_dsq(struct task_struct *p, + + list_del_init(&p->scx.dsq_list.node); + dsq_mod_nr(dsq, -1); ++ ++ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { ++ struct task_struct *first_task; ++ ++ first_task = nldsq_next_task(dsq, NULL, false); ++ rcu_assign_pointer(dsq->first_task, first_task); ++ } + } + + static void dispatch_dequeue(struct rq *rq, struct task_struct *p) +@@ -4516,7 +4535,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) + return ERR_PTR(ret); + } + +-static void check_hotplug_seq(struct scx_sched *sch, ++static int 
check_hotplug_seq(struct scx_sched *sch, + const struct sched_ext_ops *ops) + { + unsigned long long global_hotplug_seq; +@@ -4533,8 +4552,11 @@ static void check_hotplug_seq(struct scx_sched *sch, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); ++ return -EBUSY; + } + } ++ ++ return 0; + } + + static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) +@@ -4636,7 +4658,11 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) + if (((void (**)(void))ops)[i]) + set_bit(i, sch->has_op); + +- check_hotplug_seq(sch, ops); ++ ret = check_hotplug_seq(sch, ops); ++ if (ret) { ++ cpus_read_unlock(); ++ goto err_disable; ++ } + scx_idle_update_selcpu_topology(ops); + + cpus_read_unlock(); +@@ -6183,6 +6209,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) + kit->dsq = NULL; + } + ++/** ++ * scx_bpf_dsq_peek - Lockless peek at the first element. ++ * @dsq_id: DSQ to examine. ++ * ++ * Read the first element in the DSQ. This is semantically equivalent to using ++ * the DSQ iterator, but is lockfree. Of course, like any lockless operation, ++ * this provides only a point-in-time snapshot, and the contents may change ++ * by the time any subsequent locking operation reads the queue. ++ * ++ * Returns the pointer, or NULL indicates an empty queue OR internal error. ++ */ ++__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) ++{ ++ struct scx_sched *sch; ++ struct scx_dispatch_q *dsq; ++ ++ sch = rcu_dereference(scx_root); ++ if (unlikely(!sch)) ++ return NULL; ++ ++ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { ++ scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); ++ return NULL; ++ } ++ ++ dsq = find_user_dsq(sch, dsq_id); ++ if (unlikely(!dsq)) { ++ scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); ++ return NULL; ++ } ++ ++ return rcu_dereference(dsq->first_task); ++} ++ + __bpf_kfunc_end_defs(); + + static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, +@@ -6740,6 +6800,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) + BTF_ID_FLAGS(func, scx_bpf_kick_cpu) + BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) + BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) ++BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) + BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) + BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) + BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) +diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h +index 06e2551033cb..fbf3e7f9526c 100644 +--- a/tools/sched_ext/include/scx/common.bpf.h ++++ b/tools/sched_ext/include/scx/common.bpf.h +@@ -75,6 +75,7 @@ u32 scx_bpf_reenqueue_local(void) __ksym; + void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; + s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; + void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; ++struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak; + int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; + struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; + void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; +diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h +index dd9144624dc9..467a987880e1 100644 +--- a/tools/sched_ext/include/scx/compat.bpf.h ++++ 
b/tools/sched_ext/include/scx/compat.bpf.h +@@ -130,6 +130,24 @@ int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym + false; \ + }) + ++/* ++ * v6.19: Introduce lockless peek API for user DSQs. ++ * ++ * Preserve the following macro until v6.21. ++ */ ++static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id) ++{ ++ struct task_struct *p = NULL; ++ struct bpf_iter_scx_dsq it; ++ ++ if (bpf_ksym_exists(scx_bpf_dsq_peek)) ++ return scx_bpf_dsq_peek(dsq_id); ++ if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0)) ++ p = bpf_iter_scx_dsq_next(&it); ++ bpf_iter_scx_dsq_destroy(&it); ++ return p; ++} ++ + /** + * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on + * in a compatible way. We will preserve this __COMPAT helper until v6.16. +diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile +index 9d9d6b4c38b0..5fe45f9c5f8f 100644 +--- a/tools/testing/selftests/sched_ext/Makefile ++++ b/tools/testing/selftests/sched_ext/Makefile +@@ -174,6 +174,7 @@ auto-test-targets := \ + minimal \ + numa \ + allowed_cpus \ ++ peek_dsq \ + prog_run \ + reload_loop \ + select_cpu_dfl \ +diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c +new file mode 100644 +index 000000000000..a3faf5bb49d6 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c +@@ -0,0 +1,251 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * A BPF program for testing DSQ operations and peek in particular. ++ * ++ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2025 Ryan Newton ++ */ ++ ++#include ++#include ++ ++char _license[] SEC("license") = "GPL"; ++ ++UEI_DEFINE(uei); /* Error handling */ ++ ++#define MAX_SAMPLES 100 ++#define MAX_CPUS 512 ++#define DSQ_POOL_SIZE 8 ++int max_samples = MAX_SAMPLES; ++int max_cpus = MAX_CPUS; ++int dsq_pool_size = DSQ_POOL_SIZE; ++ ++/* Global variables to store test results */ ++int dsq_peek_result1 = -1; ++long dsq_inserted_pid = -1; ++int insert_test_cpu = -1; /* Set to the cpu that performs the test */ ++long dsq_peek_result2 = -1; ++long dsq_peek_result2_pid = -1; ++long dsq_peek_result2_expected = -1; ++int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */ ++int real_dsq_id = 1235; /* DSQ for normal operation */ ++int enqueue_count = -1; ++int dispatch_count = -1; ++bool debug_ksym_exists; ++ ++/* DSQ pool for stress testing */ ++int dsq_pool_base_id = 2000; ++int phase1_complete = -1; ++long total_peek_attempts = -1; ++long successful_peeks = -1; ++ ++/* BPF map for sharing peek results with userspace */ ++struct { ++ __uint(type, BPF_MAP_TYPE_ARRAY); ++ __uint(max_entries, MAX_SAMPLES); ++ __type(key, u32); ++ __type(value, long); ++} peek_results SEC(".maps"); ++ ++static int get_random_dsq_id(void) ++{ ++ u64 time = bpf_ktime_get_ns(); ++ ++ return dsq_pool_base_id + (time % DSQ_POOL_SIZE); ++} ++ ++static void record_peek_result(long pid) ++{ ++ u32 slot_key; ++ long *slot_pid_ptr; ++ int ix; ++ ++ if (pid <= 0) ++ return; ++ ++ /* Find an empty slot or one with the same PID */ ++ bpf_for(ix, 0, 10) { ++ slot_key = (pid + ix) % MAX_SAMPLES; ++ slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key); ++ if (!slot_pid_ptr) ++ continue; ++ ++ if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) { ++ *slot_pid_ptr = pid; ++ break; ++ } ++ } ++} ++ ++/* Scan all DSQs in the pool and try to move a task to local */ ++static int scan_dsq_pool(void) ++{ ++ struct task_struct *task; ++ 
int moved = 0; ++ int i; ++ ++ bpf_for(i, 0, DSQ_POOL_SIZE) { ++ int dsq_id = dsq_pool_base_id + i; ++ ++ total_peek_attempts++; ++ ++ task = __COMPAT_scx_bpf_dsq_peek(dsq_id); ++ if (task) { ++ successful_peeks++; ++ record_peek_result(task->pid); ++ ++ /* Try to move this task to local */ ++ if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) { ++ moved = 1; ++ break; ++ } ++ } ++ } ++ return moved; ++} ++ ++/* Struct_ops scheduler for testing DSQ peek operations */ ++void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ struct task_struct *peek_result; ++ int last_insert_test_cpu, cpu; ++ ++ enqueue_count++; ++ cpu = bpf_get_smp_processor_id(); ++ last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu); ++ ++ /* Phase 1: Simple insert-then-peek test (only on first task) */ ++ if (last_insert_test_cpu == -1) { ++ bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu); ++ ++ /* Test 1: Peek empty DSQ - should return NULL */ ++ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); ++ dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */ ++ ++ /* Test 2: Insert task into test DSQ for testing in dispatch callback */ ++ dsq_inserted_pid = p->pid; ++ scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags); ++ dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */ ++ } else if (!phase1_complete) { ++ /* Still in phase 1, use real DSQ */ ++ scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags); ++ } else { ++ /* Phase 2: Random DSQ insertion for stress testing */ ++ int random_dsq_id = get_random_dsq_id(); ++ ++ scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags); ++ } ++} ++ ++void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev) ++{ ++ dispatch_count++; ++ ++ /* Phase 1: Complete the simple peek test if we inserted a task but ++ * haven't tested peek yet ++ */ ++ if (insert_test_cpu == cpu && dsq_peek_result2 == -1) { ++ struct task_struct *peek_result; ++ ++ bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu); ++ ++ /* Test 3: Peek DSQ after insert - should return the task we inserted */ ++ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); ++ /* Store the PID of the peeked task for comparison */ ++ dsq_peek_result2 = (long)peek_result; ++ dsq_peek_result2_pid = peek_result ? peek_result->pid : -1; ++ ++ /* Now consume the task since we've peeked at it */ ++ scx_bpf_dsq_move_to_local(test_dsq_id); ++ ++ /* Mark phase 1 as complete */ ++ phase1_complete = 1; ++ bpf_printk("Phase 1 complete, starting phase 2 stress testing"); ++ } else if (!phase1_complete) { ++ /* Still in phase 1, use real DSQ */ ++ scx_bpf_dsq_move_to_local(real_dsq_id); ++ } else { ++ /* Phase 2: Scan all DSQs in the pool and try to move a task */ ++ if (!scan_dsq_pool()) { ++ /* No tasks found in DSQ pool, fall back to real DSQ */ ++ scx_bpf_dsq_move_to_local(real_dsq_id); ++ } ++ } ++} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init) ++{ ++ s32 err; ++ int i; ++ ++ /* Always set debug values so we can see which version we're using */ ++ debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 
1 : 0; ++ ++ /* Initialize state first */ ++ insert_test_cpu = -1; ++ enqueue_count = 0; ++ dispatch_count = 0; ++ phase1_complete = 0; ++ total_peek_attempts = 0; ++ successful_peeks = 0; ++ ++ /* Create the test and real DSQs */ ++ err = scx_bpf_create_dsq(test_dsq_id, -1); ++ if (err) { ++ scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); ++ return err; ++ } ++ err = scx_bpf_create_dsq(real_dsq_id, -1); ++ if (err) { ++ scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); ++ return err; ++ } ++ ++ /* Create the DSQ pool for stress testing */ ++ bpf_for(i, 0, DSQ_POOL_SIZE) { ++ int dsq_id = dsq_pool_base_id + i; ++ ++ err = scx_bpf_create_dsq(dsq_id, -1); ++ if (err) { ++ scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err); ++ return err; ++ } ++ } ++ ++ /* Initialize the peek results map */ ++ bpf_for(i, 0, MAX_SAMPLES) { ++ u32 key = i; ++ long pid = -1; ++ ++ bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY); ++ } ++ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei) ++{ ++ int i; ++ ++ /* Destroy the primary DSQs */ ++ scx_bpf_destroy_dsq(test_dsq_id); ++ scx_bpf_destroy_dsq(real_dsq_id); ++ ++ /* Destroy the DSQ pool */ ++ bpf_for(i, 0, DSQ_POOL_SIZE) { ++ int dsq_id = dsq_pool_base_id + i; ++ ++ scx_bpf_destroy_dsq(dsq_id); ++ } ++ ++ UEI_RECORD(uei, ei); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops peek_dsq_ops = { ++ .enqueue = (void *)peek_dsq_enqueue, ++ .dispatch = (void *)peek_dsq_dispatch, ++ .init = (void *)peek_dsq_init, ++ .exit = (void *)peek_dsq_exit, ++ .name = "peek_dsq", ++}; +diff --git a/tools/testing/selftests/sched_ext/peek_dsq.c b/tools/testing/selftests/sched_ext/peek_dsq.c +new file mode 100644 +index 000000000000..a717384a3224 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/peek_dsq.c +@@ -0,0 +1,224 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Test for DSQ operations including create, destroy, and peek operations. ++ * ++ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2025 Ryan Newton ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "peek_dsq.bpf.skel.h" ++#include "scx_test.h" ++ ++#define NUM_WORKERS 4 ++ ++static bool workload_running = true; ++static pthread_t workload_threads[NUM_WORKERS]; ++ ++/** ++ * Background workload thread that sleeps and wakes rapidly to exercise ++ * the scheduler's enqueue operations and ensure DSQ operations get tested. 
++ */ ++static void *workload_thread_fn(void *arg) ++{ ++ while (workload_running) { ++ /* Sleep for a very short time to trigger scheduler activity */ ++ usleep(1000); /* 1ms sleep */ ++ /* Yield to ensure we go through the scheduler */ ++ sched_yield(); ++ } ++ return NULL; ++} ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct peek_dsq *skel; ++ ++ skel = peek_dsq__open(); ++ SCX_FAIL_IF(!skel, "Failed to open"); ++ SCX_ENUM_INIT(skel); ++ SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel"); ++ ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name) ++{ ++ long count = 0; ++ ++ printf("Observed %s DSQ peek pids:\n", dsq_name); ++ for (int i = 0; i < max_samples; i++) { ++ long pid; ++ int err; ++ ++ err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid); ++ if (err == 0) { ++ if (pid == 0) { ++ printf(" Sample %d: NULL peek\n", i); ++ } else if (pid > 0) { ++ printf(" Sample %d: pid %ld\n", i, pid); ++ count++; ++ } ++ } else { ++ printf(" Sample %d: error reading pid (err=%d)\n", i, err); ++ } ++ } ++ printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name); ++ return count; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct peek_dsq *skel = ctx; ++ bool failed = false; ++ int seconds = 3; ++ int err; ++ ++ /* Enable the scheduler to test DSQ operations */ ++ printf("Enabling scheduler to test DSQ insert operations...\n"); ++ ++ struct bpf_link *link = ++ bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops); ++ ++ if (!link) { ++ SCX_ERR("Failed to attach struct_ops"); ++ return SCX_TEST_FAIL; ++ } ++ ++ printf("Starting %d background workload threads...\n", NUM_WORKERS); ++ workload_running = true; ++ for (int i = 0; i < NUM_WORKERS; i++) { ++ err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL); ++ if (err) { ++ SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err)); ++ /* Stop already created threads */ ++ workload_running = false; ++ for (int j = 0; j < i; j++) ++ pthread_join(workload_threads[j], NULL); ++ bpf_link__destroy(link); ++ return SCX_TEST_FAIL; ++ } ++ } ++ ++ printf("Waiting for enqueue events.\n"); ++ sleep(seconds); ++ while (skel->data->enqueue_count <= 0) { ++ printf("."); ++ fflush(stdout); ++ sleep(1); ++ seconds++; ++ if (seconds >= 30) { ++ printf("\n\u2717 Timeout waiting for enqueue events\n"); ++ /* Stop workload threads and cleanup */ ++ workload_running = false; ++ for (int i = 0; i < NUM_WORKERS; i++) ++ pthread_join(workload_threads[i], NULL); ++ bpf_link__destroy(link); ++ return SCX_TEST_FAIL; ++ } ++ } ++ ++ workload_running = false; ++ for (int i = 0; i < NUM_WORKERS; i++) { ++ err = pthread_join(workload_threads[i], NULL); ++ if (err) { ++ SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err)); ++ bpf_link__destroy(link); ++ return SCX_TEST_FAIL; ++ } ++ } ++ printf("Background workload threads stopped.\n"); ++ ++ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); ++ ++ /* Detach the scheduler */ ++ bpf_link__destroy(link); ++ ++ printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds, ++ skel->data->enqueue_count, skel->data->dispatch_count); ++ printf("Debug: ksym_exists=%d\n", ++ skel->bss->debug_ksym_exists); ++ ++ /* Check DSQ insert result */ ++ printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu); ++ if (skel->data->insert_test_cpu != -1) ++ printf("\u2713 DSQ insert succeeded !\n"); ++ else { ++ printf("\u2717 DSQ insert failed or 
not attempted\n"); ++ failed = true; ++ } ++ ++ /* Check DSQ peek results */ ++ printf(" DSQ peek result 1 (before insert): %d\n", ++ skel->data->dsq_peek_result1); ++ if (skel->data->dsq_peek_result1 == 0) ++ printf("\u2713 DSQ peek verification success: peek returned NULL!\n"); ++ else { ++ printf("\u2717 DSQ peek verification failed\n"); ++ failed = true; ++ } ++ ++ printf(" DSQ peek result 2 (after insert): %ld\n", ++ skel->data->dsq_peek_result2); ++ printf(" DSQ peek result 2, expected: %ld\n", ++ skel->data->dsq_peek_result2_expected); ++ if (skel->data->dsq_peek_result2 == ++ skel->data->dsq_peek_result2_expected) ++ printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n"); ++ else { ++ printf("\u2717 DSQ peek verification failed\n"); ++ failed = true; ++ } ++ ++ printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid); ++ printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid); ++ ++ int pid_count; ++ ++ pid_count = print_observed_pids(skel->maps.peek_results, ++ skel->data->max_samples, "DSQ pool"); ++ printf("Total non-null peek observations: %ld out of %ld\n", ++ skel->data->successful_peeks, skel->data->total_peek_attempts); ++ ++ if (skel->bss->debug_ksym_exists && pid_count == 0) { ++ printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n"); ++ failed = true; ++ } ++ if (skel->bss->debug_ksym_exists && pid_count > 0) ++ printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n"); ++ ++ if (failed) ++ return SCX_TEST_FAIL; ++ else ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct peek_dsq *skel = ctx; ++ ++ if (workload_running) { ++ workload_running = false; ++ for (int i = 0; i < NUM_WORKERS; i++) ++ pthread_join(workload_threads[i], NULL); ++ } ++ ++ peek_dsq__destroy(skel); ++} ++ ++struct scx_test peek_dsq = { ++ .name = "peek_dsq", ++ .description = ++ "Test DSQ create/destroy operations and future peek functionality", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&peek_dsq) +-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch.skip new file mode 100644 index 0000000..2ac2c2f --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch.skip @@ -0,0 +1,654 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 46062169AD2 + for ; Sat, 11 Oct 2025 18:18:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206703; cv=none; b=RpWLRsxlJTzhlJSNJ6YDnnOidsJ7oCIJ0QG0EXS7VFoOFFRWiuWYlsET6M5MjOkyE+dnQih3vxbVtcm+li+EdUZBeyP5FVticeDHkmuoWPHZblewToySaE5iRFgZqZZMrF2/g7ww+IHVQ3wb1PmaWoyqrDBaIo5To0g72h92TRE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206703; c=relaxed/simple; + bh=eCGUZmunSjVOsoqwxe8kKF4T+jrOyKsftgZkbuwe1Jo=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; 
b=QJa7XLmNRAgs2IV6jX9+J3RTiz2TA7hXn5NgC4yjWKV75coBs2eumwHZZgG2HlZqrxNZy2yyHAMM73rFnrDZIvG+RpHWxcfbJopVHrre/vMQ3HJJFjQUmhaAwWCfX+5CuF2S3mkLLbQPk1FwQMpFRQzmQi7ZRNOguwaR+/BIBvQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=fA7dEfIE; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="fA7dEfIE" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206701; x=1791742701; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=eCGUZmunSjVOsoqwxe8kKF4T+jrOyKsftgZkbuwe1Jo=; + b=fA7dEfIE91ULN1jqc64owLAysrWyqWsDA5nuO1+sgcIA15Yn8yYj6iw4 + 55VPKl3g+xYXhPmGyE7a0LZvFUc9YG3ckmUpqO0pvf6oo1RJcM13mS3yi + KNsM4bbd9aFpNPTftzZGqryw94QrGirzar7JNUNOk0MJqRkziOVPLHnOi + iVfGn7SOaI4LzDDzlorOXwaeFstT3f2UVe0Cr2vAWBdxYyDop0Z+G9hqb + BhSDn+aeXU8OqAYP/xGpt3Ce8cbnDhTJhA+r5jzej1xMspSEeS1p/SQOm + slC+k3w/mm9HPugo6aL39ZyshlQHrAN4qvnJBJT/5GnR6bFHs9O0IKtHz + w==; +X-CSE-ConnectionGUID: AwkM8kCOR6yXxOyCyDBj4Q== +X-CSE-MsgGUID: FBEmDsF5QKC61vf0MqpBmQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339614" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339614" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:19 -0700 +X-CSE-ConnectionGUID: HGgPT3dBQFm59TiA7l3rfA== +X-CSE-MsgGUID: SlOHviQzSgGRjsbScX9f4Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487181" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:19 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 01/19] sched/fair: Add infrastructure for cache-aware load balancing +Date: Sat, 11 Oct 2025 11:24:38 -0700 +Message-Id: <865b852e3fdef6561c9e0a5be9a94aec8a68cdea.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: "Peter Zijlstra (Intel)" + +Cache-aware load balancing aims to aggregate tasks with potential +shared resources into the same cache domain. This approach enhances +cache locality, thereby optimizing system performance by reducing +cache misses and improving data access efficiency. + +In the current implementation, threads within the same process are +considered as entities that potentially share resources. 
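For orientation, a user-space toy model of the per-process occupancy bookkeeping described in the remainder of this changelog (and implemented in the fair.c hunks further below); struct toy_mm, NR_LLC and the helper names are illustrative stand-ins, not identifiers from this patch:

#include <stdio.h>

#define NR_LLC 4

struct toy_mm {
	unsigned long long runtime[NR_LLC];	/* decayed per-LLC runtime of the process */
	int preferred_llc;			/* -1 when no preference */
};

/* Once per epoch, halve every LLC's accumulated runtime (geometric decay). */
static void toy_epoch_decay(struct toy_mm *mm)
{
	for (int i = 0; i < NR_LLC; i++)
		mm->runtime[i] >>= 1;
}

/* Charge a thread's execution time to the LLC it ran on. */
static void toy_account(struct toy_mm *mm, int llc, unsigned long long delta)
{
	mm->runtime[llc] += delta;
}

/* Prefer the LLC where the process has recently run the most. */
static void toy_pick_preferred(struct toy_mm *mm)
{
	unsigned long long best = 0;

	mm->preferred_llc = -1;
	for (int i = 0; i < NR_LLC; i++) {
		if (mm->runtime[i] > best) {
			best = mm->runtime[i];
			mm->preferred_llc = i;
		}
	}
}

int main(void)
{
	struct toy_mm mm = { .preferred_llc = -1 };

	toy_account(&mm, 2, 800);	/* most threads ran on LLC 2 */
	toy_account(&mm, 0, 100);
	toy_epoch_decay(&mm);
	toy_pick_preferred(&mm);
	printf("preferred LLC: %d\n", mm.preferred_llc);	/* prints 2 */
	return 0;
}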
+Cache-aware load balancing monitors the CPU occupancy of each cache +domain for every process. Based on this monitoring, it endeavors to +migrate threads within a given process to its cache-hot domains, +with the goal of maximizing cache locality. + +It is an attempt at modelling cache affinity. While the patch series +only targets LLC, it could very well be extended to clusters (L2), +or other kind of domains grouping inside a node. + +As it stands, the mechanism only computes a CPU within the LLC that +has the highest recent runtime; this CPU is then used in the load +balance path in subsequent patches to steer toward this LLC. + +More elaborate measures could be added later in NUMA_BALANCING: for +example, migrating task A to its preferred LLC when it has spare CPU +capacity, or swapping task A with another running task B in task A’s +preferred LLC. + +Originally-by: Peter Zijlstra (Intel) +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/mm_types.h | 44 ++++++ + include/linux/sched.h | 4 + + init/Kconfig | 11 ++ + kernel/fork.c | 6 + + kernel/sched/core.c | 6 + + kernel/sched/fair.c | 288 +++++++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 1 + + kernel/sched/sched.h | 8 ++ + 8 files changed, 368 insertions(+) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 08bc2442db93..3ca557c2f36d 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -927,6 +927,11 @@ struct mm_cid { + }; + #endif + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1017,6 +1022,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1436,6 +1452,34 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f8188b833350..d7ddb7ce6c4b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1400,6 +1400,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +diff --git a/init/Kconfig b/init/Kconfig +index e3eb63eadc87..4e625db7920a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -970,6 +970,17 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware load balance" ++ default y ++ depends on SMP ++ help ++ When enabled, the scheduler will attempt to aggregate tasks from ++ the same process onto a single Last Level Cache (LLC) domain when ++ possible. This improves cache locality by keeping tasks that share ++ resources within the same cache domain, reducing cache misses and ++ lowering data access latency. ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index c4ada32598bd..9cd6efe2926d 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -680,6 +680,7 @@ void __mmdrop(struct mm_struct *mm) + cleanup_lazy_tlbs(mm); + + WARN_ON_ONCE(mm == current->active_mm); ++ mm_destroy_sched(mm); + mm_free_pgd(mm); + mm_free_id(mm); + destroy_context(mm); +@@ -1079,6 +1080,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1088,6 +1092,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index be00629f0ba4..79d15e904d12 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4520,6 +4520,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8821,6 +8822,11 @@ void __init sched_init(void) + + rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = jiffies; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b173a059315c..a2ea002f4fd6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1152,6 +1152,8 @@ void post_init_entity_util_avg(struct task_struct *p) + sa->runnable_avg = 
sa->util_avg; + } + ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); ++ + static s64 update_se(struct rq *rq, struct sched_entity *se) + { + u64 now = rq_clock_task(rq); +@@ -1174,6 +1176,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); ++ account_mm_sched(rq, donor, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); +@@ -1193,6 +1196,289 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + return delta_exec; + } + ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. ++ */ ++#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ ++#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ ++ ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = rq->cpu_epoch; ++ epoch = rq->cpu_epoch; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ /* ++ * The update to mm->pcpu_sched should not be reordered ++ * before initialization to mm's other fields, in case ++ * the readers may get invalid mm_sched_epoch, etc. ++ */ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. 
++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (p->sched_class != &fair_sched_class) ++ return; ++ /* ++ * init_task and kthreads don't having mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate its preferred state. ++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ get_nr_threads(p) <= 1) { ++ if (mm->mm_sched_cpu != -1) ++ mm->mm_sched_cpu = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, ++ int pref_nid, int curr_cpu) ++{ ++#ifdef CONFIG_NUMA_BALANCING ++ /* First honor the task's preferred node. */ ++ if (pref_nid != NUMA_NO_NODE) ++ cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); ++#endif ++ ++ /* Next honor the task's cache CPU if it is not included. */ ++ if (cache_cpu != -1 && !cpumask_test_cpu(cache_cpu, cpus)) ++ cpumask_or(cpus, cpus, ++ cpumask_of_node(cpu_to_node(cache_cpu))); ++ ++ /* ++ * Lastly make sure that the task's current running node is ++ * considered. ++ */ ++ if (!cpumask_test_cpu(curr_cpu, cpus)) ++ cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); ++} ++ ++static void __no_profile task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ unsigned long curr_m_a_occ = 0; ++ int cpu, m_a_cpu = -1, cache_cpu, ++ pref_nid = NUMA_NO_NODE, curr_cpu; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ curr_cpu = task_cpu(p); ++ cache_cpu = mm->mm_sched_cpu; ++#ifdef CONFIG_NUMA_BALANCING ++ if (static_branch_likely(&sched_numa_balancing)) ++ pref_nid = p->numa_preferred_nid; ++#endif ++ ++ scoped_guard (cpus_read_lock) { ++ get_scan_cpumasks(cpus, cache_cpu, ++ pref_nid, curr_cpu); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, i; ++ ++ if (!sd) ++ continue; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ } ++ ++ /* ++ * Compare the accumulated occupancy of each LLC. 
The ++ * reason for using accumulated occupancy rather than average ++ * per CPU occupancy is that it works better in asymmetric LLC ++ * scenarios. ++ * For example, if there are 2 threads in a 4CPU LLC and 3 ++ * threads in an 8CPU LLC, it might be better to choose the one ++ * with 3 threads. However, this would not be the case if the ++ * occupancy is divided by the number of CPUs in an LLC (i.e., ++ * if average per CPU occupancy is used). ++ * Besides, NUMA balancing fault statistics behave similarly: ++ * the total number of faults per node is compared rather than ++ * the average number of faults per CPU. This strategy is also ++ * followed here. ++ */ ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ curr_m_a_occ = a_occ; ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ if (m_a_occ > (2 * curr_m_a_occ)) { ++ /* ++ * Avoid switching mm_sched_cpu too fast. ++ * The reason to choose 2X is because: ++ * 1. It is better to keep the preferred LLC stable, ++ * rather than changing it frequently and cause migrations ++ * 2. 2X means the new preferred LLC has at least 1 more ++ * busy CPU than the old one(200% vs 100%, eg) ++ * 3. 2X is chosen based on test results, as it delivers ++ * the optimal performance gain so far. ++ */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ + /* + * Used by other classes to account runtime. + */ +@@ -13031,6 +13317,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 3c12d9f93331..d2af7bfd36bf 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_UTIL, true) + ++SCHED_FEAT(SCHED_CACHE, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index be9745d104f7..2ded8d3d0ecc 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1166,6 +1166,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3790,6 +3796,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + static inline +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip new file mode 100644 index 0000000..cbf16ce --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip @@ -0,0 +1,227 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 19068204096 + for ; Sat, 11 Oct 2025 18:18:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206704; cv=none; b=EzlLh3pSj7Y4f8RITAS280jAzGdfSil0Uvmf2s0iDBWXhjbTN9kKcwe8yCBI8vI/kpxwAU/q6SDZiBXRODyVXxt+x1ZEHGNytyNVJ+14VdLcKLUF/bWqEXXojGdMU1nZFeYor5k/Gwn2eBMXY7mjVq+req3REwzEV/z7PNxWJYU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206704; c=relaxed/simple; + bh=BGRV8Sqvoh/cH0/obDDFWGIX+d3J6kT5RHYq4DeXAFQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; b=naTQ9gtxsiPYap1e7sRA67shhCjtvQU5+UWYPmFmFnsa1NV0CLod+8tcKlUn52BHYuXFMHk+KQi3AhpPSOC+Tysfot4R/EhnOjDucwfpslAmfKl+rwCfOrGMnq3fjOG/h3r7EnuLxz8dxpUfqriJzedrFrStvfO37iAPvvF5HVg= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=LSwa/WAK; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="LSwa/WAK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206702; x=1791742702; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=BGRV8Sqvoh/cH0/obDDFWGIX+d3J6kT5RHYq4DeXAFQ=; + b=LSwa/WAKvGAX6RIYpQ7iNqrlvhm/Szlkb5ZlWCgbajQDsBhTiTWg/PPi + Nxj6VEs7MSoZptgkIvxX8jl3FQca3deDnRuhlinmaGbJYu3LY3ZP4p3jp + 
4+hBugKd3GkfwcLlWr+3IrP84r9gwdtMmKlDccI1G07f4s4tirTBoEDsm + gJ8uA3qrKlx1xYMf/sgz5udiByo4NeRPGdBdJ+bYBTDvNTGeTE9k4bBmi + 0OuSxEI9YhInAS8s2mr8VnpZwUVjixmAO4g6ZwRHW42PucNrjAj/v7YoU + sfJ1aDaIb4/pD7oTExOcJxChABHQZAXGQ1b9F1jBoWdX4w8mb0HwbQJ+I + A==; +X-CSE-ConnectionGUID: V6kqtIYCR06jkGZvnWCLsQ== +X-CSE-MsgGUID: XSPXCIWWQjiVjOSNzEq1Ow== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339631" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339631" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:20 -0700 +X-CSE-ConnectionGUID: wcTW2V7hQHun3H1J8na2Fw== +X-CSE-MsgGUID: zfpr8MStR5yuJxzDpmsnpw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487184" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:20 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 02/19] sched/fair: Record per-LLC utilization to guide cache-aware scheduling decisions +Date: Sat, 11 Oct 2025 11:24:39 -0700 +Message-Id: <7684e7381c61a2a0d0580790340d4daa5349e48c.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +When a system becomes busy and a process’s preferred LLC is +saturated with too many threads, tasks within that LLC migrate +frequently. These in LLC migrations introduce latency and degrade +performance. To avoid this, task aggregation should be suppressed when +the preferred LLC is overloaded, which requires a metric to indicate +LLC utilization. + +Record per LLC utilization/cpu capacity during periodic load +balancing. These statistics will be used in later patches to decide +whether tasks should be aggregated into their preferred LLC. 
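As a rough sketch of how these recorded statistics are meant to be consumed by the later patches in this series (which introduce a ~50% "busy LLC" threshold), restated in user space; struct llc_stats and llc_is_busy() are illustrative names only:

#include <stdbool.h>
#include <stdio.h>

struct llc_stats {
	unsigned long util;	/* aggregated group_util of the LLC */
	unsigned long cap;	/* aggregated CPU capacity of the LLC */
};

/* Treat an LLC as busy once its utilization crosses half of its capacity. */
static bool llc_is_busy(const struct llc_stats *s)
{
	return s->util * 100 >= s->cap * 50;
}

int main(void)
{
	struct llc_stats llc = { .util = 700, .cap = 1024 };	/* ~68% utilized */

	printf("busy: %d\n", llc_is_busy(&llc));	/* prints 1 */
	return 0;
}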
+ +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/sched/topology.h | 4 ++ + kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++++++ + 2 files changed, 77 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 5263746b63e8..fa25db00fdb6 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -77,6 +77,10 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++ unsigned long capacity ____cacheline_aligned_in_smp; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a2ea002f4fd6..1ebb0d99a906 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9559,6 +9559,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* Called from load balancing paths with rcu_read_lock held */ ++static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = READ_ONCE(sd_share->capacity); ++ ++ return true; ++} ++#else ++static inline bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ return false; ++} ++#endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +@@ -10529,6 +10552,55 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Record the statistics for this scheduler group for later ++ * use. These values guide load balancing on aggregating tasks ++ * to a LLC. ++ */ ++static void record_sg_llc_stats(struct lb_env *env, ++ struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ /* ++ * Find the child domain on env->dst_cpu. This domain ++ * is either the domain that spans this group(if the ++ * group is a local group), or the sibling domain of ++ * this group. ++ */ ++ struct sched_domain *sd = env->sd->child; ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* only care about sched domains spanning a LLC */ ++ if (sd != rcu_dereference(per_cpu(sd_llc, env->dst_cpu))) ++ return; ++ ++ /* ++ * At this point we know this group spans a LLC domain. ++ * Record the statistic of this group in its corresponding ++ * shared LLC domain. ++ */ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (READ_ONCE(sd_share->util_avg) != sgs->group_util) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++ ++ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) ++ WRITE_ONCE(sd_share->capacity, sgs->group_capacity); ++} ++#else ++static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10618,6 +10690,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ record_sg_llc_stats(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip new file mode 100644 index 0000000..eb1895b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip @@ -0,0 +1,335 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2F9012652B7 + for ; Sat, 11 Oct 2025 18:18:22 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206704; cv=none; b=oUXwn7ZLltUxrcsLLRQdMkG+rOj3I6N99RIlDJViVMyN84ZxeHx7+Ziq9zOEmnN6HNfk258hdIef+3nAkETeBkCnWEbZ8Lcj64n3OoXf0SrXkICA1KPwc1TZ230lpQNfogVeErSJlu4VOhrgueBPexZRP8Ng8MlzAqpdxuV0fQw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206704; c=relaxed/simple; + bh=ogKsGwPqpnqTItrkOZHQAqJw6k94DPs+hyTioL4d/Ig=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=h155j6xc5cDWdV6bfIecXus0Znq8M6zidqbVhtVjeQT/UoiHcyIrY8v1abXoVw27R0/39P2bQUH4GyYEjMOV8PSTvlLp8J+kYh4mcI1SSe5ftkudSs2ubZG59uaM4B6xXwz85tEAhPwwNkRLqFlmW7J/wyi3Ynw+ec/ie7a3Ft4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=n7smfE6o; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="n7smfE6o" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206702; x=1791742702; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ogKsGwPqpnqTItrkOZHQAqJw6k94DPs+hyTioL4d/Ig=; + b=n7smfE6oCjv1Z9pv/7dg2JDtqoMwaTw0XnoJhqh6krIk55XD846r100l + CQyKNCviKGlIlQvhs/a27sgH4IgQduwhbRn6XT0KlUibkjI+C8DxLau1W + bQGlFOBkWVF6N/GWfn6y0ss98uylK337lt84xU7aPoM+QWTzjR+VkOrKT + 0bIzxevMwLmEG4vuOleJ69vSQP6G0PZSGpGrTBTnbFEemOJQO4Ufh8Z3S + CBvnKym+IUG+WQx9TQa+cFfFXkPxhSkobYj2dyGq+CWyc4oBsOiaaIfuN + mb6/NAGjVnTGTjlIsC3a7QsDovld1JkhMvVnrniOZGCbMVHv6vrIMp6no + g==; +X-CSE-ConnectionGUID: y8Q0FIVVTeyqh+iA7G7QGw== +X-CSE-MsgGUID: NHnFhDxxRvChXLKbODkIZw== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339652" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339652" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with 
ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:21 -0700 +X-CSE-ConnectionGUID: r3BrcjKDSJONY4pZr3YdUQ== +X-CSE-MsgGUID: 9FSjHRHPTQWyN3aom4KQIA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487189" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:21 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 03/19] sched/fair: Introduce helper functions to enforce LLC migration policy +Date: Sat, 11 Oct 2025 11:24:40 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Cache-aware scheduling aggregates threads onto their preferred LLC, +mainly through load balancing. When the preferred LLC becomes +saturated, more threads are still placed there, increasing latency. +A mechanism is needed to limit aggregation so that the preferred LLC +does not become overloaded. + +Introduce helper functions can_migrate_llc() and +can_migrate_llc_task() to enforce the LLC migration policy: + + 1. Aggregate a task to its preferred LLC if both source and + destination LLCs are not too busy (<50% utilization, tunable), + or if doing so will not leave the preferred LLC much more + imbalanced than the non-preferred one (>20% utilization + difference, tunable, similar to imbalance_pct of the LLC domain). + 2. Allow moving a task from overloaded preferred LLC to a non preferred + LLC if this will not cause the non preferred LLC to become + too imbalanced to cause a later migration back. + 3. If both LLCs are too busy, let the generic load balance to spread + the tasks. + +This hysteresis prevents tasks from being migrated into and out of the +preferred LLC frequently (back and forth): the threshold for migrating +a task out of its preferred LLC is higher than that for migrating it +into the LLC. + +Since aggregation tends to make the preferred LLC busier than others, +the imbalance tolerance is controlled by llc_imb_pct. If set to 0, +tasks may still aggregate to the preferred LLC as long as it is +not more utilized than the source LLC, preserving the preference. 
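A short worked example of the default thresholds described above (50% overload, 20% imbalance), restated as stand-alone helpers with made-up numbers; the real fits_llc_capacity()/util_greater() macros and can_migrate_llc() live in the fair.c hunk below:

#include <stdbool.h>
#include <stdio.h>

/* Below the 50% mark the LLC is not considered busy. */
static bool fits_llc(unsigned long util, unsigned long cap)
{
	return util * 100 < cap * 50;
}

/* True when 'a' exceeds 'b' by more than the 20% imbalance allowance. */
static bool much_busier(unsigned long a, unsigned long b)
{
	return a * 100 > b * (100 + 20);
}

int main(void)
{
	unsigned long cap = 1024;	/* capacity of one LLC */

	/* Preferred dst at ~40%: still fits, so aggregation is allowed. */
	printf("aggregate: %d\n", fits_llc(410, cap));

	/* Preferred dst at ~60% vs src at ~30%: busy and >20% above src, forbid. */
	printf("forbid:    %d\n", !fits_llc(614, cap) && much_busier(614, 307));
	return 0;
}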
+ +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 145 +++++++++++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 5 ++ + 3 files changed, 154 insertions(+) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 02e16b70a790..57bb04ebbf96 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -523,6 +523,10 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct); ++ debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct); ++#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1ebb0d99a906..cd080468ddc9 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + #define EPOCH_PERIOD (HZ / 100) /* 10 ms */ + #define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ + ++__read_mostly unsigned int llc_overload_pct = 50; ++__read_mostly unsigned int llc_imb_pct = 20; ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -9560,6 +9563,27 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + } + + #ifdef CONFIG_SCHED_CACHE ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter llc_overload_pct determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * llc_overload_pct) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + llc_imb_pct)) ++ + /* Called from load balancing paths with rcu_read_lock held */ + static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +@@ -9575,6 +9599,127 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + + return true; + } ++ ++/* ++ * Decision matrix according to the LLC utilization. To ++ * decide whether we can do task aggregation across LLC. ++ * ++ * By default, 50% is the threshold to treat the LLC as busy, ++ * and 20% is the utilization imbalance percentage to decide ++ * if the preferred LLC is busier than the non-preferred LLC. ++ * ++ * 1. moving towards the preferred LLC, dst is the preferred ++ * LLC, src is not. ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% Y Y Y N ++ * 40% Y Y Y Y ++ * 50% Y Y G G ++ * 60% Y Y G G ++ * ++ * 2. moving out of the preferred LLC, src is the preferred ++ * LLC, dst is not: ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% N N N N ++ * 40% N N N N ++ * 50% N N G G ++ * 60% Y N G G ++ * ++ * src : src_util ++ * dst : dst_util ++ * Y : Yes, migrate ++ * N : No, do not migrate ++ * G : let the Generic load balance to even the load. ++ * ++ * The intention is that if both LLCs are quite busy, cache aware ++ * load balance should not be performed, and generic load balance ++ * should take effect. 
However, if one is busy and the other is not, ++ * the preferred LLC capacity(50%) and imbalance criteria(20%) should ++ * be considered to determine whether LLC aggregation should be ++ * performed to bias the load towards the preferred LLC. ++ */ ++ ++/* migration decision, 3 states are orthogonal. */ ++enum llc_mig { ++ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ ++ mig_llc, /* Y: Do LLC preference based migration */ ++ mig_unrestricted /* G: Don't restrict generic load balance migration */ ++}; ++ ++static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_unrestricted; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_unrestricted; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * llc_imb_pct is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. ++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_llc; ++} ++ ++/* ++ * Check if task p can migrate from src_cpu to dst_cpu ++ * in terms of cache aware load balance. 
++ */ ++static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ bool to_pref; ++ int cpu; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_unrestricted; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_unrestricted; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ to_pref = true; ++ else if (cpus_share_cache(src_cpu, cpu)) ++ to_pref = false; ++ else ++ return mig_unrestricted; ++ ++ return can_migrate_llc(src_cpu, dst_cpu, ++ task_util(p), to_pref); ++} ++ + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 2ded8d3d0ecc..a52c96064b36 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2797,6 +2797,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int llc_overload_pct; ++extern unsigned int llc_imb_pct; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch.skip new file mode 100644 index 0000000..233f3fe --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch.skip @@ -0,0 +1,208 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD9FE27F75F + for ; Sat, 11 Oct 2025 18:18:23 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206705; cv=none; b=toy7mYgrkMShyfYM+pYJVnlk2kT96KNiv5DNY2SPeZNG+C4hUMbzxW+QMLoY5P4G0gxMEqPJZD1oRcx17kku+G6SaznXM9qHf6TbjE3y6E+5eW6mFGs9F7x17MH+po42oQIBeMuQONsrqKSl7XLcK2ag8qWKJC1Xr5w/c8efzqg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206705; c=relaxed/simple; + bh=DdyW/r2KQaOAUhZji+A8n5cKTc9SCv7SgRP3P8o/I+A=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=BrQQpH91F+AYLu9pNsP5vrblllGBIiYSrf9Tqy9EYC4wS0n0udak+gKeFf8J19+3f0P2Q81tPIF74K0DC5ETs6YeanXYBydnXlUojA//lO1O300HBm7E4ONxjKjmsrUvcSI3JT5Le3EHo8kdx7whhv843/P3GIna7MP3njXDV14= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BHqKXCIn; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BHqKXCIn" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206704; x=1791742704; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=DdyW/r2KQaOAUhZji+A8n5cKTc9SCv7SgRP3P8o/I+A=; + b=BHqKXCInpJ9FMs87LCbtbTr8sCx+I94vOdw+YhnA01VGi2y2vrviHuha + 44dYUBEYMQCSqJ0LZTT2V+2kshxkaTOgIYxGLcnue8xZcdvJE+tFA1vNK + e3l/bHsCjqNkzuXBC7xQTcdlcOk0RWIbIkbhlcUaSh6K3yuxlVHUHJcmE + r0xmWO+olPuADPa5P30u0Ohf3HcjIqBXZsxBvV5VI21iprKzNU2fqZx7i + dnB6Mbk+VkrpWYKhn8UVMBHAO40Hwj1qg7dTaTpQfAWXx8+nbbBZeHxKl + 1QcSW4+uLMzTxhbUTINvxL6mxdB/i7FkzCBGLbgZ013YwkDLFD2+4CBnX + w==; +X-CSE-ConnectionGUID: XU0Bp+klQCiSCfmyOaBeOA== +X-CSE-MsgGUID: qUdy5aE4QB+ndas2O3JrjQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339674" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339674" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:22 -0700 +X-CSE-ConnectionGUID: veyEE6PBTGirh+PomEioDQ== +X-CSE-MsgGUID: eht/GZN/S/ekMdaQtDO0ag== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487193" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:22 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 04/19] sched/fair: Introduce a static key to enable cache aware only for multi LLCs +Date: Sat, 11 Oct 2025 11:24:41 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Enable cache-aware load balancing only if at least 1 NUMA node has +more than one LLC. 
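A minimal sketch of that enabling condition, assuming a per-node LLC count is already known; llcs_in_node() and the fixed node count are illustrative stand-ins for the sched-domain walk performed in the topology.c hunk below:

#include <stdbool.h>
#include <stdio.h>

/* Made-up topology: node 0 has 4 LLCs, every other node has 1. */
static int llcs_in_node(int node)
{
	return node == 0 ? 4 : 1;
}

/* Cache-aware balancing is worthwhile only if some node spans multiple LLCs. */
static bool sched_cache_allowed(int nr_nodes)
{
	for (int node = 0; node < nr_nodes; node++)
		if (llcs_in_node(node) > 1)
			return true;
	return false;
}

int main(void)
{
	printf("enabled: %d\n", sched_cache_allowed(2));	/* prints 1 */
	return 0;
}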
+ +Suggested-by: Libo Chen +Suggested-by: Adam Li +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 15 ++++++++++++--- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 14 ++++++++++++-- + 3 files changed, 25 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cd080468ddc9..3d643449c48c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1208,6 +1208,14 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; + ++DEFINE_STATIC_KEY_FALSE(sched_cache_allowed); ++ ++static inline bool sched_cache_enabled(void) ++{ ++ return sched_feat(SCHED_CACHE) && ++ static_branch_likely(&sched_cache_allowed); ++} ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -1294,7 +1302,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_sched *pcpu_sched; + unsigned long epoch; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_cache_enabled()) + return; + + if (p->sched_class != &fair_sched_class) +@@ -1330,7 +1338,7 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_cache_enabled()) + return; + + if (!mm || !mm->pcpu_sched) +@@ -10716,7 +10724,8 @@ static void record_sg_llc_stats(struct lb_env *env, + struct sched_domain *sd = env->sd->child; + struct sched_domain_shared *sd_share; + +- if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ if (!sched_cache_enabled() || ++ env->idle == CPU_NEWLY_IDLE) + return; + + /* only care about sched domains spanning a LLC */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index a52c96064b36..60f1e51685ec 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2800,6 +2800,7 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern struct static_key_false sched_cache_allowed; + #endif + + #ifdef CONFIG_SCHED_HRTICK +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 6e2f54169e66..2675db980f70 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -2444,6 +2444,7 @@ static int + build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) + { + enum s_alloc alloc_state = sa_none; ++ bool has_multi_llcs = false; + struct sched_domain *sd; + struct s_data d; + struct rq *rq = NULL; +@@ -2530,10 +2531,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + * between LLCs and memory channels. 
+ */ + nr_llcs = sd->span_weight / child->span_weight; +- if (nr_llcs == 1) ++ if (nr_llcs == 1) { + imb = sd->span_weight >> 3; +- else ++ } else { + imb = nr_llcs; ++ has_multi_llcs = true; ++ } + imb = max(1U, imb); + sd->imb_numa_nr = imb; + +@@ -2581,6 +2584,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + ++#ifdef CONFIG_SCHED_CACHE ++ if (has_multi_llcs) { ++ static_branch_enable_cpuslocked(&sched_cache_allowed); ++ pr_info("Cache aware load balance enabled.\n"); ++ } ++#endif ++ + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch.skip new file mode 100644 index 0000000..cd2305a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch.skip @@ -0,0 +1,291 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8E929283153 + for ; Sat, 11 Oct 2025 18:18:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206706; cv=none; b=l9o+r3tPneRXt3UimsPhWTyfqr4rcCBrkqPagUsuj236psyVrtVREf1eV9bh9i5x6sqiX/93/2fGTQOd3tDyAfM2x8nQDBG2tniRFTa1AjKlI5Hs36x8WGu+npNUTYaShkti1wSxrqntJys6VhwZ+aL+o6PQ3k1GyXMU2JJL3bw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206706; c=relaxed/simple; + bh=KmODaaWe2UFjj11ibL17qZDBWmMYCsJpeBqEebS+qwU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=MDjhwzZYr3m7pwdhzj9TlyV526H5WJLBGHEilCqY27+WQSI1yxnPWT6k5Mm6bFKl/0I+sfGQBi/7HzzHe1S3ts6bk23EZaJB+w94GLEZKAcc8cSHQMDIbKKzGRMgBrwPnT0sZBkKxiooppSIJhtXCA86kWL70YWS1bZ1PVuSOI8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BzReY9Ll; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BzReY9Ll" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206705; x=1791742705; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=KmODaaWe2UFjj11ibL17qZDBWmMYCsJpeBqEebS+qwU=; + b=BzReY9LlEh9sk7OgZDcp2VjjY3mwnRzW5hp4d8rSX40TSJQm31n7pNsD + pGDX4pGNqIL2dKhB0TWBOakqdMqoEJBGhhFnbP0SML4ddRpmP22b3hhKk + 66OBjK6EOlIiBTx96elcU0fwjNnZqBKTvf/i3IuC2HlilzxwoimPLi7ym + OqUTRkCWmlqgJ5BjvtUEaD2eb97VkiEAs6iUC5FsMQPohIZRE0ZJGIQT2 + rLWb4YevoZUYtWiZQU/yYmcq5sU7eCp84d/YBPYTw8uDxW2au989TrB9t + olL4givIBdX+ieIJw7430Yz/Es1H+8Ji46MflznNqafshDKBuL8HbpSmx + A==; +X-CSE-ConnectionGUID: xTVpDyXiQYmCxiG8vc8uKg== +X-CSE-MsgGUID: 
ouYA76mXSo+MkfJ9ZAYryA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339693" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339693" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:23 -0700 +X-CSE-ConnectionGUID: Vda9/GgFQc2uyKt8dn0epA== +X-CSE-MsgGUID: 2SFdpXMCSGKC8Z5YqgWCow== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487198" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:23 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 05/19] sched/fair: Add LLC index mapping for CPUs +Date: Sat, 11 Oct 2025 11:24:42 -0700 +Message-Id: <7d75af576986cf447a171ce11f5e8a15a692e780.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce an index mapping between CPUs and their LLCs. This provides +a continuous per LLC index needed for cache-aware load balancing in +later patches. + +The existing per_cpu llc_id usually points to the first CPU of the +LLC domain, which is sparse and unsuitable as an array index. Using +llc_id directly would waste memory. + +With the new mapping, CPUs in the same LLC share a continuous index: + + per_cpu(llc_idx, CPU=0...15) = 0 + per_cpu(llc_idx, CPU=16...31) = 1 + per_cpu(llc_idx, CPU=32...47) = 2 + ... + +The maximum number of LLCs is limited by CONFIG_NR_LLCS. If the number +of LLCs available exceeds CONFIG_NR_LLCS, the cache aware load balance +is disabled. To further save memory, this array could be converted to +dynamic allocation in the future, or the LLC index could be made NUMA +node-wide. + +As mentioned by Adam, if there is no domain with SD_SHARE_LLC, the +function update_llc_idx() should not be invoked to update the index; +otherwise, it will generate an invalid index. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/threads.h | 10 +++++++++ + init/Kconfig | 9 ++++++++ + kernel/sched/fair.c | 11 ++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 47 +++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 79 insertions(+) + +diff --git a/include/linux/threads.h b/include/linux/threads.h +index 1674a471b0b4..2c9b1adfe024 100644 +--- a/include/linux/threads.h ++++ b/include/linux/threads.h +@@ -20,6 +20,16 @@ + /* Places which use this should consider cpumask_var_t. 
*/ + #define NR_CPUS CONFIG_NR_CPUS + ++#ifndef CONFIG_NR_LLCS ++#define CONFIG_NR_LLCS 1 ++#endif ++ ++#if CONFIG_NR_LLCS > NR_CPUS ++#define NR_LLCS NR_CPUS ++#else ++#define NR_LLCS CONFIG_NR_LLCS ++#endif ++ + #define MIN_THREADS_LEFT_FOR_ROOT 4 + + /* +diff --git a/init/Kconfig b/init/Kconfig +index 4e625db7920a..6e4c96ccdda0 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -981,6 +981,15 @@ config SCHED_CACHE + resources within the same cache domain, reducing cache misses and + lowering data access latency. + ++config NR_LLCS ++ int "Maximum number of Last Level Caches" ++ range 2 1024 ++ depends on SMP && SCHED_CACHE ++ default 64 ++ help ++ This allows you to specify the maximum number of last level caches ++ this kernel will support for cache aware scheduling. ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3d643449c48c..61c129bde8b6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1224,6 +1224,17 @@ static int llc_id(int cpu) + return per_cpu(sd_llc_id, cpu); + } + ++/* ++ * continuous LLC index, starting from 0. ++ */ ++static inline int llc_idx(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_idx, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 60f1e51685ec..b448ad6dc51d 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2039,6 +2039,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_llc_idx); + DECLARE_PER_CPU(int, sd_share_id); + DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -2047,6 +2048,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 2675db980f70..4bd033060f1d 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -659,6 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_llc_idx); + DEFINE_PER_CPU(int, sd_share_id); + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -668,6 +669,40 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++int max_llcs = -1; ++ ++static void update_llc_idx(int cpu) ++{ ++#ifdef CONFIG_SCHED_CACHE ++ int idx = -1, llc_id = -1; ++ ++ if (max_llcs > NR_LLCS) ++ return; ++ ++ llc_id = per_cpu(sd_llc_id, cpu); ++ idx = per_cpu(sd_llc_idx, llc_id); ++ ++ /* ++ * A new LLC is detected, increase the index ++ * by 1. 
++ */ ++ if (idx < 0) { ++ idx = max_llcs++; ++ ++ if (max_llcs > NR_LLCS) { ++ if (static_branch_unlikely(&sched_cache_allowed)) ++ static_branch_disable_cpuslocked(&sched_cache_allowed); ++ ++ pr_warn_once("CONFIG_NR_LLCS is too small, disable cache aware load balance\n"); ++ return; ++ } ++ ++ per_cpu(sd_llc_idx, llc_id) = idx; ++ } ++ per_cpu(sd_llc_idx, cpu) = idx; ++#endif ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -687,6 +722,10 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + ++ /* only update the llc index for domain with SD_SHARE_LLC */ ++ if (sd) ++ update_llc_idx(cpu); ++ + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); +@@ -2452,6 +2491,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++#ifdef CONFIG_SCHED_CACHE ++ if (max_llcs < 0) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_idx, i) = -1; ++ max_llcs = 0; ++ } ++#endif ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch.skip new file mode 100644 index 0000000..33e7efa --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch.skip @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id A93492836B1 + for ; Sat, 11 Oct 2025 18:18:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206706; cv=none; b=S6xTZtgG4gDit+VImk9W2UzS4qpXEGkcWHMUVoYyOSnpNNw4aucqYAXSSje8zYLjl3z3dX3Jt3ztt7bwcuxWrRrv6qxUGactOiUWUNrvSPN2VWKScV6w3ksMM6saX0NH5ZC3WBABiX0+fpwQlzvqkQFNz80/YqP8x3hbG8jBKng= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206706; c=relaxed/simple; + bh=9oov8ViGgsZaxAZzpTlsnaOcdJ/Jv8NLa7EsoSw2oPE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=JsV8TTkODWXWFKIKrzZGo3NxMw8hU5p/OWk4qVG3F1HoqgFqWBsu2TcQGUVWw1R9rnOAFP+1s9fHghtr+g8SHhcTCX8Srq+6rXX7gAPQLfCi2R3P+f6W+h6FG6DDQXFxrgsSAi265RFjsNyqSNVDyYiSw0j1kUou9k2jg/TFWas= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=maHNOTTa; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="maHNOTTa" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206705; x=1791742705; + h=from:to:cc:subject:date:message-id:in-reply-to: + 
references:mime-version:content-transfer-encoding; + bh=9oov8ViGgsZaxAZzpTlsnaOcdJ/Jv8NLa7EsoSw2oPE=; + b=maHNOTTaUom4zOfjF9aQgzk/EHInefpcQXQBpZ407o2A6QAh7rtx4d1V + uIUh04rGM6MxEKMGQGzPbEcwmEUVnQVNQXhq0m60vo8GIlq3nI3UFHh2/ + okHOmrxdhoN3uwbNZN5d2mGAMO3ADHunEGtbLYRsJ5ffyJXYwvK9ZYj6n + ZqWJDYCygmb5LDln/D3icLbLhH8Zm6QWr4yAgVZQ73wl/I3EgDdp+pIYb + aLimiW5HUOhIlD+krR4Rg02sINFyPrZ2h5VJdZ1v01hMqilwa2zgPVcWi + tEJ0OmQs9iwf0mBA0kNnJx5l2NSvLy+2FE84H8lwtH6U/4ySfKAnmdVGc + Q==; +X-CSE-ConnectionGUID: LhZ9XN5ESr6ORNd5zvY9sA== +X-CSE-MsgGUID: UBKHEBpdQNSkGD6fqT87jQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339711" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339711" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:24 -0700 +X-CSE-ConnectionGUID: M/4LVw/6Qg626wVKqENzqw== +X-CSE-MsgGUID: hqk2hnIER+q1aJ8R3vcczQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487203" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:24 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 06/19] sched/fair: Assign preferred LLC ID to processes +Date: Sat, 11 Oct 2025 11:24:43 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +With cache-aware scheduling enabled, each task is assigned a +preferred LLC ID. This allows quick identification of the LLC domain +where the task prefers to run, similar to numa_preferred_nid in +NUMA balancing. 
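[Editor's aside — illustrative sketch, not part of the quoted patch.] To make the preferred-LLC assignment concrete: the field starts at -1 and is refreshed from the process-wide hottest CPU recorded in mm_sched_cpu, as the account_mm_sched() hunk below does. The struct layout and llc_of() here are simplified assumptions for illustration:

/* Userspace model: derive a task's preferred LLC from its process's
 * hottest CPU, mirroring the account_mm_sched() change below. */
#include <stdio.h>

struct mm_sched { int mm_sched_cpu; };                 /* hottest CPU, -1 if none */
struct task     { struct mm_sched *mm; int preferred_llc; };

static int llc_of(int cpu)                             /* stand-in for per_cpu(sd_llc_id, cpu) */
{
        return cpu < 0 ? -1 : cpu / 8;                 /* assume 8 CPUs per LLC */
}

static void update_preferred_llc(struct task *p)
{
        int llc = llc_of(p->mm->mm_sched_cpu);

        if (p->preferred_llc != llc)
                p->preferred_llc = llc;
}

int main(void)
{
        struct mm_sched mm = { .mm_sched_cpu = 19 };
        struct task p = { .mm = &mm, .preferred_llc = -1 };

        update_preferred_llc(&p);
        printf("preferred LLC = %d\n", p.preferred_llc); /* 2 with 8 CPUs per LLC */
        return 0;
}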
+ +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 7 +++++++ + 3 files changed, 11 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d7ddb7ce6c4b..8a5e4038cd5c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1402,6 +1402,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index e557f622bd90..5fffbe766f57 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -188,6 +188,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 61c129bde8b6..d6167a029c47 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1312,6 +1312,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + if (!sched_cache_enabled()) + return; +@@ -1342,6 +1343,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } ++ ++ if (mm->mm_sched_cpu != -1) ++ mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch.skip new file mode 100644 index 0000000..f87fefd --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch.skip @@ -0,0 +1,257 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B7966283FE1 + for ; Sat, 11 Oct 2025 18:18:25 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206707; cv=none; b=Jt9YvY3nM/0EYBih4PVmiKQ2QzO4ZDLh2TKnGqMyWerCIfIM0CWceRhOpjM2iQwiUHzLszpycQZ+UQorhwMqEi3t7Erkuc8eVsgIO7guz2r8zCqiEsDc75hJulbNVOIh4Hf5WtkLCN2FDwtJ+pKaDQzjrmQsv/RTGx24LhvBhds= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206707; c=relaxed/simple; + bh=42DlMZ/oW4HLhFoIJCetdcfblbinqNDtbjQrvZGBme0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=PrDaFPl16+dUYVfNSWRpTD87yz4MK7/HdghB7ILX5xXggJN8vYLmcy4RQj7oE9weOCdcBzd1EZg476MST0VNTm2z3r/YGhIw0/+VWbtq1PKhfCTIEnPZWnJryrgw70ZRp0r4XDiQwz/h8bzHoZp9hMCEYHtSbHfUHW8eNSYr5z8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=GvsjlkoW; arc=none smtp.client-ip=198.175.65.17 
+Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="GvsjlkoW" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206706; x=1791742706; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=42DlMZ/oW4HLhFoIJCetdcfblbinqNDtbjQrvZGBme0=; + b=GvsjlkoWqX+zgP+tTee0MXcNRVBTPQkZKjOLBXZh33p44VICJNCiih6g + bdtLdnWwRkrJ2u2n2AVNyKIqQq+ELwCHQ1bUAIVe5B+Rq8F/WdKivkeVK + qCMdNHmRRRa8ijhdo6AEjjUZeHNS6/1dPU14KFq5zOdeXfuxJL5tGjlxb + ZtqhKFOWrFhhFPJwUw1KWb7C0rBkSGVoUeZH3ORagBu6Ud545g9bPF/M+ + p6sJSBNbnSNsdtDoZzzIKVmezgct+rLH0giyW0IcdjAUJlzYg6VsmVomk + Zm8UHf1s2hBr8fNdeC7UuXGFmty4d2atXckCM+YB8PsOqI0JwqlHCMSZ2 + A==; +X-CSE-ConnectionGUID: uKPzZGMbTiObyQydogOwGQ== +X-CSE-MsgGUID: QbxPW0yzQ4WA7VOf/APdAg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339729" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339729" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:25 -0700 +X-CSE-ConnectionGUID: GxY9AWlwTACW1S97eEsWGg== +X-CSE-MsgGUID: +oNXqS3kSkOTENG/ySm5FA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487208" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:25 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 07/19] sched/fair: Track LLC-preferred tasks per runqueue +Date: Sat, 11 Oct 2025 11:24:44 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +For each runqueue, track the number of tasks with an LLC preference +and how many of them are running on their preferred LLC. This mirrors +nr_numa_running and nr_preferred_running for NUMA balancing, and will +be used by cache-aware load balancing in later patches. 
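[Editor's aside — illustrative only, not part of the quoted patch.] A minimal model of the two per-runqueue counters this patch introduces, with simplified task/rq structures assumed for the sketch: nr_llc_running counts queued tasks that have any LLC preference, nr_pref_llc_running those already sitting in their preferred LLC.

/* Userspace model of the enqueue/dequeue accounting added below. */
#include <stdio.h>

struct rq   { int llc; unsigned int nr_llc_running, nr_pref_llc_running; };
struct task { int preferred_llc; };

static void account_llc_enqueue(struct rq *rq, struct task *p)
{
        rq->nr_llc_running      += (p->preferred_llc != -1);
        rq->nr_pref_llc_running += (p->preferred_llc == rq->llc);
}

static void account_llc_dequeue(struct rq *rq, struct task *p)
{
        rq->nr_llc_running      -= (p->preferred_llc != -1);
        rq->nr_pref_llc_running -= (p->preferred_llc == rq->llc);
}

int main(void)
{
        struct rq rq = { .llc = 1 };
        struct task a = { .preferred_llc = 1 }, b = { .preferred_llc = 3 };

        account_llc_enqueue(&rq, &a);
        account_llc_enqueue(&rq, &b);
        printf("llc=%u pref=%u\n", rq.nr_llc_running, rq.nr_pref_llc_running); /* 2 1 */
        account_llc_dequeue(&rq, &b);
        printf("llc=%u pref=%u\n", rq.nr_llc_running, rq.nr_pref_llc_running); /* 1 1 */
        return 0;
}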
+ +Signed-off-by: Tim Chen +--- + kernel/sched/core.c | 12 +++++++++++ + kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 +++++++ + 3 files changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 79d15e904d12..5940756e2da3 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -529,6 +529,18 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++#ifdef CONFIG_SMP ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++#else ++int task_llc(const struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d6167a029c47..fd315937c0cf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1235,6 +1235,24 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running += (p->preferred_llc != -1); ++ rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running -= (p->preferred_llc != -1); ++ rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1306,6 +1324,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); + } + ++static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { +@@ -1347,8 +1367,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) ++ /* task not on rq accounted later in account_entity_enqueue() */ ++ if (task_running_on_cpu(rq->cpu, p) && ++ p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1497,6 +1522,15 @@ void init_sched_mm(struct task_struct *p) + work->next = work; + } + ++void reset_llc_stats(struct rq *rq) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running = 0; ++ rq->nr_pref_llc_running = 0; ++} ++ + #else + + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, +@@ -1506,6 +1540,11 @@ void init_sched_mm(struct task_struct *p) { } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} ++ ++void reset_llc_stats(struct rq *rq) {} + #endif + + /* +@@ -3999,6 +4038,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); ++ account_llc_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } + cfs_rq->nr_queued++; +@@ -4010,9 +4050,14 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct 
sched_entity *se) + update_load_sub(&cfs_rq->load, se->load.weight); + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ account_llc_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + cfs_rq->nr_queued--; ++ ++ /* safeguard to clear the cache aware data */ ++ if (!parent_entity(se) && !cfs_rq->nr_queued) ++ reset_llc_stats(rq_of(cfs_rq)); + } + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index b448ad6dc51d..3ab64067acc6 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1098,6 +1098,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +@@ -1952,6 +1956,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++void reset_llc_stats(struct rq *rq); ++int task_llc(const struct task_struct *p); ++ + static inline void + queue_balance_callback(struct rq *rq, + struct balance_callback *head, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip new file mode 100644 index 0000000..18dc0f7 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip @@ -0,0 +1,194 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8D1D3284688 + for ; Sat, 11 Oct 2025 18:18:26 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206708; cv=none; b=W6A0Asy9e3NNDRL2ti9BvFY1go+vAlduaKJd1rmOWRr4k4IHRIEpHNJhix4g/v1mdJgDI06CWQ3sQC5YxuLOry9f66mT2W5iUkNoO1AMOa7iJYVMhxygC7dgS1riRk+Xr61GHZrfTq3glOqKoHqMJR1ChGEEIDFSijs9KJo91LU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206708; c=relaxed/simple; + bh=qUQDFYZ38LNpr9WfzaoIX7ySGKszvby265gkxQF4WK0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=uzr/dGrFdG1v5FdOJ/f9StnRIpzjJ5uOjWV+sYvWDeYE/dxtVTZG5FXWR8UqlK4jv7ZYYOlRDJRmdwLszrh1cbzNE43kw7ueGEnBAbSwzUyXo12aLw3ckNHZHHjqr9uTbTYz7GDrN3J5K862edN4cdJHoI9buyHUDzdCkXfIheE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MiTdX6Q6; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MiTdX6Q6" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206707; x=1791742707; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qUQDFYZ38LNpr9WfzaoIX7ySGKszvby265gkxQF4WK0=; + b=MiTdX6Q6R/zAjqSeS2bqz6JnSO+lVjbu/CGoRS4W48TnANXSK7FbeFq8 + HIHNTysTrwhHCzP1gtYr6N2x0eFio/feVeyFBD5UytM6ahWF0SC67agMj + jWOkCg+WyPpJSmb2V4GE3mePGb9vm7kjvgiTp1tcN15ClNGhVOTqusLqF + ueDZKLr7dTfEr95oP3PXRNzKFZfqVSGN5aLDywe826XmjT29nykVCoMh+ + U9I8MAfHqzZxWLRDx+EC8+DhJZRsWw9B7dXqvyz67FsBnLG+HHYrAB479 + +0mKNo9XBbRlGAtlUlqUTEvej+mP00q1dndiGmLH/nY7e+wci1WK/1VQo + g==; +X-CSE-ConnectionGUID: e2RK1jGJT9eTlAZZ8FMWJQ== +X-CSE-MsgGUID: se6P+xZrTfOL+/m4zXf2xg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339748" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339748" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:26 -0700 +X-CSE-ConnectionGUID: Lb/G/3cTR6W6ajd8OWjDtQ== +X-CSE-MsgGUID: f0zaj3jsRd+gLA/rNNvR9A== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487214" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:26 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 08/19] sched/fair: Introduce per runqueue task LLC preference counter +Date: Sat, 11 Oct 2025 11:24:45 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Each runqueue is assigned a static array where each element tracks +the number of tasks preferring a given LLC, indexed from 0 to +NR_LLCS. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3 (indexed from 0 to NR_LLCS + +The load balancer can use this information to identify busy runqueues +and migrate tasks to their preferred LLC domains. 
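[Editor's aside — illustrative only, not part of the quoted patch.] A compact model of the per-runqueue nr_pref_llc[] array described above, using a hypothetical NR_LLCS of 8: each element counts queued tasks whose preferred LLC maps to that index, and dequeue guards against the counter going negative, mirroring the hunk below.

/* Userspace model of the per-LLC preference counters added below. */
#include <stdio.h>

#define NR_LLCS 8

struct rq { unsigned int nr_pref_llc[NR_LLCS]; };

static void account_enqueue(struct rq *rq, int pref_llc_idx)
{
        if (pref_llc_idx >= 0 && pref_llc_idx < NR_LLCS)
                rq->nr_pref_llc[pref_llc_idx]++;
}

static void account_dequeue(struct rq *rq, int pref_llc_idx)
{
        if (pref_llc_idx >= 0 && pref_llc_idx < NR_LLCS &&
            rq->nr_pref_llc[pref_llc_idx] > 0)           /* avoid negative counter */
                rq->nr_pref_llc[pref_llc_idx]--;
}

int main(void)
{
        struct rq rq = { 0 };

        account_enqueue(&rq, 3);
        account_enqueue(&rq, 3);
        account_enqueue(&rq, 1);
        printf("tasks preferring LLC3: %u\n", rq.nr_pref_llc[3]); /* 2 */
        account_dequeue(&rq, 3);
        printf("tasks preferring LLC3: %u\n", rq.nr_pref_llc[3]); /* 1 */
        return 0;
}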
+ +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 35 +++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 1 + + 2 files changed, 36 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index fd315937c0cf..b7a68fe7601b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1235,22 +1235,51 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static inline int pref_llc_idx(struct task_struct *p) ++{ ++ return llc_idx(p->preferred_llc); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running += (p->preferred_llc != -1); + rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ ++rq->nr_pref_llc[pref_llc]; + } + + static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running -= (p->preferred_llc != -1); + rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ /* avoid negative counter */ ++ if (rq->nr_pref_llc[pref_llc] > 0) ++ --rq->nr_pref_llc[pref_llc]; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +@@ -1524,10 +1553,16 @@ void init_sched_mm(struct task_struct *p) + + void reset_llc_stats(struct rq *rq) + { ++ int i = 0; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running = 0; ++ ++ for (i = 0; i < max_llcs; ++i) ++ rq->nr_pref_llc[i] = 0; ++ + rq->nr_pref_llc_running = 0; + } + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 3ab64067acc6..b801d32d5fba 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1101,6 +1101,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int nr_pref_llc[NR_LLCS]; + #endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch.skip new file mode 100644 index 0000000..caf0c08 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch.skip @@ -0,0 +1,143 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 896A92848A1 + for ; Sat, 11 Oct 2025 18:18:27 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206709; cv=none; b=OEtiMJ0EXsYmk/b2RpkCvrola+Tb5ZlnJVLLgRLqGiICx7t2qJcij9yw0SgiiThPPPTMrbIdFBAm4w8howvUGPAJFc0ItOZDXO+gwbi0GCrU/MRny5Tre78B7YMgEyxZMXkI05Eu0+fODpObrBBk2c09F8OXQKZ4o5hgptBzDK8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206709; c=relaxed/simple; + bh=PKtlvEc8jJzYEmRgIquRSV3KaK94Gb12wRaccvthO/I=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + 
MIME-Version; b=J6bK9CIrnn+dpoeG8RJW1aH3SE1Yc7QYj7Dgh7cqTjdsd3fsWZdu3E2SAwDjyqT5ptCJzWnqjXDoxnW3sFv/aeRC7QnnQkB9bTzAgmfskcoHsp0hZI6c042fUlYpwgsk0j6PmWc4xM8hZNNktu5sqG8t6W1tVMFc+pGngTuF0j8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=n3R+hIU0; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="n3R+hIU0" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206708; x=1791742708; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=PKtlvEc8jJzYEmRgIquRSV3KaK94Gb12wRaccvthO/I=; + b=n3R+hIU0WDMCAOT74Si47T0DHUQFpP/mOPOr4EFjzfrMTg20mocMFVue + SPJYeD3u+HI/S8DzRBSopnypgjipAk03R2jKWcm5OSqY338iFWIhO44pH + Rkbh2OZ1rpYHNaif/qBdzoG/S0GRuxE4+p6SgnYPob1i1tRz5kFPtKtWI + Em/YtXT8s7M8i1lwEkDGhNlIAeWj5yl5FVsHoShyMoDnOs/ZKpz9fa1vH + yY+/JK9y5B5Rh8CVo9sz+iLl5gL/zxPW+ETtFRKayHPWInq1R4rGuUz8D + OVUSiTUoZeUSI+4YJPz+v9iatJmNEpwFlvZeVYR4+WsdGyv8IT5qlNl3i + g==; +X-CSE-ConnectionGUID: VcC/511LSz6QngP8mD/4Fw== +X-CSE-MsgGUID: cm5ykdK+Tza9czQo0iIcIQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339767" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339767" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:27 -0700 +X-CSE-ConnectionGUID: +fnFCaxeROy1X1/2M3UOCQ== +X-CSE-MsgGUID: cAIBkdx0SvqbyNLUptq1pw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487219" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:27 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 09/19] sched/fair: Count tasks prefering each LLC in a sched group +Date: Sat, 11 Oct 2025 11:24:46 -0700 +Message-Id: <00e5f2cb6eadc3738e33858d3c4563a0775ee1c0.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, tabulate the number of tasks on each runqueue +that prefer a given destination LLC in a sched group. + +For example, consider a system with 4 LLC sched groups (LLC0 to LLC3) +balancing towards LLC3. LLC0 has 3 tasks preferring LLC3, LLC1 has +2, and LLC2 has 1. LLC0, having the most tasks preferring LLC3, is +selected as the busiest source to pick tasks from. 
+ +Within a source LLC, the total number of tasks preferring a destination +LLC is computed by summing counts across all CPUs in that runqueue. For +instance, if LLC0 has CPU0 with 2 tasks and CPU1 with 1 task preferring +LLC3, the total for LLC0 is 3. + +These statistics allow the load balancer to choose tasks from source +sched groups that best match their preferred LLCs. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b7a68fe7601b..cbd1e97bca4b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10399,6 +10399,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc[NR_LLCS]; ++#endif + }; + + /* +@@ -10891,6 +10894,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled()) { ++ int j; ++ ++ for (j = 0; j < max_llcs; ++j) ++ sgs->nr_pref_llc[j] += rq->nr_pref_llc[j]; ++ } ++#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip new file mode 100644 index 0000000..4bcffad --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip @@ -0,0 +1,187 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id F13BA28505C + for ; Sat, 11 Oct 2025 18:18:28 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206711; cv=none; b=EhBerRhJhQXPW7xGyw0P5bxJnRZLdUKLIQ12NKKqVw4ZWFGkcALuZ8VykNWnycAafmMkb5kBWaZT15xr3ZuPia1hqPYipqCAVEd34Wn9NgZ7h0Lqr4/FQP1HOI9Yp9naliJ5jjs5uaj5L1/4fJBsGwV0wle3JatN24KLVnEBxK8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206711; c=relaxed/simple; + bh=QoFubbb8wiPjhz5y3pWF+17tV+P2bIxeqL8wpFi1nfk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=XF3a1nw/8EN0FU+PNi1yIJ/227PxHRBRy24uDZNEkqQuRuIG35Ap7GIvbGG+L1n9ZlEPV0A8eM5UvEqTGNXZktaeA+OJjX4avu9hw9uu6rqowoIWWNlLa6/0iuozmn5jhIZJJqDbWB7j1stg+x51fnwnSbNrDkb2H27S3usCnzQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Loa6o7d1; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Loa6o7d1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206709; 
x=1791742709; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=QoFubbb8wiPjhz5y3pWF+17tV+P2bIxeqL8wpFi1nfk=; + b=Loa6o7d1Mzs3ouslW83UWTdxmyggGuWTcpizCbNq+GcghqOrvTfXSRIV + 0EP9sedHVH3VdKCqAQHV/ZX3VHfUXCRKy9+NcdVchFLL8bKi/9buFRwhw + ZWmkcnGopsf975TA51MaL7sh2sNrOAvPuHmiA1plKNFBBesobcOlf5xbr + aZ9W/S+Mv3Ykf28JPDwOIYzvtKZi5pCgwvqz5wqJHrujBfUq//kuxX1xD + 44PevqjxkAnPNbnm/C3CdQgNXiNta5xW/ZKmACOzIkYXaOsL8kl9jvdQl + 4VJ6pV7RaGBpMqmBXGMhRqdKmN0HSByZ1kvmH46v45jRNYG2/U+7kgbrO + A==; +X-CSE-ConnectionGUID: 7OsmkTE2T2eIFyDjRKp/ig== +X-CSE-MsgGUID: oqLf97jbSIOB+8Rk4LLqqA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339788" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339788" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:28 -0700 +X-CSE-ConnectionGUID: jHLQbWxOTR2E4C2/k5j7Wg== +X-CSE-MsgGUID: sQhO8wOTQIuj4/5Og2eBgw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487222" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:27 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 10/19] sched/fair: Prioritize tasks preferring destination LLC during balancing +Date: Sat, 11 Oct 2025 11:24:47 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, first check for tasks that prefer the +destination LLC and balance them to it before others. + +Mark source sched groups containing tasks preferring non local LLCs +with the group_llc_balance flag. This ensures the load balancer later +pulls or pushes these tasks toward their preferred LLCs. 
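[Editor's aside — illustrative only, not part of the quoted patch series.] Combined with the per-group nr_pref_llc[] statistics from the preceding patches, the selection this enables can be modelled as: among candidate source groups, prefer the one holding the most tasks that want the destination LLC. The sketch below is a simplified stand-in under that assumption, not the kernel's actual sched-group statistics code:

/* Userspace model: pick the source group with the most tasks preferring
 * the destination LLC, the group that would be flagged for llc balance. */
#include <stdio.h>

#define NR_LLCS 4

struct sg_stats { unsigned int nr_pref_llc[NR_LLCS]; };

static int pick_busiest_for_llc(const struct sg_stats *sg, int nr_groups, int dst_llc)
{
        int best = -1;
        unsigned int best_cnt = 0;

        for (int i = 0; i < nr_groups; i++) {
                if (sg[i].nr_pref_llc[dst_llc] > best_cnt) {
                        best_cnt = sg[i].nr_pref_llc[dst_llc];
                        best = i;       /* group with most tasks wanting dst_llc */
                }
        }
        return best;
}

int main(void)
{
        /* groups LLC0..LLC2 as sources, balancing toward LLC3 */
        struct sg_stats groups[3] = {
                { .nr_pref_llc = { 0, 0, 0, 3 } },   /* LLC0: 3 tasks prefer LLC3 */
                { .nr_pref_llc = { 0, 0, 0, 2 } },   /* LLC1: 2 tasks prefer LLC3 */
                { .nr_pref_llc = { 0, 0, 0, 1 } },   /* LLC2: 1 task prefers LLC3 */
        };

        printf("busiest source group: LLC%d\n", pick_busiest_for_llc(groups, 3, 3));
        return 0;
}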
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 41 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cbd1e97bca4b..af7b578eaa06 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9822,8 +9822,7 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + else + return mig_unrestricted; + +- return can_migrate_llc(src_cpu, dst_cpu, +- task_util(p), to_pref); ++ return can_migrate_llc(src_cpu, dst_cpu, task_util(p), to_pref); + } + + #else +@@ -10394,6 +10393,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10849,11 +10849,45 @@ static void record_sg_llc_stats(struct lb_env *env, + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); + } ++ ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. ++ */ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain *child = env->sd->child; ++ int llc; ++ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ /* only care about task migration among LLCs */ ++ if (child && !(child->flags & SD_SHARE_LLC)) ++ return false; ++ ++ llc = llc_idx(env->dst_cpu); ++ if (sgs->nr_pref_llc[llc] > 0 && ++ can_migrate_llc(env->src_cpu, env->dst_cpu, 0, true) == mig_llc) ++ return true; ++ ++ return false; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + } ++ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} + #endif + + /** +@@ -10954,6 +10988,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + record_sg_llc_stats(env, sgs, group); ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (!local_group && llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch.skip new file mode 100644 index 0000000..ee39ef0 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch.skip @@ -0,0 +1,184 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by 
smtp.subspace.kernel.org (Postfix) with ESMTPS id 6E3802857E0 + for ; Sat, 11 Oct 2025 18:18:29 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206711; cv=none; b=t2IkYrrS4OEW0rLnZ4Ph2aLp/ob7UBcUobZQPFlHPmpcJEG5m0pUt/86mOssLKuYpjefjiUDrjFelfxhjAxq8hkNJqtOEMJPbTz+zzT3SsVZRdrqKE8v+5YoRbLqXRQPim2ll3DhWUtUyVjcOo+wuodh/CEa974mbGOLa7mTgCc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206711; c=relaxed/simple; + bh=XiIsNrTg0GfmfpcWJwni6hIdWkEEq9nbQ2y28gcjQcw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=CFlB5zhIcHUsbSOo/sD1pZdSFz7frR0zFFzgb5/20MqZiItU17WC0G8ifB7ANEAoWHl+sZ1UBTS2HXkckShm7SoSJJXvPBbw6XxQCBJK6yrElYIzS1CzXKAx7vBmkFFghPyfHOK4JpsmMAKYxqatpcWaHZwO7N1+tqHPYDwlFpo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Y9YkqrBb; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Y9YkqrBb" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206709; x=1791742709; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=XiIsNrTg0GfmfpcWJwni6hIdWkEEq9nbQ2y28gcjQcw=; + b=Y9YkqrBbsakXirsuA3GK7ppNmtxnJk2cm0iimpzRLvMdIlTwXGPf3Jxq + CO6EwYbc/Esxx5TDgaH0h7SVW6eQY5e38xqt9oEwqeMZQtQ13URaPfC2Q + Mwk/v0qwxo5jXbC8xa2O9JpbH1ZyVCsabZmLtbPS2e8WfQbQS4lgRoeof + RbwLkRXbWC69JnwGxh3aUM7ZF9q8ziMLuIK7nYhL3utheouiHtWkbs+nW + RBMmwNo592e9Wh6g7Ht+Vdc051U+njdgUo7aZRqY6DlKoIGZaJJSG2c0W + jAF73DWLcSoTQT2Ii9M9dPOTvOCcojIDgIVpILvlasXm0wG4u+s+OJFGn + Q==; +X-CSE-ConnectionGUID: bcFBDLOoTw6TYukUkbI3wQ== +X-CSE-MsgGUID: 0WEdTBqUR0WG7HuYHYySDg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339807" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339807" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:29 -0700 +X-CSE-ConnectionGUID: teKUgYrNS8ayzrTmALf01w== +X-CSE-MsgGUID: OBuR3uU9Q8qKO64uzC8h4Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487230" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:28 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 11/19] sched/fair: Identify busiest sched_group for LLC-aware load balancing +Date: Sat, 11 Oct 2025 11:24:48 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +The load balancer selects the busiest sched_group and migrates tasks +to less busy groups to distribute load across CPUs. + +With cache-aware scheduling enabled, the busiest sched_group is +the one with most tasks preferring the destination LLC. If +the group has the llc_balance flag set, cache aware load balancing is +triggered. + +Introduce the helper function update_llc_busiest() to identify the +sched_group with the most tasks preferring the destination LLC. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++++- + 1 file changed, 38 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index af7b578eaa06..8469ec528cb1 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10877,6 +10877,23 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ int idx; ++ ++ /* Only the candidate with llc_balance needs to be taken care of */ ++ if (!sgs->group_llc_balance) ++ return false; ++ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. ++ */ ++ idx = llc_idx(env->dst_cpu); ++ return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx]; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +@@ -10888,6 +10905,13 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + { + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + /** +@@ -11035,6 +11059,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, + sds->local_stat.group_type != group_has_spare)) + return false; + ++ /* deal with prefer LLC load balance, if failed, fall into normal load balance */ ++ if (update_llc_busiest(env, busiest, sgs)) ++ return true; ++ ++ /* ++ * If the busiest group has tasks with LLC preference, ++ * skip normal load balance. ++ */ ++ if (busiest->group_llc_balance) ++ return false; ++ + if (sgs->group_type > busiest->group_type) + return true; + +@@ -11942,9 +11977,11 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. ++ * Also do so if we can move some tasks that prefer the local LLC. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip new file mode 100644 index 0000000..e9edb7a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip @@ -0,0 +1,185 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 99E5F28642E + for ; Sat, 11 Oct 2025 18:18:30 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206712; cv=none; b=CcfwsAyp1OHHqY4mNPYPcN6bUrl09ci4+a/v8FtP9azgYQzfS6lmRwWajeweUonIlhrYSa3k3Uk+3iau8s00TJMHIq9pc69gZThbuJO24GmjHBtcGot6LsPzytIaUPaB8oNg5fj064BJxFXz948iENpfk/rfsglOKxpcJkX9wG0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206712; c=relaxed/simple; + bh=y1sB/ng56N+mvnxojmgS/eclR6zFHdcgY5tqVpcDUNQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ScMEWl2DOAQMR5u9bpXgwKEadirbrSNG1X0vBv1Qm5M7qzeQRW6zyzR/0wZ49Stn9ftQ28uc0NLCvRH6mwbydhKFD3kpg3JgxWk9NBUU+Qnt+t7g3WQ/pDx7wFSEDUiofgdlic68Cqje1J43vJo7n57s1boIMbDvvtchvPGoTXM= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=WEVJOxO1; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="WEVJOxO1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206711; x=1791742711; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=y1sB/ng56N+mvnxojmgS/eclR6zFHdcgY5tqVpcDUNQ=; + b=WEVJOxO1Uy4x+GEHukYgK7cjQhJ+ZPzArevJFx6r0uwjLvVHXCsCVf0d + U5oZ9qGbRNsQ961+swsJygnl0Xp69gaKKJFDcVvaKlw28OYtLWeCcKxy5 + 4DN0Azrktm8AXYGwp3idVSw3VynSmNbW2dqVmCfWn3Np2iYv1w7hTpRfb + SetW2PMNCXc4Fk5w1ve3GEJ9Bax25e3mUvpabN2XIbAEnlZu4rHyR3ovD + 1WzBrpK45tvGmB0FKRXCfsKbMFF1KdXCgjW4lAJ2KU2k2bhxv6SPWDjA8 + 0qVm8erW2mgP7HqJHVa71uZn8ehzzZAPeMVO4wyBDdQns/j8tkr67uAC6 + w==; +X-CSE-ConnectionGUID: osVAgR9XSEi43ydURnxquA== +X-CSE-MsgGUID: sgSrXMaOTSCJRnEynSu6Vg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339827" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339827" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:30 -0700 
+X-CSE-ConnectionGUID: U/XiMYdrQLyr4smIn6sKwQ== +X-CSE-MsgGUID: iE4re5OqR+eOwHWBOdmfKA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487233" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:29 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 12/19] sched/fair: Add migrate_llc_task migration type for cache-aware balancing +Date: Sat, 11 Oct 2025 11:24:49 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce a new migration type, migrate_llc_task, to support +cache-aware load balancing. + +After identifying the busiest sched_group (having the most tasks +preferring the destination LLC), mark migrations with this type. +During load balancing, each runqueue in the busiest sched_group is +examined, and the runqueue with the highest number of tasks preferring +the destination CPU is selected as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 8469ec528cb1..bec6354d7841 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9504,7 +9504,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10082,6 +10083,10 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11733,6 +11738,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12041,6 +12055,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12149,6 +12167,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_idx(env->dst_cpu); ++ if (!cpus_share_cache(env->dst_cpu, rq->cpu) && ++ busiest_pref_llc < rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + 
if (busiest_nr < nr_running) { + busiest_nr = nr_running; +@@ -12331,6 +12359,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip new file mode 100644 index 0000000..50e470a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip @@ -0,0 +1,208 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CE0E286D56 + for ; Sat, 11 Oct 2025 18:18:31 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206713; cv=none; b=GHTSZiD43H1BP9udGQWGRTSdycj0dFbwOFNYssvdtvgDyjDEnOhEZuZ3tF7d4Oxq4KjVh/REHJdk8e5qmA0nk91pFvjTrD7ew0sadW9X2+TjejBiKi+Z4u/nZlJeGc29rI3I01ytNZfNGLLusPB2P/4mVx6bLIuv9bhIea7/KOQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206713; c=relaxed/simple; + bh=4nb/OF/m6vG5cWGZuJFatpxTHqyXyCOJlbLckp109KU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Z43NaTPGAIlscL0L7fDhRwzngl1+8YayCbuXKnJJO/leht3IttqnVKWti2tJx4O3Ad4+Bxa7ijhsxQg7lysYNstcyC73l5FTr0P11m80kqmUiNRrC4pt99E80BCBIbFo2SatFJnTKT4Q1ux117UKVwuy6P9Rh922Z1naN6x4Wgc= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=JdkwbeJq; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="JdkwbeJq" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206711; x=1791742711; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=4nb/OF/m6vG5cWGZuJFatpxTHqyXyCOJlbLckp109KU=; + b=JdkwbeJqpvNLxxR/C5J1ZH6Sc5bkBzINB0NUowykgcoSMh+IrKTz9SEs + 3TI4U2WqUZ4fGfcXVpbX1N2vbaAfyQUv4dhr3bMb1WSUcBz4dSrMfVdBf + Gdlpc/LwIyV72Eyt8t+mfF176Y/vv2GuGHN9WuXsK8/fBvzDMB20NsZLB + QBg0I+M7oRSQsaiygrqnGBFHiCS3p2JbXoqghWgigPrv6u1iqo8HXxcYs + HtDa1JUkhRKqPvvWxmzbfQzJYS+Coi/HVD3eewtzP+ILLi56XMzOKLHfR + iZqHJ/1cq2a50rc7YQNpk4EmPQ7vkE0qnNCf9o39KpjsRQh5qnu3HCaul + A==; +X-CSE-ConnectionGUID: VRcX2cnOQSeMAY0e8g4K3w== +X-CSE-MsgGUID: SPoQqM3DQk6EyvXMnqQjmg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339847" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339847" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:31 -0700 
+X-CSE-ConnectionGUID: pKVZhrKMR8K6LBYqzMOqAA== +X-CSE-MsgGUID: CK8cGt1oRtCxjPN4nS/YdA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487238" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:30 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 13/19] sched/fair: Handle moving single tasks to/from their preferred LLC +Date: Sat, 11 Oct 2025 11:24:50 -0700 +Message-Id: <231864b303906a60491bbb9eb7b2e3f083bff248.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If the busiest runqueue has only one task, active balancing may be +invoked to move it. However, before migration, check whether the task +is running on its preferred LLC. + +Do not move a lone task to another LLC if it would move the task +away from its preferred LLC or cause excessive imbalance between LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 59 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index bec6354d7841..19ba9c1b9a63 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9826,12 +9826,53 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + return can_migrate_llc(src_cpu, dst_cpu, task_util(p), to_pref); + } + ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return false; ++ /* ++ * All tasks prefer to stay on their current CPU. ++ * Do not pull a task from its preferred CPU if: ++ * 1. It is the only task running there; OR ++ * 2. Migrating it away from its preferred LLC would violate ++ * the cache-aware scheduling policy. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { ++ unsigned long util = 0; ++ struct task_struct *cur; ++ ++ if (env->src_rq->nr_running <= 1) ++ return true; ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(env->src_rq->curr); ++ if (cur) ++ util = task_util(cur); ++ rcu_read_unlock(); ++ ++ if (can_migrate_llc(env->src_cpu, env->dst_cpu, ++ util, false) == mig_forbid) ++ return true; ++ } ++ ++ return false; ++} + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) + { + return false; + } ++ ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return false; ++} + #endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
+@@ -12247,6 +12288,9 @@ static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12266,7 +12310,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +@@ -12711,9 +12756,20 @@ static int active_load_balance_cpu_stop(void *data) + goto out_unlock; + + /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; ++ if (busiest_rq->nr_running <= 1) { ++#ifdef CONFIG_SCHED_CACHE ++ int llc = llc_idx(target_cpu); + ++ if (!sched_cache_enabled()) ++ goto out_unlock; ++ ++ if (llc < 0) ++ goto out_unlock; ++ /* don't migrate if no task prefers target */ ++ if (busiest_rq->nr_pref_llc[llc] < 1) ++#endif ++ goto out_unlock; ++ } + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip new file mode 100644 index 0000000..2839724 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip @@ -0,0 +1,201 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 417D42874EA + for ; Sat, 11 Oct 2025 18:18:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206714; cv=none; b=P5dnBcm/QdLKKHwOdHn/8WuPNdfAOl/PRiR2K2uOEI4cNFkN+3QA9gv1poGLydzEv/LcejqEay5DpC4q4pFVQXAYgNISmcWGnnkZt2WJ1RNwtLhNEUFXZhx40ubXDsBOhhphD04ToZpipNp3wabmP7EXcOk+GqqMg1ATyjn68eQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206714; c=relaxed/simple; + bh=7mAc5fCb/Yw4KmiNv5+1hXXuEie+xn7lqzFvEVfM5lI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=K4DMRiSFceKlJzje7FYPpzQtMciS8INZnGsYmfTeHw6oUtErbWyqEJzurxfkaj/0e2BYrqNZ34Rdy0dGMjqeQWLbOVlQosaArztC6x5+Kes0uifkkB7Pj+Ot9ll7+ydHo4UrJOvNc7oKS/beZOgPG9FPfh7UCSuuvvMEgE2IUTo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=H3CAEs3w; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="H3CAEs3w" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206712; x=1791742712; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=7mAc5fCb/Yw4KmiNv5+1hXXuEie+xn7lqzFvEVfM5lI=; + b=H3CAEs3wXo6bis/3Dkhtptw+Q7vtaAFDMqK8g5XXqpoTWnnoOviYRAT9 + w6Ikfty6wJNr1MlZJ1pp/FTRrzxJpmwm8JYX2yaBiDeoJDyx/agfVsZPY + MklgYKNASSHcEaoYoXP3gsqWfSwXldul6nD1Cye5tqr86XkWjK3gJK3C2 + XHWF6ABgRrpsZ6WaBAuzrKten6FRqGkbA1i+aWIRwXqoWsGPVsgAC8AT4 + v51P3tS4APRavdFpCNPn2xNzJPdUZAW7dgqXMB0AkpdRadIZ72DIu+BFu + J9oJpUAr+gFfhWThceV6xrW/Bi4Emncs3GIHURfaahEgiLmzNa/UX2/Km + w==; +X-CSE-ConnectionGUID: L4/6SpgURcKa2MOypuG0Tw== +X-CSE-MsgGUID: s8jp3cejRyWqoo8mO6QU6Q== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339866" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339866" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:32 -0700 +X-CSE-ConnectionGUID: IV2+5+btQ3GmLWn4UVfGIA== +X-CSE-MsgGUID: Ti8qIpzsSjywiCl630piRA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487243" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:31 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 14/19] sched/fair: Consider LLC preference when selecting tasks for load balancing +Date: Sat, 11 Oct 2025 11:24:51 -0700 +Message-Id: <26e7bfa88163e13ba1ebefbb54ecf5f42d84f884.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Currently, task selection from the busiest runqueue ignores LLC +preferences. Reorder tasks in the busiest queue to prioritize selection +as follows: + + 1. Tasks preferring the destination CPU's LLC + 2. Tasks with no LLC preference + 3. Tasks preferring an LLC different from their current one + 4. Tasks preferring the LLC they are currently on + +This improves the likelihood that tasks are migrated to their +preferred LLC. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 19ba9c1b9a63..0fafbfedb21d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10036,6 +10036,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. 
++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_cache_enabled()) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". +@@ -10044,7 +10106,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10063,6 +10125,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch.skip new file mode 100644 index 0000000..0a36e52 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch.skip @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 118912877EE + for ; Sat, 11 Oct 2025 18:18:33 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206714; cv=none; b=Sgvo8eIzN/unUNmW2/+OixP9udhyNkmi4AZEZzDVPWK1PLnNoYAhA0isU11HgcQC7ul1i5aP8jgG2uHE7Cy8Asrdz+Y08qynhym2Y4X0S+xgTgNOkVzp41IhyzMl092I4cMjY7ziOvFvK6idsHZ/FR3VwQydRvg8d5aWYp64rpE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206714; c=relaxed/simple; + bh=HpgwI4hDixtTD/XOc2H/Ob5dO6FbxoYzkh6tbeLAGPw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=TyYasSHPSqMZlN51+4bjWq8Z7cAg9IakiA1ZSJzbhlx8KJc6/UktRCAzZaEkZtQ3d+2B5EUSEDoefcCsbcoCPxFRSCAzN4VD9lBw94R0aIvRHbenlFVxgsvkmUCy9pzg5jZh5zHq/4CLUC+EDPmK622ZE8JNMYgUcZgPpxmosck= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Lw7L05el; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Lw7L05el" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206713; x=1791742713; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=HpgwI4hDixtTD/XOc2H/Ob5dO6FbxoYzkh6tbeLAGPw=; + b=Lw7L05elkbwdOCxozPfNxC8qRTe1i2iYshjZC2z6ZaIHRqDa3MmTXW5p + zHG6+auYcjgaRRcY16sdCyIbi7MCQxhd1rhIdaLh0bWrCs4ImE5P1VD8f + E+1GcTkJVgNbzLAR5f6+G7KZsA/sstlz5uIOTmFm5WpAXCY87MaYrAMAn + AO+uoYvLDh1ME4/gSK2T7C+P7K4lX/jQuif20ZGD72jW5wnQNob4g08JW + Z2MLtsd0WXxmCEXIKBfa0mtDIGY2FVs5/FvLd831/0grQYgT8vo1t80Kc + spuxB5OU6NgYwRfX7rKRRiLNfth6YUS68l+iwJeWbASwMAqE6PVWIEmJu + Q==; +X-CSE-ConnectionGUID: eDbtoCrOQHyIZtGmIsjSMQ== +X-CSE-MsgGUID: +ry6w/ChQZGrUwocr7gK9A== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339887" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339887" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:33 -0700 +X-CSE-ConnectionGUID: 1LsFjRblTkmkQu9Zwyc6pQ== +X-CSE-MsgGUID: 7olPURVrSrW53T9U5Kz7mw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487247" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:32 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 15/19] sched/fair: Respect LLC preference in task migration and detach +Date: Sat, 11 Oct 2025 11:24:52 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During the final step of load balancing, can_migrate_task() now +considers a task's LLC preference before moving it out of its +preferred LLC. + +Additionally, add checks in detach_tasks() to prevent selecting tasks +that prefer their current LLC. 
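+
+As a rough illustration (a commented extract of the two hunks below, not
+additional kernel code): migration is vetoed when the cache-aware policy
+forbids it, and detach_tasks() stops once only "stay here" tasks remain
+at the tail of the ordered list:
+
+    /* in can_migrate_task() */
+    if (sched_cache_enabled() &&
+        can_migrate_llc_task(env->src_cpu, env->dst_cpu, p) == mig_forbid)
+        return 0;                       /* do not move this task */
+
+    /* in detach_tasks(), after at least one task was detached */
+    if (sched_cache_enabled() && detached && p->preferred_llc != -1 &&
+        llc_id(env->src_cpu) == p->preferred_llc)
+        break;                          /* the rest prefer their current LLC */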
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0fafbfedb21d..65ff7c306a2f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9801,8 +9801,8 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + * Check if task p can migrate from src_cpu to dst_cpu + * in terms of cache aware load balance. + */ +-static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, +- struct task_struct *p) ++static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) + { + struct mm_struct *mm; + bool to_pref; +@@ -9969,6 +9969,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled() && ++ can_migrate_llc_task(env->src_cpu, env->dst_cpu, p) == mig_forbid) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10227,6 +10233,20 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + break; + ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Don't detach more tasks if the remaining tasks want ++ * to stay. We know the remaining tasks all prefer the ++ * current LLC, because after order_tasks_by_llc(), the ++ * tasks that prefer the current LLC are at the tail of ++ * the list. The inhibition of detachment is to avoid too ++ * many tasks being migrated out of the preferred LLC. ++ */ ++ if (sched_cache_enabled() && detached && p->preferred_llc != -1 && ++ llc_id(env->src_cpu) == p->preferred_llc) ++ break; ++#endif ++ + continue; + next: + if (p->sched_task_hot) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch.skip new file mode 100644 index 0000000..88914b1 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch.skip @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 16F11288C02 + for ; Sat, 11 Oct 2025 18:18:34 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206715; cv=none; b=msFA8TC41v9oEIuXxPkwmaUs9Guya5oz4k0g+kGWjFkx5t6zbq1fE/hqkiyOdPEhHS8cUTNX+aARYrbMu+YFzDRmUGhKnyOYkbiJD/UnEPwa2emEYG8RrqlU6lMxzm4wiDBJLxqnLLfKGSPXyWwXrM560Mia1tgl6K9uKsnEgFE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206715; c=relaxed/simple; + bh=Tofl2LDuzdO5QbpLjDZ3W55iV9tdiYron5fWReifyPw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=pIYwSq6151qmo6KEbEr6KofmYMtBvZvl9VphDwsqPX3hTLP897hu66I6LFuek1xE2EdzY5hJ64po/YPEKcNn99hwknIHDQx8uamJBxPh8I2WV7/JQ8MBTxUclp3YSgTWiAJSRjNR9EBM7PkdUJqtsU69m11ei/HsbibGYzaOOwk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) 
header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=TQVK1fUD; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="TQVK1fUD" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206714; x=1791742714; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Tofl2LDuzdO5QbpLjDZ3W55iV9tdiYron5fWReifyPw=; + b=TQVK1fUDtuFQmuxj0h/H6B3W/u2cJ2GkGOiUH7Lt/dRtHWxu09UqD683 + GE9GznGGwwF/Ima7vRS1ctHwsI6Xpw4SijdVGn66soleS5/ydNjcGaSKg + ygudPZpTfNaQrBfM0sFvdqPmdg50LMShstL+8pxYWf160UzvXjzOECyon + VuIxmxxlfPMnN2wMIOyjbQiDBL/LsnnHbGArR4IFK3zGWts6KMkvPzkiR + EwWOPnHMmqriXFYLM8wcDjSverDfcRP6MlQsXXusYG7bdxJhhuwymEiBB + InFNxWr5/xEksEDfouM5jLx/TVwLUkF4o8vAQ8HbkYgDi57JrvvbuA4Mr + g==; +X-CSE-ConnectionGUID: dN0cE9kLQ3yeKYNXwwT83A== +X-CSE-MsgGUID: KDX51V55RvaEAIpyi8Kcxg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339905" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339905" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:34 -0700 +X-CSE-ConnectionGUID: SHt2rwkJR6+JML7EmRAXVw== +X-CSE-MsgGUID: 7457bVysSBes9Wezrb15EQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487250" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:33 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 16/19] sched/fair: Exclude processes with many threads from cache-aware scheduling +Date: Sat, 11 Oct 2025 11:24:53 -0700 +Message-Id: <637cdb8ab11b1b978d697ed744cc402d32443ecc.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +A performance regression was observed by Prateek when running hackbench +with many threads per process (high fd count). To avoid this, processes +with a large number of active threads are excluded from cache-aware +scheduling. + +With sched_cache enabled, record the number of active threads in each +process during the periodic task_cache_work(). While iterating over +CPUs, if the currently running task belongs to the same process as the +task that launched task_cache_work(), increment the active thread count. + +If the count exceeds the number of CPUs in the process's preferred LLC, +sched_cache will avoid aggregating too many threads into a single LLC +domain. 
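+
+As a rough illustration (a commented extract of the hunk below, with no
+new kernel code), the count is taken while task_cache_work() scans the
+candidate CPUs:
+
+    rcu_read_lock();
+    cur = rcu_dereference(cpu_rq(i)->curr);      /* task running on CPU i */
+    if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+        cur->mm == mm)                           /* same process as caller */
+        nr_running++;
+    rcu_read_unlock();
+    ...
+    update_avg(&mm->nr_running_avg, nr_running); /* smoothed per-mm count */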
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/mm_types.h | 1 + + kernel/sched/fair.c | 14 ++++++++++++-- + 2 files changed, 13 insertions(+), 2 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 3ca557c2f36d..b307f81b2fde 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1031,6 +1031,7 @@ struct mm_struct { + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; ++ u64 nr_running_avg ____cacheline_aligned_in_smp; + #endif + + #ifdef CONFIG_MMU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 65ff7c306a2f..79d109f8a09f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1451,12 +1451,13 @@ static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, + + static void __no_profile task_cache_work(struct callback_head *work) + { +- struct task_struct *p = current; ++ struct task_struct *p = current, *cur; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; + int cpu, m_a_cpu = -1, cache_cpu, +- pref_nid = NUMA_NO_NODE, curr_cpu; ++ pref_nid = NUMA_NO_NODE, curr_cpu, ++ nr_running = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1497,6 +1498,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_occ = occ; + m_cpu = i; + } ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(cpu_rq(i)->curr); ++ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && ++ cur->mm == mm) ++ nr_running++; ++ rcu_read_unlock(); ++ + } + + /* +@@ -1540,6 +1549,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + mm->mm_sched_cpu = m_a_cpu; + } + ++ update_avg(&mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip new file mode 100644 index 0000000..0bb796c --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip @@ -0,0 +1,170 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CBF228B400 + for ; Sat, 11 Oct 2025 18:18:35 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206717; cv=none; b=YyEz/CWTR29mSwIUaPFMfMePzkOh+JM5Sy6daDO5bi2qr7vVNV19xi6LQHHFuh3wAPmGhaJZO0psSS/hmmAhEm9YYTN/Jgc2pWxCyI+xWhQCLC7I/PnTVjCiCQif4wqMsrxoWCBWSb2OUxPbQQvBrskdsdNoyUkJX7OfjisrPEo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206717; c=relaxed/simple; + bh=1ZlncHncPiFtSdZrBk62iQ7LoAdWu/umRn9XHDFyiec=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=cDrry+jPMrDILm/r9QUVZNGIrsE561nMMRjz9ay5n5LBA0g4KQ5jFwtQhbKMvroO4a5axJHedJTHbl6aSfvc0uCnQwzJq+eaxxOqXVEOWsoi3zdhUNBrxg97Vqp+GrazIyVFmuyXj145vhjyv4Ug8nfP5dYxkUNSPkfjany2j50= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; 
spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=lrCuBiww; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="lrCuBiww" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206715; x=1791742715; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=1ZlncHncPiFtSdZrBk62iQ7LoAdWu/umRn9XHDFyiec=; + b=lrCuBiwwXTTaUUesVoUKShmqNypNMcjFctaFnNlL8Jy17kFhV1UkeZza + ZuX0GXcNA+d1mgjVrCdwx7TgVROgGBNK4U8k00nbzT6TvTcewZUk7QGtM + ze+FjZ8AcXNEy5AhOAJw/Pg8vbtTnZ1loNcqp57iteVrKQqHWUMDyfSYU + 8P+nCqWidGuZDOqQcaEjQH4wD2Jn2+QsEcLHNMZnZLw6R3C8jci7hl1aG + MGxs8mPuw6pSR4ah1MI8YVoYS5wwLulLaJK/V5D02tGg7pdRILUMNtqsB + x0389trQkin/UccLwrCAMIGVL3znx7/2JW/py3nOY6EKojcOWTOyEIt0N + Q==; +X-CSE-ConnectionGUID: WfwYlMtNQVe279pYYOUBnA== +X-CSE-MsgGUID: AjSkDrsURkOZNf5ZbyXbNQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339923" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339923" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:35 -0700 +X-CSE-ConnectionGUID: ezHUeA30SCiDTeB7wo76Nw== +X-CSE-MsgGUID: YeYwMr00ThmPUWDQc0+YAw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487255" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:34 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 17/19] sched/fair: Disable cache aware scheduling for processes with high thread counts +Date: Sat, 11 Oct 2025 11:24:54 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +If the number of active threads within the process +exceeds the number of Cores(divided by SMTs number) +in the LLC, do not enable cache-aware scheduling. +This is because there is a risk of cache contention +within the preferred LLC when too many threads are +present. 
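+
+As a rough illustration (this mirrors the exceed_llc_nr() helper in the
+hunk below; the figures are an example, not taken from the patch):
+
+    smt_nr = 1;
+    if (sched_smt_active())
+        smt_nr = cpumask_weight(cpu_smt_mask(cpu));   /* e.g. 2 on SMT2 */
+
+    /* e.g. an LLC with 16 CPUs and SMT2: more than 8 active threads on
+     * average excludes the process from cache-aware aggregation */
+    exceeds = (mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu);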
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 27 +++++++++++++++++++++++++-- + 1 file changed, 25 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 79d109f8a09f..6b8eace79eee 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1240,6 +1240,18 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_nr(struct mm_struct *mm, int cpu) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = cpumask_weight(cpu_smt_mask(cpu)); ++#endif ++ ++ return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { + int pref_llc; +@@ -1385,10 +1397,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + + /* + * If this task hasn't hit task_cache_work() for a while, or it +- * has only 1 thread, invalidate its preferred state. ++ * has only 1 thread, or has too many active threads, invalidate ++ * its preferred state. + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || +- get_nr_threads(p) <= 1) { ++ get_nr_threads(p) <= 1 || ++ exceed_llc_nr(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1467,6 +1481,11 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + ++ if (get_nr_threads(p) <= 1) { ++ mm->mm_sched_cpu = -1; ++ return; ++ } ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9826,6 +9845,10 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + ++ /* skip cache aware load balance for single/too many threads */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ return mig_unrestricted; ++ + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip new file mode 100644 index 0000000..b614ebc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip @@ -0,0 +1,246 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id EC76C28C03B + for ; Sat, 11 Oct 2025 18:18:35 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206717; cv=none; b=Gsl1htdC3Y7gJ6c3ywcidI/bSse8yUz6irs7/iI8KWV8rK5Ae95mMS6V4kE386ZpRZ64YVuSevPlw/gCCcGexlKVEsnpJGvjAMVnB6E3r26Sb5PQDcAwlJhgczIF0vnORN//ryXKWaGJdpyTLOi1a78IAJp76Mm0Cc1+XjF2rGQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206717; c=relaxed/simple; + bh=RPMOV8sl+NtxUoril7y0k9+l4VyTXFWW/dE0ALAKhro=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=hT0pK7n3dH+PZ5LGb1wwP8mkt2A7mUf1PCIeydCbZfOqNSbSKOwNGkxWRp3xr4aPGGtMx1eK61Xyt7h2YGrFfvdSUCRdLGNS2BunlIUuq8SqGdxHIK829DTsOGKBUbEPWJzj/d6E4FC8xaBfUuz6ugBEq47VdX8vEtuc1XwNFis= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=eyspbvXX; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="eyspbvXX" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206716; x=1791742716; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=RPMOV8sl+NtxUoril7y0k9+l4VyTXFWW/dE0ALAKhro=; + b=eyspbvXX6JZaLuPx9mP9k7AsJvdPNK3nA7Eu1n1ZjnjSeOqzlt2GEvCx + IIbDfmBwRBwDACT7YDm/5WXc6cuJLsO02ejx9sBoouGuZkUHl1/nB7J2O + i/e0/jcb0J2buciIQ3OvuzUhegT0ZaiQoJUm0tinSNJAyHv/2LoJKLT6E + 1wncP9sm103omUQyz2nIdzytwxhPLCdaTXt3R4jfGDM0HbNy1TRA5Ex3O + eiDpNNIsPslVI7J8r5viBVFuJFJIfp1atbqNY5xQ3zDqGyLEqF5FJMEHK + BGBjTx2SYuiM3sv4eOtztesROh9S4vRoc6wieYXXgBwOgrHLMjZB8S3CI + A==; +X-CSE-ConnectionGUID: 15+3n+5PQLG8KotmRvuIMw== +X-CSE-MsgGUID: Dj1GwDBDRtWs7ASTeti8MA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339940" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339940" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:35 -0700 +X-CSE-ConnectionGUID: O+LhKbX0QNyBYwHUAp0ttw== +X-CSE-MsgGUID: PfPvzLkATc2Ca+B9H6Dwng== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487259" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:35 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 18/19] sched/fair: Avoid cache-aware scheduling for memory-heavy processes +Date: Sat, 11 Oct 2025 11:24:55 -0700 +Message-Id: <00da49fd590b95baad0525660bda4c0ba178243d.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Prateek and Tingyin reported that memory-intensive workloads (such as +stream) can saturate memory bandwidth and caches on the preferred LLC +when sched_cache aggregates too many threads. + +To mitigate this, estimate a process's memory footprint by comparing +its RSS (anonymous and shared pages) to the size of the LLC. If RSS +exceeds the LLC size, skip cache-aware scheduling. 
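+
+As a rough illustration (this mirrors the exceed_llc_capacity() helper
+added below; the 32MB LLC size is only an example):
+
+    rss = get_mm_counter(mm, MM_ANONPAGES) +
+          get_mm_counter(mm, MM_SHMEMPAGES);      /* resident pages */
+
+    /* e.g. a 32MB L3: rss * PAGE_SIZE above 32MB disables aggregation */
+    exceeds = (ci->size <= rss * PAGE_SIZE);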
+ +Note that RSS is only an approximation of the memory footprint. +By default, the comparison is strict, but a later patch will allow +users to provide a hint to adjust this threshold. + +According to the test from Adam, some systems do not have shared L3 +but with shared L2 as clusters. In this case, the L2 becomes the LLC[1]. + +Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@os.amperecomputing.com/ + +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + include/linux/cacheinfo.h | 21 ++++++++++------ + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++--- + 2 files changed, 61 insertions(+), 11 deletions(-) + +diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h +index c8f4f0a0b874..82d0d59ca0e1 100644 +--- a/include/linux/cacheinfo.h ++++ b/include/linux/cacheinfo.h +@@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu, + + const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); + +-/* +- * Get the cacheinfo structure for the cache associated with @cpu at +- * level @level. +- * cpuhp lock must be held. +- */ +-static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int level) + { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + +- lockdep_assert_cpus_held(); +- + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) { + if (ci->info_list[i].attributes & CACHE_ID) +@@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) + return NULL; + } + ++/* ++ * Get the cacheinfo structure for the cache associated with @cpu at ++ * level @level. ++ * cpuhp lock must be held. ++ */ ++static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++{ ++ lockdep_assert_cpus_held(); ++ ++ return _get_cpu_cacheinfo_level(cpu, level); ++} ++ + /* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6b8eace79eee..46dfcd2a01b3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1240,6 +1240,38 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) ++{ ++ struct cacheinfo *ci; ++ unsigned long rss; ++ unsigned int llc; ++ ++ /* ++ * get_cpu_cacheinfo_level() can not be used ++ * because it requires the cpu_hotplug_lock ++ * to be held. Use _get_cpu_cacheinfo_level() ++ * directly because the 'cpu' can not be ++ * offlined at the moment. ++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 3); ++ if (!ci) { ++ /* ++ * On system without L3 but with shared L2, ++ * L2 becomes the LLC. 
++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 2); ++ if (!ci) ++ return true; ++ } ++ ++ llc = ci->size; ++ ++ rss = get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ ++ return (llc <= (rss * PAGE_SIZE)); ++} ++ + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { + int smt_nr = 1; +@@ -1402,7 +1434,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + get_nr_threads(p) <= 1 || +- exceed_llc_nr(mm, cpu_of(rq))) { ++ exceed_llc_nr(mm, cpu_of(rq)) || ++ exceed_llc_capacity(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1486,6 +1519,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + return; + } + ++ /* ++ * Do not check exceed_llc_nr() because ++ * the active number of threads needs to ++ * been updated anyway. ++ */ ++ if (exceed_llc_capacity(mm, curr_cpu)) ++ return; ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9845,8 +9886,12 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + +- /* skip cache aware load balance for single/too many threads */ +- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ /* ++ * skip cache aware load balance for single/too many threads ++ * or large footprint. ++ */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) || ++ exceed_llc_capacity(mm, dst_cpu)) + return mig_unrestricted; + + if (cpus_share_cache(dst_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch.skip new file mode 100644 index 0000000..893d5f6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch.skip @@ -0,0 +1,366 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 18E8128D850 + for ; Sat, 11 Oct 2025 18:18:37 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206719; cv=none; b=Xx1TJtOzMlihMYBSPUxuHxJ0Qjx1gDS60TVsBbaW2YAWG207+fLDuebhtY/m9byeKfuUMx/7RVc7mR4xE94pKemXSaF1s6z/Ug1MSbyJDL/f+gYUVN9JWyZVsl4nskC5I36GvI9Reswdcqif7FIqp4+OT03g4Ursen0Zl0KoJs4= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206719; c=relaxed/simple; + bh=6DqahwvJ4pGTYe3R6NSaO1UYdqKyQR0MiqAECtQawPE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=TqNPDsqjikNan+1NtjFEbAg77jx9c3inhDW4V8l0uRiJhbQOXCuc9b1G6bYocgAvzvRSIQ0C9pHEOzGrnitQnTKHR4lM01jV+sq5AGE2Z0YUwNbJ3G2iOFzcz198JhG1QAmKUE7Vocf7AQigiloGd31ZcAGpFcHlx+XOPevHRzQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=iOR0vW8+; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="iOR0vW8+" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206717; x=1791742717; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=6DqahwvJ4pGTYe3R6NSaO1UYdqKyQR0MiqAECtQawPE=; + b=iOR0vW8+BW1BG+CuQKpeekNgIJXVik0HqP3JsArGSk608O/BAqQp2/2V + NevdC5FBoGU0UJqaEBq3eyHXjM8fq6f/t4e0BsD23dpBBveuXe++OVX8Y + Aapb+EWCp+mFsFeSqc6EHn1EKVQFE1axOMUnDuAWrAcUGMdrmUl0Sqt8l + gPm1isDiRNA4VWnGAtuiefQtTbQsCK7LA3hCWV2kYbD78VwasjvY/a8Zs + eIWoDg9eon7/Ajv/YxTCU8u2KHeYWmlazBkEjZ2+x2uGykUr+ha3ebndP + Ilvnp7dapSvlsm6l5tNbjmODs4GBS1SErTGbDlGwNscJODVWeB1whKGtb + g==; +X-CSE-ConnectionGUID: iwkdIGQ9QpepiaCCmITr2A== +X-CSE-MsgGUID: vpqcAnIxSGm05xalZwxCuA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339958" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339958" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:36 -0700 +X-CSE-ConnectionGUID: l0yVaxC3RhO6SKkG+8NgJA== +X-CSE-MsgGUID: KHjGlLwMQh2OAr5o5sZaPw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487263" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:36 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 19/19] sched/fair: Add user control to adjust the tolerance of cache-aware scheduling +Date: Sat, 11 Oct 2025 11:24:56 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +With sched_cache enabled, the scheduler uses a process's RSS as a +proxy for its LLC footprint to determine if aggregating tasks on the +preferred LLC could cause cache contention. If RSS exceeds the LLC +size, aggregation is skipped. Some workloads with large RSS but small +actual memory footprints may still benefit from aggregation. Since +the kernel cannot efficiently track per-task cache usage (resctrl is +user-space only), userspace can provide a more accurate hint. + +Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let +users control how strictly RSS limits aggregation. Values range from +0 to 100: + + - 0: Cache-aware scheduling is disabled. + - 1: Strict; tasks with RSS larger than LLC size are skipped. + - 100: Aggressive; tasks are aggregated regardless of RSS. + +For example, with a 32MB L3 cache: + + - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped. 
+ - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped + (784GB = (1 + (99 - 1) * 256) * 32MB). + +Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls +how strictly the number of active threads is considered when doing +cache aware load balance. The number of SMTs is also considered. +High SMT counts reduce the aggregation capacity, preventing excessive +task aggregation on SMT-heavy systems like Power10/Power11. + +For example, with 8 Cores/16 CPUs in a L3: + + - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped. + - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped + 785 = (1 + (99 - 1) * 8). + +Reported-by: K Prateek Nayak +Reported-by: Madadi Vineeth Reddy +Reported-by: Shrikanth Hegde +Reported-by: Tingyin Duan +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 56 ++++++++++++++++++++++++++++++-- + kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++++++++++---- + kernel/sched/sched.h | 3 ++ + 3 files changed, 126 insertions(+), 9 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 57bb04ebbf96..cfcd8b436cc5 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -169,6 +169,50 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_CACHE ++#define SCHED_CACHE_CREATE_CONTROL(name) \ ++static ssize_t sched_cache_write_##name(struct file *filp, \ ++ const char __user *ubuf, \ ++ size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int percent; \ ++ if (cnt > 15) \ ++ cnt = 15; \ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++ if (kstrtouint(buf, 10, &percent)) \ ++ return -EINVAL; \ ++ if (percent > 100) \ ++ return -EINVAL; \ ++ llc_##name = percent; \ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++static int sched_cache_show_##name(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", llc_##name); \ ++ return 0; \ ++} \ ++static int sched_cache_open_##name(struct inode *inode, \ ++ struct file *filp) \ ++{ \ ++ return single_open(filp, sched_cache_show_##name, NULL); \ ++} \ ++static const struct file_operations sched_cache_fops_##name = { \ ++ .open = sched_cache_open_##name, \ ++ .write = sched_cache_write_##name, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++ ++SCHED_CACHE_CREATE_CONTROL(overload_pct); ++SCHED_CACHE_CREATE_CONTROL(imb_pct); ++SCHED_CACHE_CREATE_CONTROL(aggr_tolerance); ++#endif /* SCHED_CACHE */ ++ + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -524,8 +568,16 @@ static __init int sched_init_debug(void) + #endif /* CONFIG_NUMA_BALANCING */ + + #ifdef CONFIG_SCHED_CACHE +- debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct); +- debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct); ++ debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_overload_pct); ++ debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_imb_pct); ++ debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_aggr_tolerance); ++ debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched, ++ &llc_epoch_period); ++ debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched, ++ &llc_epoch_affinity_timeout); + #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, 
&sched_debug_fops); + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 46dfcd2a01b3..f9084e2f9ef2 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1207,9 +1207,62 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; ++__read_mostly unsigned int llc_aggr_tolerance = 1; ++__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; ++__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; + + DEFINE_STATIC_KEY_FALSE(sched_cache_allowed); + ++static inline int get_sched_cache_scale(int mul) ++{ ++ if (!llc_aggr_tolerance) ++ return 0; ++ ++ if (llc_aggr_tolerance == 100) ++ return INT_MAX; ++ ++ return (1 + (llc_aggr_tolerance - 1) * mul); ++} ++ ++static inline int get_sched_cache_rss_scale(void) ++{ ++ /* ++ * Suppose the L3 size is 32MB. If the ++ * llc_aggr_tolerance is 1: ++ * When the RSS is larger than 32MB, ++ * the process is regarded as exceeding ++ * the LLC capacity. If the ++ * llc_aggr_tolerance is 99: ++ * When the RSS is larger than 784GB, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 784GB = (1 + (99 - 1) * 256) * 32MB ++ */ ++ return get_sched_cache_scale(256); ++} ++ ++static inline int get_sched_cache_nr_scale(void) ++{ ++ /* ++ * Suppose the number of Cores in LLC is 8. ++ * Every core has 2 SMTs. ++ * If the llc_aggr_tolerance is 1: When the ++ * nr_running is larger than 8, the process ++ * is regarded as exceeding the LLC capacity. ++ * If the llc_aggr_tolerance is 99: ++ * When the nr_running is larger than 785, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 785 = 1 + (99 - 1) * 8 ++ */ ++ return get_sched_cache_scale(1); ++} ++ ++static inline int get_sched_cache_cap_scale(void) ++{ ++ return (llc_overload_pct / cpu_smt_num_threads); ++} ++ + static inline bool sched_cache_enabled(void) + { + return sched_feat(SCHED_CACHE) && +@@ -1245,6 +1298,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + struct cacheinfo *ci; + unsigned long rss; + unsigned int llc; ++ int scale; + + /* + * get_cpu_cacheinfo_level() can not be used +@@ -1269,19 +1323,27 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + rss = get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + +- return (llc <= (rss * PAGE_SIZE)); ++ scale = get_sched_cache_rss_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((llc * scale) <= (rss * PAGE_SIZE)); + } + + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { +- int smt_nr = 1; ++ int smt_nr = 1, scale; + + #ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) + smt_nr = cpumask_weight(cpu_smt_mask(cpu)); + #endif + +- return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++ scale = get_sched_cache_nr_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu))); + } + + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +@@ -1370,9 +1432,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { +- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ n = (delta + llc_epoch_period - 1) / llc_epoch_period; + rq->cpu_epoch += n; +- rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ rq->cpu_epoch_next += n * llc_epoch_period; + __shr_u64(&rq->cpu_runtime, n); + } + +@@ -1432,7 +1494,7 @@ void 
account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * has only 1 thread, or has too many active threads, invalidate + * its preferred state. + */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout || + get_nr_threads(p) <= 1 || + exceed_llc_nr(mm, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { +@@ -9749,7 +9811,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + * (default: ~50%) + */ + #define fits_llc_capacity(util, max) \ +- ((util) * 100 < (max) * llc_overload_pct) ++ ((util) * 100 < (max) * get_sched_cache_cap_scale()) + + /* + * The margin used when comparing utilization. +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index b801d32d5fba..97e8558b0530 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2810,6 +2810,9 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern unsigned int llc_aggr_tolerance; ++extern unsigned int llc_epoch_period; ++extern unsigned int llc_epoch_affinity_timeout; + extern struct static_key_false sched_cache_allowed; + #endif + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-01-23-sched-cache-Introduce-infrastructure-for-cache-aware-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-01-23-sched-cache-Introduce-infrastructure-for-cache-aware-load-balancing.patch.skip new file mode 100644 index 0000000..30b2f3a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-01-23-sched-cache-Introduce-infrastructure-for-cache-aware-load-balancing.patch.skip @@ -0,0 +1,637 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3E1062EA481 + for ; Wed, 3 Dec 2025 23:01:19 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802883; cv=none; b=UKLk6Rg4Ag2RrVZTM6q83e57jrtOabhFLy87jTKdCORkErT5oscdmGvQFuZ8uzk4JddS6cPh1pfkZIjrorb34GjVrTfhTnjF3Ev1eA9P3f9SHm6a8HG5wxWf/yS25iz0NQWmXUw8INvgj0a9A56o6dRBuDjYNgK/XPE8bAKiBUg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802883; c=relaxed/simple; + bh=6xbRUXX8feoSk8bOjg/vcAGiqy4i78lNKWOOyysMsTg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ezTAzjYx2Rp52iZO2WWYVcoqrFo5k7CxRy+shLmmCt9X8OAnGBmN2eYuhkz/I7t0LW4rAjnmLXBSt4s5lKDI7cjNxUO/rV3B0EWqv13ojuB5QKkGvUXb3YGE9U0EUSc8TdruI55O35k40Uh0lNID1k89G7Dxb8VJ6Ckm0RWpbqE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=hXZ7RTSy; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="hXZ7RTSy" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + 
d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802880; x=1796338880; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=6xbRUXX8feoSk8bOjg/vcAGiqy4i78lNKWOOyysMsTg=; + b=hXZ7RTSym9fS1Xvrd9iY6zdRxiZpXzgeaEnDkbWt4E9kikaWOOGcUivi + QbmpWan09GqoanGn0S6Vft9B7BxCpebF9EW9KXpkUelSttyWWDfdj3/y5 + FTK7BCv2Ykd5RjEGqBmouxnoYSthhh0M052SACkie+UXmvYxcT/sOQCCX + HOsATO8B6T2nuON/L4dyuLl54HqVuf+JcbMOZ0ABnQ6ZFHGM/cCwqCXcJ + AmUI07y2Khz2g6thC1D3WG4YXreJSp+sT28iidXrCmaZBan6+WI286Msl + K0/hGg9Y68V2FBcOV+wIiAuy+MY5XGtKxf7nZIp0LSDOwP7fiuTJEB8U9 + g==; +X-CSE-ConnectionGUID: cpmnVUlITmyoLapwFs/Now== +X-CSE-MsgGUID: 1fVK363gQBOq9Aw6XgwrKg== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136182" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136182" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:18 -0800 +X-CSE-ConnectionGUID: d8cQS9oyRh+diLyesP0AjA== +X-CSE-MsgGUID: JRVFNz3/S1eHusPWEwHUNA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763734" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:18 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 01/23] sched/cache: Introduce infrastructure for cache-aware load balancing +Date: Wed, 3 Dec 2025 15:07:20 -0800 +Message-Id: <06f0d7edbc3185ec730b50b3b00d87ace44169b3.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: "Peter Zijlstra (Intel)" + +Adds infrastructure to enable cache-aware load balancing, +which improves cache locality by grouping tasks that share resources +within the same cache domain. This reduces cache misses and improves +overall data access efficiency. + +In this initial implementation, threads belonging to the same process +are treated as entities that likely share working sets. The mechanism +tracks per-process CPU occupancy across cache domains and attempts to +migrate threads toward cache-hot domains where their process already +has active threads, thereby enhancing locality. + +This provides a basic model for cache affinity. While the current code +targets the last-level cache (LLC), the approach could be extended to +other domain types such as clusters (L2) or node-internal groupings. + +At present, the mechanism selects the CPU within an LLC that has the +highest recent runtime. Subsequent patches in this series will use this +information in the load-balancing path to guide task placement toward +preferred LLCs. + +In the future, more advanced policies could be integrated through NUMA +balancing-for example, migrating a task to its preferred LLC when spare +capacity exists, or swapping tasks across LLCs to improve cache affinity. 
+Grouping of tasks could also be generalized from that of a process +to be that of a NUMA group, or be user configurable. + +Originally-by: Peter Zijlstra (Intel) +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Restore the original CPU scan to cover all online CPUs, + rather than scanning within the preferred NUMA node. + (Peter Zijlstra) + + Use rq->curr instead of rq->donor. (K Prateek Nayak) + + Minor fix in task_tick_cache() to use + if (mm->mm_sched_epoch >= rq->cpu_epoch) + to avoid mm_sched_epoch going backwards. + + include/linux/mm_types.h | 44 +++++++ + include/linux/sched.h | 11 ++ + init/Kconfig | 11 ++ + kernel/fork.c | 6 + + kernel/sched/core.c | 6 + + kernel/sched/fair.c | 258 +++++++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 8 ++ + 7 files changed, 344 insertions(+) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 90e5790c318f..1ea16ef90566 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -939,6 +939,11 @@ typedef struct { + DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); + } __private mm_flags_t; + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1029,6 +1034,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1487,6 +1503,34 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index b469878de25c..278b529c91df 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1406,6 +1406,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +@@ -2428,4 +2432,11 @@ extern void migrate_enable(void); + + DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + ++#ifdef CONFIG_SCHED_CACHE ++static inline bool sched_cache_enabled(void) ++{ ++ return false; ++} ++#endif ++ + #endif +diff --git a/init/Kconfig b/init/Kconfig +index cab3ad28ca49..88556ef8cfd1 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -983,6 +983,17 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware load balance" ++ default y ++ depends on SMP ++ help ++ When enabled, the scheduler will attempt to aggregate tasks from ++ the same process onto a single Last Level Cache (LLC) domain when ++ possible. This improves cache locality by keeping tasks that share ++ resources within the same cache domain, reducing cache misses and ++ lowering data access latency. 
++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index 3da0f08615a9..aae5053d1e30 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -680,6 +680,7 @@ void __mmdrop(struct mm_struct *mm) + cleanup_lazy_tlbs(mm); + + WARN_ON_ONCE(mm == current->active_mm); ++ mm_destroy_sched(mm); + mm_free_pgd(mm); + mm_free_id(mm); + destroy_context(mm); +@@ -1083,6 +1084,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1092,6 +1096,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index f754a60de848..e8bdf03a4b7f 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4488,6 +4488,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) + p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8791,6 +8792,11 @@ void __init sched_init(void) + + rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = jiffies; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 5b752324270b..cb82f558dc5b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1152,6 +1152,8 @@ void post_init_entity_util_avg(struct task_struct *p) + sa->runnable_avg = sa->util_avg; + } + ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); ++ + static s64 update_se(struct rq *rq, struct sched_entity *se) + { + u64 now = rq_clock_task(rq); +@@ -1174,6 +1176,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); ++ account_mm_sched(rq, running, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); +@@ -1193,6 +1196,259 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + return delta_exec; + } + ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. 
++ */ ++#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ ++#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ ++ ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = rq->cpu_epoch; ++ epoch = rq->cpu_epoch; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ /* ++ * The update to mm->pcpu_sched should not be reordered ++ * before initialization to mm's other fields, in case ++ * the readers may get invalid mm_sched_epoch, etc. ++ */ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. ++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ if (!sched_cache_enabled()) ++ return; ++ ++ if (p->sched_class != &fair_sched_class) ++ return; ++ /* ++ * init_task and kthreads don't having mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate its preferred state. 
++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ get_nr_threads(p) <= 1) { ++ if (mm->mm_sched_cpu != -1) ++ mm->mm_sched_cpu = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!sched_cache_enabled()) ++ return; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ /* avoid moving backwards */ ++ if (mm->mm_sched_epoch >= rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void __no_profile task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ unsigned long curr_m_a_occ = 0; ++ int cpu, m_a_cpu = -1; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ scoped_guard (cpus_read_lock) { ++ cpumask_copy(cpus, cpu_online_mask); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, i; ++ ++ if (!sd) ++ continue; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ } ++ ++ /* ++ * Compare the accumulated occupancy of each LLC. The ++ * reason for using accumulated occupancy rather than average ++ * per CPU occupancy is that it works better in asymmetric LLC ++ * scenarios. ++ * For example, if there are 2 threads in a 4CPU LLC and 3 ++ * threads in an 8CPU LLC, it might be better to choose the one ++ * with 3 threads. However, this would not be the case if the ++ * occupancy is divided by the number of CPUs in an LLC (i.e., ++ * if average per CPU occupancy is used). ++ * Besides, NUMA balancing fault statistics behave similarly: ++ * the total number of faults per node is compared rather than ++ * the average number of faults per CPU. This strategy is also ++ * followed here. ++ */ ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ curr_m_a_occ = a_occ; ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ if (m_a_occ > (2 * curr_m_a_occ)) { ++ /* ++ * Avoid switching mm_sched_cpu too fast. ++ * The reason to choose 2X is because: ++ * 1. It is better to keep the preferred LLC stable, ++ * rather than changing it frequently and cause migrations ++ * 2. 2X means the new preferred LLC has at least 1 more ++ * busy CPU than the old one(200% vs 100%, eg) ++ * 3. 2X is chosen based on test results, as it delivers ++ * the optimal performance gain so far. 
++ */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ + /* + * Used by other classes to account runtime. + */ +@@ -13124,6 +13380,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index adfb6e3409d7..84118b522f22 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1194,6 +1194,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3819,6 +3825,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + static inline +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-02-23-sched-cache-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-02-23-sched-cache-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip new file mode 100644 index 0000000..ad4bcfd --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-02-23-sched-cache-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip @@ -0,0 +1,229 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8872B2EC0A3 + for ; Wed, 3 Dec 2025 23:01:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802884; cv=none; b=PIVYWfHNGhpYcL5pUf5pbJV6z5GC4MufyMLaT00/IZT2eIAKxBzqzRglsyVDKa18ZuvGOOBF6720BmFO1QjbQTlm++JQNaJ2Li4EQo87RGn9XE96gbHXFQW46Ye00LdP+tH7Hh5mDSD6E7sACuXB9wl4PappMcJ/np+rPkSv+fk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802884; c=relaxed/simple; + bh=1tsEZhdWTsEDcQ9RmMyka/N/6UwyydH6Z8nvicoX744=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; b=HIfoZU9H/SZm0t6eE4dquYqikhNAFvY4+BXlcqSIZ3CtUZjOzIUSOC63YZp9YVMZHXi1YQfdjTLmXM4JflgdOMpsYGcmIdM9y97XnpuLltYZndJJ3UMie+BQAS7WTzwavGBbWlwvukQFWzaAt18tTAj+n7TvfZUbdaq3Hd/PwnQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) 
header.d=intel.com header.i=@intel.com header.b=gY2DTyL8; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="gY2DTyL8" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802881; x=1796338881; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=1tsEZhdWTsEDcQ9RmMyka/N/6UwyydH6Z8nvicoX744=; + b=gY2DTyL8yWnM8kjFl12irX509n24BDz4iFKCqM6WCNUCLXRN5a5IlNUP + CfYI2+/YpAT4bu6uPNEPMLhPBFM2XD4LK26owQYwXoYEFxXYOPyRzMCCr + rISEhzC11YficDTuxwWe3QvPX3HaXsnsqXtK9HLG/hiT6NfkxrHYuu33P + 2QVChiY0MqYwc1nvL417RDFrqZbCy7kRQLG02T5nK00USUuGMRvgZv+U3 + gt7oM5XlbDtNyyU+5sVU7KIViaRsZSfklkuYRaOOMQ39LYUdIFQ+Ue6G0 + EAocEYO+P59FhkDZmjjHTJ9I3dlRH+Fcb/w/MBdqObwG/r+XHEjXGxmQZ + g==; +X-CSE-ConnectionGUID: leGPfNk6R8KUwcSASjrrFg== +X-CSE-MsgGUID: gXlnURSyTm+Cie+BIy/27w== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136204" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136204" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:21 -0800 +X-CSE-ConnectionGUID: R/PsDZqXSOeZVekLwfPG7Q== +X-CSE-MsgGUID: gEHdD8uJSdmMqZ2X+NMi/w== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763741" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:20 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 02/23] sched/cache: Record per-LLC utilization to guide cache-aware scheduling decisions +Date: Wed, 3 Dec 2025 15:07:21 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +When a system becomes busy and a process’s preferred LLC is +saturated with too many threads, tasks within that LLC migrate +frequently. These in LLC migrations introduce latency and degrade +performance. To avoid this, task aggregation should be suppressed when +the preferred LLC is overloaded, which requires a metric to indicate +LLC utilization. + +Record per LLC utilization/cpu capacity during periodic load +balancing. These statistics will be used in later patches to decide +whether tasks should be aggregated into their preferred LLC. + +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Refine the comments in record_sg_llc_stats().(Peter Zijlstra). 
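[Editor's note — illustrative sketch, not part of the upstream patch or of this overlay. The commit message above describes caching each LLC's aggregate utilization and capacity in its shared domain structure during periodic load balancing, rewriting the shared fields only when the value actually changes so the shared cache line is not dirtied on every pass. The standalone userspace C sketch below models that writer/reader pattern with plain structs and hypothetical names (llc_stats, record_llc_stats, llc_is_busy); the kernel code instead uses sched_domain_shared with READ_ONCE()/WRITE_ONCE() under RCU.]

/* Standalone userspace sketch of the "record per-LLC utilization" idea. */
#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for the fields added to struct sched_domain_shared. */
struct llc_stats {
	unsigned long util_avg;   /* last recorded aggregate utilization */
	unsigned long capacity;   /* last recorded aggregate capacity   */
};

/*
 * Writer side (periodic load balance in the kernel): publish the freshly
 * computed group statistics, but only touch the shared fields when the
 * value changed, to avoid dirtying the shared cache line needlessly.
 */
static void record_llc_stats(struct llc_stats *s,
			     unsigned long group_util,
			     unsigned long group_capacity)
{
	if (s->util_avg != group_util)
		s->util_avg = group_util;
	if (s->capacity != group_capacity)
		s->capacity = group_capacity;
}

/* Reader side (used by later patches): is this LLC above the overload mark? */
static bool llc_is_busy(const struct llc_stats *s, unsigned int overload_pct)
{
	return s->util_avg * 100 >= s->capacity * overload_pct;
}

int main(void)
{
	struct llc_stats llc = { 0, 0 };

	record_llc_stats(&llc, 612, 1024);	/* ~60% utilized */
	printf("util=%lu cap=%lu busy=%d\n",
	       llc.util_avg, llc.capacity, llc_is_busy(&llc, 50));

	record_llc_stats(&llc, 307, 1024);	/* ~30% utilized */
	printf("util=%lu cap=%lu busy=%d\n",
	       llc.util_avg, llc.capacity, llc_is_busy(&llc, 50));
	return 0;
}

[End of editor's note; the original patch continues below.]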
+ + include/linux/sched/topology.h | 4 ++ + kernel/sched/fair.c | 69 ++++++++++++++++++++++++++++++++++ + 2 files changed, 73 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index bbcfdf12aa6e..0ba4697d74ba 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -68,6 +68,10 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++ unsigned long capacity ____cacheline_aligned_in_smp; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cb82f558dc5b..b9f336300f14 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9622,6 +9622,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* Called from load balancing paths with rcu_read_lock held */ ++static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = READ_ONCE(sd_share->capacity); ++ ++ return true; ++} ++#else ++static inline bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ return false; ++} ++#endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +@@ -10592,6 +10615,51 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Record the statistics for this scheduler group for later ++ * use. These values guide load balancing on aggregating tasks ++ * to a LLC. ++ */ ++static void record_sg_llc_stats(struct lb_env *env, ++ struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* Only care about sched domain spanning multiple LLCs */ ++ if (env->sd->child != rcu_dereference(per_cpu(sd_llc, env->dst_cpu))) ++ return; ++ ++ /* ++ * At this point we know this group spans a LLC domain. ++ * Record the statistic of this group in its corresponding ++ * shared LLC domain. ++ * Note: sd_share cannot be obtained via sd->child->shared, because ++ * it refers to the domain that covers the local group, while ++ * sd_share could represent any of the LLC group. ++ */ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (READ_ONCE(sd_share->util_avg) != sgs->group_util) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++ ++ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) ++ WRITE_ONCE(sd_share->capacity, sgs->group_capacity); ++} ++#else ++static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10681,6 +10749,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ record_sg_llc_stats(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-03-23-sched-cache-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-03-23-sched-cache-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip new file mode 100644 index 0000000..821d67b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-03-23-sched-cache-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip @@ -0,0 +1,333 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2F2692EBDDE + for ; Wed, 3 Dec 2025 23:01:23 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802885; cv=none; b=fboZZkFKPl6gqpHDoF2b6zbyblNamhVu+FcjT54t3oU8vxsb1XXezAqbDtyJgvQY5nilFQH3AKBGOohsQ/SQ3tX2mRk+BSCtjeqUEVqOw4w0dDc2wtmgFtlHa6V/L30IDsIjeiViMUZM4y4AiA82fvOBsu4+NJQNRAWoaUu83no= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802885; c=relaxed/simple; + bh=TakEXE1LpDhxRe/Kb7GWIrlVFYabIDFNwz7qIMJeAzA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=lWQIStOFn4iy99stFlHV/qSBEi3k7WL/GF8q0g3QeYxyAInDLMtgRyHdyj4lgpwV4+hcrGelSaLn9GQ314YsxP62kdg4igNnwsJ5I/UGLtE/m0W5/zOTgeJYpf5nNjxi042Eu8UJR3sDuMQmXljn/+2COvTOKDQkes+q8dJg4fs= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ZJw0lk7W; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ZJw0lk7W" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802883; x=1796338883; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=TakEXE1LpDhxRe/Kb7GWIrlVFYabIDFNwz7qIMJeAzA=; + b=ZJw0lk7W6RTQQr7pUyzeAA7+tFRn5rwcdkgzS49IJ6otxSXwAzwDWZIh + 72+xVH8b/09ZAgA4A4sjEOCcav+jAPzfD2L3N7AxSkmW/F8BHhBoUD3JQ + QbRstLbqNMnMwfrcQ+qBeU1Q3VwTeXm0rmxciTrI2u6z3GCHX79/Bxc9Y + tid45au2Oifch9e3/2xq9ljpUEYKZAVIVVPqiF3n86ssLv/OdDy75IUHo + 67RTdQeGc20OckklfmpRjpvC7cCT1mZKRlid3w67UBs6EEbQgCGzqXjOi + NdatFPNJvaFIWKoBtqpyQd9yFecmVzXENUGCr745w3Jqa3QUeXJGyO4fH + g==; +X-CSE-ConnectionGUID: /fs42l2aRamlkF2vGhwf9A== +X-CSE-MsgGUID: 2SoBy6EYTSqmAsupZBGzKg== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136230" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136230" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by 
orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:22 -0800 +X-CSE-ConnectionGUID: O845bRyGQ8Wd6MpeSkE96g== +X-CSE-MsgGUID: Cz70j2EQQ0GHWMf5pEgBBw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763752" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:22 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 03/23] sched/cache: Introduce helper functions to enforce LLC migration policy +Date: Wed, 3 Dec 2025 15:07:22 -0800 +Message-Id: <12e90c8c26c690b40e48cc1e03c785f2f99fafa8.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Cache-aware scheduling aggregates threads onto their preferred LLC, +mainly through load balancing. When the preferred LLC becomes +saturated, more threads are still placed there, increasing latency. +A mechanism is needed to limit aggregation so that the preferred LLC +does not become overloaded. + +Introduce helper functions can_migrate_llc() and +can_migrate_llc_task() to enforce the LLC migration policy: + + 1. Aggregate a task to its preferred LLC if both source and + destination LLCs are not too busy (<50% utilization), + or if doing so will not leave the preferred LLC much more + imbalanced than the non-preferred one (>20% utilization + difference, similar to imbalance_pct of the LLC domain). + 2. Allow moving a task from overloaded preferred LLC to a non preferred + LLC if this will not cause the non preferred LLC to become + too imbalanced to cause a later migration back. + 3. If both LLCs are too busy, let the generic load balance to spread + the tasks. + +Further (hysteresis)action could be taken in the future to prevent tasks +from being migrated into and out of the preferred LLC frequently (back and +forth): the threshold for migrating a task out of its preferred LLC should +be higher than that for migrating it into the LLC. + +Since aggregation tends to make the preferred LLC busier than others, +the imbalance tolerance is controlled by llc_imb_pct. If set to 0, +tasks may still aggregate to the preferred LLC as long as it is +not more utilized than the source LLC, preserving the preference. + +Co-developed-by: Tim Chen +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + No change. 
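[Editor's note — illustrative sketch, not part of the upstream patch or of this overlay. The helpers introduced below gate LLC aggregation with two thresholds: an LLC counts as busy once it exceeds llc_overload_pct (default 50%) of its capacity, and the destination may exceed the source by at most llc_imb_pct (default 20%). The standalone userspace C program below re-implements only the "task moves towards its preferred LLC" direction so a few cells of the decision matrix can be checked by hand; the two macros mirror those added in the patch, while to_pref_decision(), main() and the sample numbers are made up for illustration.]

/*
 * Standalone userspace sketch of the LLC migration gate, "towards the
 * preferred LLC" direction only.
 */
#include <stdio.h>

static unsigned int llc_overload_pct = 50;	/* LLC counts as busy above this */
static unsigned int llc_imb_pct = 20;		/* tolerated dst-vs-src imbalance */

/* Same shape as the macros introduced in the patch below. */
#define fits_llc_capacity(util, max) ((util) * 100 < (max) * llc_overload_pct)
#define util_greater(u1, u2)         ((u1) * 100 > (u2) * (100 + llc_imb_pct))

/* 'Y' = migrate, 'N' = forbid, 'G' = leave it to generic load balance. */
static char to_pref_decision(unsigned long src_util, unsigned long dst_util,
			     unsigned long cap, unsigned long tsk_util)
{
	if (!fits_llc_capacity(dst_util, cap) && !fits_llc_capacity(src_util, cap))
		return 'G';	/* both LLCs already busy: generic balancing */

	src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
	dst_util += tsk_util;

	if (!fits_llc_capacity(dst_util, cap) && util_greater(dst_util, src_util))
		return 'N';	/* would overload the preferred LLC too much */

	return 'Y';
}

int main(void)
{
	const unsigned long cap = 1024, task = 32;	/* task ~3% of one LLC */

	/* prints Y, N, G for these source/destination utilization pairs */
	printf("30%% -> 30%%: %c\n", to_pref_decision(307, 307, cap, task));
	printf("30%% -> 60%%: %c\n", to_pref_decision(307, 614, cap, task));
	printf("60%% -> 60%%: %c\n", to_pref_decision(614, 614, cap, task));
	return 0;
}

[End of editor's note; the original patch continues below.]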
+ + kernel/sched/fair.c | 153 +++++++++++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 5 ++ + 2 files changed, 158 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b9f336300f14..710ed9943d27 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + #define EPOCH_PERIOD (HZ / 100) /* 10 ms */ + #define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ + ++__read_mostly unsigned int llc_overload_pct = 50; ++__read_mostly unsigned int llc_imb_pct = 20; ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -9623,6 +9626,27 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + } + + #ifdef CONFIG_SCHED_CACHE ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter llc_overload_pct determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * llc_overload_pct) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + llc_imb_pct)) ++ + /* Called from load balancing paths with rcu_read_lock held */ + static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +@@ -9638,6 +9662,135 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + + return true; + } ++ ++/* ++ * Decision matrix according to the LLC utilization. To ++ * decide whether we can do task aggregation across LLC. ++ * ++ * By default, 50% is the threshold to treat the LLC as busy, ++ * and 20% is the utilization imbalance percentage to decide ++ * if the preferred LLC is busier than the non-preferred LLC. ++ * The hysteresis is used to avoid task bouncing between the ++ * preferred LLC and the non-preferred LLC. ++ * ++ * 1. moving towards the preferred LLC, dst is the preferred ++ * LLC, src is not. ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% Y Y Y N ++ * 40% Y Y Y Y ++ * 50% Y Y G G ++ * 60% Y Y G G ++ * ++ * 2. moving out of the preferred LLC, src is the preferred ++ * LLC, dst is not: ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% N N N N ++ * 40% N N N N ++ * 50% N N G G ++ * 60% Y N G G ++ * ++ * src : src_util ++ * dst : dst_util ++ * Y : Yes, migrate ++ * N : No, do not migrate ++ * G : let the Generic load balance to even the load. ++ * ++ * The intention is that if both LLCs are quite busy, cache aware ++ * load balance should not be performed, and generic load balance ++ * should take effect. However, if one is busy and the other is not, ++ * the preferred LLC capacity(50%) and imbalance criteria(20%) should ++ * be considered to determine whether LLC aggregation should be ++ * performed to bias the load towards the preferred LLC. ++ */ ++ ++/* migration decision, 3 states are orthogonal. */ ++enum llc_mig { ++ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ ++ mig_llc, /* Y: Do LLC preference based migration */ ++ mig_unrestricted /* G: Don't restrict generic load balance migration */ ++}; ++ ++/* ++ * Check if task can be moved from the source LLC to the ++ * destination LLC without breaking cache aware preferrence. 
++ * src_cpu and dst_cpu are arbitrary CPUs within the source ++ * and destination LLCs, respectively. ++ */ ++static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_unrestricted; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_unrestricted; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * llc_imb_pct is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. ++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_llc; ++} ++ ++/* ++ * Check if task p can migrate from source LLC to ++ * destination LLC in terms of cache aware load balance. ++ */ ++static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ bool to_pref; ++ int cpu; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_unrestricted; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_unrestricted; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ to_pref = true; ++ else if (cpus_share_cache(src_cpu, cpu)) ++ to_pref = false; ++ else ++ return mig_unrestricted; ++ ++ return can_migrate_llc(src_cpu, dst_cpu, ++ task_util(p), to_pref); ++} ++ + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 84118b522f22..bf72c5bab506 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2828,6 +2828,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int llc_overload_pct; ++extern unsigned int llc_imb_pct; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-04-23-sched-cache-Make-LLC-id-continuous.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-04-23-sched-cache-Make-LLC-id-continuous.patch.skip new file mode 100644 index 0000000..6926073 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-04-23-sched-cache-Make-LLC-id-continuous.patch.skip @@ -0,0 +1,257 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 55E9C2EC08D + for ; Wed, 3 Dec 2025 23:01:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none 
smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802886; cv=none; b=d3VrPdnnjHo1v15INzZi2Be9GCCRZHIzY8RvdjoDE/lVfQN7C6RgefM63jeAgMs+Ej4xBAgNM48bikZgcfBK97s516BGyLXX1Rbvhsn/lxdjOTLJb7/BzUSsXmqizKiXSV4Q40vVu+4KUJUTuTrw0EcRJX7axQAupxl66/Njl7g= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802886; c=relaxed/simple; + bh=/WXShEpYiDAFPDra61vUPdbNcgE+VqMlav+UUM59jU0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=sp6UO1OW6Q3DioPA4TyMAxm2w7jZEWfXn+BecCi+DY63bhyHNOAdo2gxE9qPcZ4H/AG5K6vG0sVgNdh5TPmn2YDZ1M3oPRXJYAPeKE66XGC3smKX35V4ctG4LeLd8SIPZYPGBwl8SDEjENvTH1Cw9AGh2YoAZb6Q6CfS4bRt+vY= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=mJ6yn4qm; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="mJ6yn4qm" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802884; x=1796338884; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=/WXShEpYiDAFPDra61vUPdbNcgE+VqMlav+UUM59jU0=; + b=mJ6yn4qmNCzGpuUPMZdx+lsUqY/Y8q397TD5tze5hB735PCmFim3TtR3 + Eh74z+kUDoOPtNaJnMct+g67IgKwnq6+WYRbc+f3oEEw9Wg1Gcg9yN7oU + vI5Oubm8s7zVFVo1CwCylUT7AAgUyeA+NaPz/BoikrttCBobaJqnnubeC + HmGkKxv21UFMqlb7bdh2Dv1ZUBuQd/5iPTCr2He8Z4My1BxTJHc0KlROt + IrrMfarEIQ6kjL275GsASGznmrL05FEBJGY2at3hHLlbpnBR+lPPkEK0Y + B/H+e/fK9u8hElcLfWPp6Axh3PPWmX2TiXZI/s6f1Be/ZF/FgJXPpYRSc + Q==; +X-CSE-ConnectionGUID: G2tkFvPIT6SY1+ZRXOxXBw== +X-CSE-MsgGUID: uYGT49/IQCatA5I80r2gog== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136249" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136249" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:23 -0800 +X-CSE-ConnectionGUID: uiDsRffYTdabgXsA6tZowg== +X-CSE-MsgGUID: ym+MS4XuQPSAYB5T1Atjpw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763756" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:23 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 04/23] sched/cache: Make LLC id continuous +Date: Wed, 3 Dec 2025 15:07:23 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce an index mapping between CPUs and their LLCs. This provides +a continuous per LLC index needed for cache-aware load balancing in +later patches. + +The existing per_cpu llc_id usually points to the first CPU of the +LLC domain, which is sparse and unsuitable as an array index. Using +llc_id directly would waste memory. + +With the new mapping, CPUs in the same LLC share a continuous id: + + per_cpu(llc_id, CPU=0...15) = 0 + per_cpu(llc_id, CPU=16...31) = 1 + per_cpu(llc_id, CPU=32...47) = 2 + ... + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Convert the static LLC id to be allocated sequentially as LLCs are + discovered, and replace the old sd_llc_id. (Peter Zijlstra) + + kernel/sched/fair.c | 9 ++++++- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 60 +++++++++++++++++++++++++++++++++++++++-- + 3 files changed, 67 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 710ed9943d27..0a3918269906 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20; + + static int llc_id(int cpu) + { ++ int llc; ++ + if (cpu < 0) + return -1; + +- return per_cpu(sd_llc_id, cpu); ++ llc = per_cpu(sd_llc_id, cpu); ++ /* avoid race with cpu hotplug */ ++ if (unlikely(llc >= max_llcs)) ++ return -1; ++ ++ return llc; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index bf72c5bab506..728737641847 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2075,6 +2075,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 444bdfdab731..f25d950ab015 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -17,6 +17,8 @@ void sched_domains_mutex_unlock(void) + mutex_unlock(&sched_domains_mutex); + } + ++int max_llcs; ++ + /* Protected by sched_domains_mutex: */ + static cpumask_var_t sched_domains_tmpmask; + static cpumask_var_t sched_domains_tmpmask2; +@@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++/* ++ * Assign continuous llc id for the CPU, and return ++ * the assigned llc id. 
++ */ ++static int update_llc_id(struct sched_domain *sd, ++ int cpu) ++{ ++ int id = per_cpu(sd_llc_id, cpu), i; ++ ++ if (id >= 0) ++ return id; ++ ++ if (sd) { ++ /* Look for any assigned id and reuse it.*/ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ id = per_cpu(sd_llc_id, i); ++ ++ if (id >= 0) { ++ per_cpu(sd_llc_id, cpu) = id; ++ return id; ++ } ++ } ++ } ++ ++ /* ++ * When 1. there is no id assigned to this LLC domain, ++ * or 2. the sd is NULL, we reach here. ++ * Consider the following scenario, ++ * CPU0~CPU95 are in the node0, CPU96~CPU191 are ++ * in the node1. During bootup, maxcpus=96 is ++ * appended. ++ * case 1: When running cpu_attach_domain(CPU24) ++ * during boot up, CPU24 is the first CPU in its ++ * non-NULL LLC domain. However, ++ * its corresponding llc id has not been assigned yet. ++ * ++ * case 2: After boot up, the CPU100 is brought up ++ * via sysfs manually. As a result, CPU100 has only a ++ * Numa domain attached, because CPU100 is the only CPU ++ * of a sched domain, all its bottom domains are degenerated. ++ * The LLC domain pointer sd is NULL for CPU100. ++ * ++ * For both cases, we want to increase the number of LLCs. ++ */ ++ per_cpu(sd_llc_id, cpu) = max_llcs++; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -677,14 +728,13 @@ static void update_top_cache_domain(int cpu) + + sd = highest_flag_domain(cpu, SD_SHARE_LLC); + if (sd) { +- id = cpumask_first(sched_domain_span(sd)); + size = cpumask_weight(sched_domain_span(sd)); + sds = sd->shared; + } + + rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_size, cpu) = size; +- per_cpu(sd_llc_id, cpu) = id; ++ id = update_llc_id(sd, cpu); + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + + sd = lowest_flag_domain(cpu, SD_CLUSTER); +@@ -2488,6 +2538,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++ /* first scan of LLCs */ ++ if (!max_llcs) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_id, i) = -1; ++ } ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-05-23-sched-cache-Assign-preferred-LLC-ID-to-processes.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-05-23-sched-cache-Assign-preferred-LLC-ID-to-processes.patch.skip new file mode 100644 index 0000000..9eeeb42 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-05-23-sched-cache-Assign-preferred-LLC-ID-to-processes.patch.skip @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 679562ECEBB + for ; Wed, 3 Dec 2025 23:01:26 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802888; cv=none; b=ETXMSycIjg3hW2uD7ktvuDRCwlm80jzWlfuybxMLSJjuPv1gOLZC1i6pxE62EG9+cDFAU1hLySS0z9EjoSW7h+IC9WTpkMIZz2geJs1QP3R/eObNqU3OG+yETt/G54TGksleKQ7hmlJH6AIkTyDQ9XdCc+AMJOzQkCsvN6AteuA= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802888; c=relaxed/simple; + bh=0AOJ8UhIDlWuve34OwSELAi4hyIDL68J1uZ46Rj5j/U=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=oT6l76w5OE/CgwV2buKuyAjl0MI2Q/KFcNiA5tSmBm5YfGauRJZvP4km+gtrjR5EEwXVgaCsan/LhKN6+lL1MozMs4acvCaZOIR7MI0TH1a6DN/iL60iGgK73IOwTgFjrIfIZLKuBBoFD14Z4gbqwWYyV8VrRWfEVNe6RZksId4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MYgWTb60; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MYgWTb60" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802886; x=1796338886; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=0AOJ8UhIDlWuve34OwSELAi4hyIDL68J1uZ46Rj5j/U=; + b=MYgWTb60T6yG49rQ3nLnjAfGEf6N3B3x0R1ujoF4MP+f6thBTMMmV5A2 + 6gtXPzCButviIBBCpY7AZSw3brie2XhnzEv9X/ke/XBPmw9iwTMQXM9o0 + iuW5LJjdLixT+ECza7WcFjH4T9QTfvwhG/w9TZhOFFXAm15dszIkONvBa + SXqv+2sjbXByYYFdX59mzr/UJBdZJP29/Qsoq52Bq39LKfBUjAIOaxdni + O3Dd1ftGoYiiVuFKxIPrD6KHkaSzbffy0qzla2yFfiBHwoJt7cDfE6IuV + V+N5yhbYcGH4NZwhO7yAb7il3S4WiOKkWeUjmgInRdyyz/X833IZzWB7j + A==; +X-CSE-ConnectionGUID: kjwyz+xTQ4WR9aHYObBogw== +X-CSE-MsgGUID: zaCpH7h5Qxu4sNwXX6Krdg== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136266" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136266" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:25 -0800 +X-CSE-ConnectionGUID: alNeGVnjSJOwjLHIJ4OuIg== +X-CSE-MsgGUID: 5cfoA8m6Su+MIcnd+RQoGg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763763" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:25 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes +Date: Wed, 3 Dec 2025 15:07:24 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +With cache-aware scheduling enabled, each task is assigned a +preferred LLC ID. This allows quick identification of the LLC domain +where the task prefers to run, similar to numa_preferred_nid in +NUMA balancing. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Align preferred LLC with NUMA balancing's preferred node. 
+ + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 18 ++++++++++++++++++ + 3 files changed, 22 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 278b529c91df..1ad46220cd04 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1408,6 +1408,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index a55e2189206f..44bae72b5b7d 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -191,6 +191,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0a3918269906..10cec83f65d5 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1300,6 +1300,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + if (!sched_cache_enabled()) + return; +@@ -1330,6 +1331,23 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } ++ ++ if (mm->mm_sched_cpu != -1) { ++ mm_sched_llc = llc_id(mm->mm_sched_cpu); ++ ++#ifdef CONFIG_NUMA_BALANCING ++ /* ++ * Don't assign preferred LLC if it ++ * conflicts with NUMA balancing. ++ */ ++ if (p->numa_preferred_nid >= 0 && ++ cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid) ++ mm_sched_llc = -1; ++#endif ++ } ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-06-23-sched-cache-Track-LLC-preferred-tasks-per-runqueue.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-06-23-sched-cache-Track-LLC-preferred-tasks-per-runqueue.patch.skip new file mode 100644 index 0000000..da576b4 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-06-23-sched-cache-Track-LLC-preferred-tasks-per-runqueue.patch.skip @@ -0,0 +1,289 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C93962EF652 + for ; Wed, 3 Dec 2025 23:01:27 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802890; cv=none; b=EDsiu7g2BtXvvoS9BKwrirW/B8ldDhmwGPx+cdJzoxBtklhxCuicf7XZFi+5IO9eicj+U0q988drhlH0OJjM+IwUt0amTGbw3mfM6d+6WZDelOH8Kc3PIbWBuITzHpbg31UVRdkj3UEviuqp+uvpMTrssPknIugATiCNu3Bm+08= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802890; c=relaxed/simple; + bh=VfUUqC84e+k4dM9OCiHr0qSll3wkyw96Z2hiwhlrd+g=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=bMgyCWF3/XMpBtns9xgAQbuvJYsQoxOLy5qU1v3Ure2zyH7eaHG4ZLbKyqgBn1NINjkU2O0RPcPn7whkPdiyLRm36oluEWQ4viCDhC3YxOj/EZYMjqKw4E92UmhMBk5j0NYcW2RvXkMIEQxCZjUg4qUDiMfwP1eraXWWdJgmvkk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; 
dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=HkSBtET5; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="HkSBtET5" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802887; x=1796338887; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=VfUUqC84e+k4dM9OCiHr0qSll3wkyw96Z2hiwhlrd+g=; + b=HkSBtET5tJuyrYVLfwF6tgrJB3jPRTx01PEveXBF1wIqsiOJxkXhzAOm + sC1smgQCW8wgJyR4E1u9VSEyU2s5OeGIeEuC988/p/oKmWX8sR4t5I1+Q + tI0jgAIHPovP+AIphgRpysIDP7uveWJciGMii/zPUANlnHxP4W7VRq2eJ + sBFqpGeZy1Ve8fewNRoxQswiP1fA+sTe9iwHVjtYcP+1v4kzgt4NxJNt7 + wXwMA6vcMf7L8X5pDnsHkNo+K4j1B34n8SEcNJu9+4em9z3ghkY3MGzod + zaVcGH6lY2mH/znHiuVlkKaau6etkJB5XXnU6Zdt6/ZSkCkDGyN6SoMYU + A==; +X-CSE-ConnectionGUID: k1qC8aFmROqogX9M8c8KUg== +X-CSE-MsgGUID: rOmGsn1SSNWPs0ITFZygdw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136288" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136288" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:27 -0800 +X-CSE-ConnectionGUID: cvzDuwr8RH6q0DcIt04zOw== +X-CSE-MsgGUID: uxGA3PlMTN6liJURHzFh/g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763775" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:26 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 06/23] sched/cache: Track LLC-preferred tasks per runqueue +Date: Wed, 3 Dec 2025 15:07:25 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +For each runqueue, track the number of tasks with an LLC preference +and how many of them are running on their preferred LLC. This mirrors +nr_numa_running and nr_preferred_running for NUMA balancing, and will +be used by cache-aware load balancing in later patches. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Invoke task_of() once and reuse its result afterwards. 
+ (Peter Zijlstra) + Remove hacky reset_llc_stats() and introduce sched_llc_active flag + to properly pair enqueue/dequeue statistics update (Peter Zijlstra, K Prateek Nayak) + + include/linux/sched.h | 2 ++ + init/init_task.c | 1 + + kernel/sched/core.c | 5 ++++ + kernel/sched/fair.c | 60 ++++++++++++++++++++++++++++++++++++++++--- + kernel/sched/sched.h | 6 +++++ + 5 files changed, 71 insertions(+), 3 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 1ad46220cd04..466ba8b7398c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1408,6 +1408,8 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ /*the p is currently refcounted in a rq's preferred llc stats*/ ++ bool sched_llc_active; + int preferred_llc; + #endif + +diff --git a/init/init_task.c b/init/init_task.c +index 44bae72b5b7d..ee78837b0aa2 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -192,6 +192,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_faults = NULL, + #endif + #ifdef CONFIG_SCHED_CACHE ++ .sched_llc_active = false, + .preferred_llc = -1, + #endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index e8bdf03a4b7f..48626c81ba8e 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -531,6 +531,11 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 10cec83f65d5..d46a70a9d9fb 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1223,6 +1223,43 @@ static int llc_id(int cpu) + return llc; + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ int pref_llc; ++ ++ if (!sched_cache_enabled()) ++ return; ++ ++ pref_llc = p->preferred_llc; ++ if (pref_llc < 0) ++ return; ++ ++ rq->nr_llc_running++; ++ rq->nr_pref_llc_running += (pref_llc == task_llc(p)); ++ p->sched_llc_active = true; ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ int pref_llc; ++ ++ /* ++ * Borrow the uc_se->active from uclamp_rq_inc_id(), ++ * uclamp_rq_dec_id() to avoid the unbalanced calculation ++ * of rq statistics. 
++ */ ++ if (unlikely(!p->sched_llc_active)) ++ return; ++ ++ pref_llc = p->preferred_llc; ++ if (pref_llc < 0) ++ return; ++ ++ rq->nr_llc_running--; ++ rq->nr_pref_llc_running -= (pref_llc == task_llc(p)); ++ p->sched_llc_active = false; ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1294,6 +1331,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); + } + ++static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { +@@ -1346,8 +1385,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + #endif + } + +- if (p->preferred_llc != mm_sched_llc) ++ /* task not on rq accounted later in account_entity_enqueue() */ ++ if (task_running_on_cpu(rq->cpu, p) && ++ p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1475,6 +1519,10 @@ void init_sched_mm(struct task_struct *p) { } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} ++ + #endif + + /* +@@ -3965,9 +4013,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + update_load_add(&cfs_rq->load, se->load.weight); + if (entity_is_task(se)) { ++ struct task_struct *p = task_of(se); + struct rq *rq = rq_of(cfs_rq); + +- account_numa_enqueue(rq, task_of(se)); ++ account_numa_enqueue(rq, p); ++ account_llc_enqueue(rq, p); + list_add(&se->group_node, &rq->cfs_tasks); + } + cfs_rq->nr_queued++; +@@ -3978,7 +4028,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + update_load_sub(&cfs_rq->load, se->load.weight); + if (entity_is_task(se)) { +- account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ struct task_struct *p = task_of(se); ++ struct rq *rq = rq_of(cfs_rq); ++ ++ account_numa_dequeue(rq, p); ++ account_llc_dequeue(rq, p); + list_del_init(&se->group_node); + } + cfs_rq->nr_queued--; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 728737641847..ee8b70647835 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1126,6 +1126,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +@@ -1980,6 +1984,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++int task_llc(const struct task_struct *p); ++ + static inline void + queue_balance_callback(struct rq *rq, + struct balance_callback *head, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-07-23-sched-cache-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-07-23-sched-cache-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip new file mode 100644 index 0000000..56edbcb --- /dev/null +++ 
b/sys-kernel/gentoo-sources-6.18/PATCH-v2-07-23-sched-cache-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip @@ -0,0 +1,293 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7F2A72F0696 + for ; Wed, 3 Dec 2025 23:01:29 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802892; cv=none; b=JuI4HP7FPjUZRRvIF57U5a+nyKFVaejSLBjOwb2o4K+dyMy+TzvS6alNai1tmhDlx/F2kpTdrbKJxXsp0ye0xTv9vWh98FuHcXDXimNg3p+EZ0AClnIocNRkMFznzOXiGUgsNO6KJzOsOmRV7MqRji4PoMn2fV9YYulhopDCdW0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802892; c=relaxed/simple; + bh=Kgfm8ZrVAem+cuIFSErLp11pWO+uaVSLeCf068dctEI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=oWgrj/+vmbD4ydPoKoPApP5RMU0UBhjF4mxsiADMVL/t5AARqr//6C8rqPkshWdzhhrhMPF1AzqYud7ZATo+YBem2D9OjWwAWcvEU+adG0BNbDeKX0F/tFC7FpYkxBtH1K1PhGVx8OIwbNowGJZ5W0OZkvMWwyvk09t3vXbHMn4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Ff+wBHml; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Ff+wBHml" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802889; x=1796338889; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Kgfm8ZrVAem+cuIFSErLp11pWO+uaVSLeCf068dctEI=; + b=Ff+wBHmlGI9Ls+hPQ/icfiRSQpZE9xFA2dMFUkAvN4HoLDq9rsPxZPeZ + 8VRONCVnKKzfdp0/tx6ByohayUgQnukEiUM/5FG80edcOUwn8pLvcV6CD + rsakyGnOPLHSStQkG1+f0q6DnjhqobEUdJaywwMsE54fftDticAbLprId + 3bhB2AwAPJQjK37rs0/N96in+m4FjW7qil9FvPJrQKe2CXx6Vw8vc05XH + UOnoKjT+4VoaXotKSh3uNxjPZTKFSxLyHcD1a3z71R7y9pyahaHenJnCZ + 3UkyBEcsW2m1c1Cx8k4IAc/bj/uxMr+zGfxYNNEZL+3nmX/2zLcKYH7UG + w==; +X-CSE-ConnectionGUID: XVdRsMs0TMKO/Xjz8IoNBA== +X-CSE-MsgGUID: 8lt8Jb1nTZqY7huRsHsSQw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136318" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136318" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:28 -0800 +X-CSE-ConnectionGUID: HuS/ZH/YT/Cjm+dSF3UiRw== +X-CSE-MsgGUID: YDDbEJCdQwCGXEo7YEaNTQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763787" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:28 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter +Date: Wed, 3 Dec 2025 15:07:26 -0800 +Message-Id: <63091f7ca7bb473fbc176af86a87d27a07a6e149.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Each runqueue is assigned an array where each element tracks +the number of tasks preferring a given LLC, indexed from 0 to +max_llcs - 1. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3. + +The load balancer can use this information to identify busy +runqueues and migrate tasks to their preferred LLC domains. +This array will be reallocated at runtime if the number of LLCs +increases due to CPU hotplug. Only extending the buffer(rather +than shrinking it) is supported to simplify the implementation. + +Introduce the buffer allocation mechanism, and the statistics +will be calculated in the subsequent patch. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Remove static allocation of per runqueue LLC preference arrays. + Allocate array size to the actual number of LLCs online. 
(Peter Zijlstra, Madadi Vineeth Reddy) + + kernel/sched/core.c | 1 + + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 117 +++++++++++++++++++++++++++++++++++++++- + 3 files changed, 118 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 48626c81ba8e..ce533dc485f5 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8800,6 +8800,7 @@ void __init sched_init(void) + #ifdef CONFIG_SCHED_CACHE + raw_spin_lock_init(&rq->cpu_epoch_lock); + rq->cpu_epoch_next = jiffies; ++ rq->nr_pref_llc = NULL; + #endif + + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index ee8b70647835..8f2a779825e4 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1129,6 +1129,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int *nr_pref_llc; + #endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index f25d950ab015..d583399fc6a1 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -17,8 +17,121 @@ void sched_domains_mutex_unlock(void) + mutex_unlock(&sched_domains_mutex); + } + ++/* the number of max LLCs being detected */ ++static int new_max_llcs; ++/* the current number of max LLCs */ + int max_llcs; + ++#ifdef CONFIG_SCHED_CACHE ++ ++static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc) ++{ ++ unsigned int *new = NULL; ++ ++ new = kcalloc(new_max_llcs, sizeof(unsigned int), ++ GFP_KERNEL | __GFP_NOWARN); ++ ++ if (!new) { ++ *gc = NULL; ++ } else { ++ /* ++ * Place old entry in garbage collector ++ * for later disposal. ++ */ ++ *gc = old; ++ } ++ return new; ++} ++ ++static void populate_new_pref_llcs(unsigned int *old, unsigned int *new) ++{ ++ int i; ++ ++ if (!old) ++ return; ++ ++ for (i = 0; i < max_llcs; i++) ++ new[i] = old[i]; ++} ++ ++static int resize_llc_pref(void) ++{ ++ unsigned int *__percpu *tmp_llc_pref; ++ int i, ret = 0; ++ ++ if (new_max_llcs <= max_llcs) ++ return 0; ++ ++ /* ++ * Allocate temp percpu pointer for old llc_pref, ++ * which will be released after switching to the ++ * new buffer. ++ */ ++ tmp_llc_pref = alloc_percpu_noprof(unsigned int *); ++ if (!tmp_llc_pref) ++ return -ENOMEM; ++ ++ for_each_present_cpu(i) ++ *per_cpu_ptr(tmp_llc_pref, i) = NULL; ++ ++ /* ++ * Resize the per rq nr_pref_llc buffer and ++ * switch to this new buffer. ++ */ ++ for_each_present_cpu(i) { ++ struct rq_flags rf; ++ unsigned int *new; ++ struct rq *rq; ++ ++ rq = cpu_rq(i); ++ new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i)); ++ if (!new) { ++ ret = -ENOMEM; ++ ++ goto release_old; ++ } ++ ++ /* ++ * Locking rq ensures that rq->nr_pref_llc values ++ * don't change with new task enqueue/dequeue ++ * when we repopulate the newly enlarged array. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ populate_new_pref_llcs(rq->nr_pref_llc, new); ++ rq->nr_pref_llc = new; ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++release_old: ++ /* ++ * Load balance is done under rcu_lock. ++ * Wait for load balance before and during resizing to ++ * be done. They may refer to old nr_pref_llc[] ++ * that hasn't been resized. 
++ */ ++ synchronize_rcu(); ++ for_each_present_cpu(i) ++ kfree(*per_cpu_ptr(tmp_llc_pref, i)); ++ ++ free_percpu(tmp_llc_pref); ++ ++ /* succeed and update */ ++ if (!ret) ++ max_llcs = new_max_llcs; ++ ++ return ret; ++} ++ ++#else ++ ++static int resize_llc_pref(void) ++{ ++ max_llcs = new_max_llcs; ++ return 0; ++} ++ ++#endif ++ + /* Protected by sched_domains_mutex: */ + static cpumask_var_t sched_domains_tmpmask; + static cpumask_var_t sched_domains_tmpmask2; +@@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd, + * + * For both cases, we want to increase the number of LLCs. + */ +- per_cpu(sd_llc_id, cpu) = max_llcs++; ++ per_cpu(sd_llc_id, cpu) = new_max_llcs++; + + return per_cpu(sd_llc_id, cpu); + } +@@ -2674,6 +2787,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + ++ resize_llc_pref(); ++ + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-08-23-sched-cache-Calculate-the-per-runqueue-task-LLC-preference.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-08-23-sched-cache-Calculate-the-per-runqueue-task-LLC-preference.patch.skip new file mode 100644 index 0000000..4706f26 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-08-23-sched-cache-Calculate-the-per-runqueue-task-LLC-preference.patch.skip @@ -0,0 +1,142 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id D24952F0C5B + for ; Wed, 3 Dec 2025 23:01:30 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802893; cv=none; b=oh6ql8wRtQTKo8nnK9dUK9t3JsNVUN1SrqTTOLrpZpUDsIKZ+qt9qst5oOs9c5FDd2R9eecOFriCSP4q8iJw0WZIClfw/A2n3lz9QanZX0TndqedBRildmD/ptw2VXSsbXzzCrUFl3ehtEIBnQQqE0gyq5YyFY1waemEa1gZMq0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802893; c=relaxed/simple; + bh=ubwbCrnLe+FpFs84fmQJ8NDFPPh85CKovnWcqS4HszM=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Q4hmeOs7hnwqOE8JDGvxpGVeABvVS45aiDvLk6ZpSrPGuTfn+4YcfZc0AFuBMnvnutRPD41rCA1to3LTp3U/rg4Ky2sVe8bcd4xUTzxW+ljCc0tBYewYHhc60QRARoN5k0NGQJalWwDG5Ur5+u4g9f7uSgwIhh8HrXiFwlORSOs= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ngl+EBZ5; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ngl+EBZ5" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802890; x=1796338890; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ubwbCrnLe+FpFs84fmQJ8NDFPPh85CKovnWcqS4HszM=; + 
b=ngl+EBZ5jKSYuF1GoScWtzUvUawQCvSqX6vXeypzCig51al5M6EFhEW6 + 6ZPkta/KDGc5tm3cLZAn+Q0r4sAGXevBcvNbEeEF94NWh0Q5o4Qi40yoE + 6fENyQt6WsIYC5Biv3AXCHk/Ns+vA3D+5k8K971vxD5ci0G6jwAhua/Ip + V4EYKsxzhnY36WL45Wqmck026Nhmf3XpLNt/wYGNgwSMFF7INI6pnMGxW + qdO3IW9AZPldmpFj84igpzlIJMlsU2GHA5/5/K1uwnar4bbN3Va12Jz5l + CXyXS2But8o6/1q/DIrjmb1ErBv9PahFCMwFzVlsm1m+7SCCYHQiWGEv4 + A==; +X-CSE-ConnectionGUID: 1JiC3BwvQxSVrDu/qUQ3kA== +X-CSE-MsgGUID: lKtqqExhQ5+Xp1L40Y0AnQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136340" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136340" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:30 -0800 +X-CSE-ConnectionGUID: lyR4BYigTY+QEoG74KEnKQ== +X-CSE-MsgGUID: lE96dcq2TgepzkLPnZNnrg== +X-Ironport-Invalid-End-Of-Message: True +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763795" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:30 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 08/23] sched/cache: Calculate the per runqueue task LLC preference +Date: Wed, 3 Dec 2025 15:07:27 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Calculate the number of tasks' LLC preferences for each runqueue. +This statistic is computed during task enqueue and dequeue +operations, and is used by the cache-aware load balancing. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Split from previous patch for easier review. 
+ + kernel/sched/fair.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d46a70a9d9fb..b0e87616e377 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1231,11 +1231,12 @@ static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + return; + + pref_llc = p->preferred_llc; +- if (pref_llc < 0) ++ if (pref_llc < 0 || pref_llc >= max_llcs) + return; + + rq->nr_llc_running++; + rq->nr_pref_llc_running += (pref_llc == task_llc(p)); ++ rq->nr_pref_llc[pref_llc]++; + p->sched_llc_active = true; + } + +@@ -1252,11 +1253,12 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + return; + + pref_llc = p->preferred_llc; +- if (pref_llc < 0) ++ if (pref_llc < 0 || pref_llc >= max_llcs) + return; + + rq->nr_llc_running--; + rq->nr_pref_llc_running -= (pref_llc == task_llc(p)); ++ rq->nr_pref_llc[pref_llc]--; + p->sched_llc_active = false; + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-09-23-sched-cache-Count-tasks-prefering-destination-LLC-in-a-sched-group.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-09-23-sched-cache-Count-tasks-prefering-destination-LLC-in-a-sched-group.patch.skip new file mode 100644 index 0000000..0efcbfc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-09-23-sched-cache-Count-tasks-prefering-destination-LLC-in-a-sched-group.patch.skip @@ -0,0 +1,160 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 761A92F12BE + for ; Wed, 3 Dec 2025 23:01:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802894; cv=none; b=VOvR4Yo5MT+v4vvHJHnJrL04tUMLwfYbb4+GQbWJ3QO13hC1zjlHArO6dzcuGLllayHXLBw43BKllYMjOKohjC7Fzd9T9m3hYmCRq3WLpZzHqcCQuO2JcQTdEeD/rjnDRhN1lGZeCfQEi5WHKdPb8iHSUPG9WfZsKEu6JozCWHQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802894; c=relaxed/simple; + bh=Hwjod13ydyBeyAl1Bc0MaWee5egwZS7IehFiRUr+3EU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Qm3SflXuxBKuYopJgqhcvipXf7FPYSYSF15V5hLWMr9nUpsAfdv+d2spbB0P7Tw1LmX/zkoTpJ7guZJ5VbPuMzy9Baf9HL/h+ZfC7oU8NJtxgafnNNwl0O1u1CDaxlhc7yoqMW17JyUgVXekWAPj30g3bMDCDrz5uBQLCvlVneA= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=cSsts8rq; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="cSsts8rq" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802892; x=1796338892; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Hwjod13ydyBeyAl1Bc0MaWee5egwZS7IehFiRUr+3EU=; + b=cSsts8rq9lESYUplMXqyaf7fQNdZgkgjFqCazxZIqivu0ulnrxBxtLfr + 
2q49FeXJtEzQZUFodeAzsWSFeSbbR0eNrEPCzAiJg3hLVd3plskFuoc8R + LSKLX41Wp9fMgp9Ou54k2TxPn+ZJpABPQDMRZBxyysFrDh3CB41EwtGEs + RrfwNP72MRObV0Rpqk7QGgKlk2FmXjIY1nC71X0MFH6YEKKSRhWDNHOyK + 9xcJGzOrMyQT5S0kQJJP+Yjr1dE5itsHoR0sqlWiS8N54X7izsEc5kZbZ + a2UxxHPNluXsMUFiW8C3sWBY39nJzoHIE5rPFYFCFz7BLdiv2vnTIfuTx + g==; +X-CSE-ConnectionGUID: +jOJhU2XTqKlvSAAbvSNZg== +X-CSE-MsgGUID: bWKw4Hx3R3mw3p0kqP8M9w== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136361" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136361" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:31 -0800 +X-CSE-ConnectionGUID: fNxL8O0TTpG29riKu1HOzA== +X-CSE-MsgGUID: yN1bkFJBSRe3C9XnLxsEJA== +X-Ironport-Invalid-End-Of-Message: True +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763802" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:31 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 09/23] sched/cache: Count tasks prefering destination LLC in a sched group +Date: Wed, 3 Dec 2025 15:07:28 -0800 +Message-Id: <1eb6a231ec82b37483208983f0cf10eec823ec9d.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, tabulate the number of tasks on each runqueue +that prefer the LLC contains the env->dst_cpu in a sched group. + +For example, consider a system with 4 LLC sched groups (LLC0 to LLC3) +balancing towards LLC3. LLC0 has 3 tasks preferring LLC3, LLC1 has +2, and LLC2 has 1. LLC0, having the most tasks preferring LLC3, is +selected as the busiest source to pick tasks from. + +Within a source LLC, the total number of tasks preferring a destination +LLC is computed by summing counts across all CPUs in that LLC. For +instance, if LLC0 has CPU0 with 2 tasks and CPU1 with 1 task preferring +LLC3, the total for LLC0 is 3. + +These statistics allow the load balancer to choose tasks from source +sched groups that best match their preferred LLCs. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Convert nr_pref_llc array in sg_lb_stats to a single + variable as only the dst LLC stat is needed. 
+ (K Prateek Nayak) + + kernel/sched/fair.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b0e87616e377..4d7803f69a74 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10445,6 +10445,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc; ++#endif + }; + + /* +@@ -10912,6 +10915,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, + { + int i, nr_running, local_group, sd_flags = env->sd->flags; + bool balancing_at_rd = !env->sd->parent; ++#ifdef CONFIG_SCHED_CACHE ++ int dst_llc = llc_id(env->dst_cpu); ++#endif + + memset(sgs, 0, sizeof(*sgs)); + +@@ -10932,6 +10938,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled() && llc_id(i) != dst_llc && ++ dst_llc >= 0) ++ sgs->nr_pref_llc += rq->nr_pref_llc[dst_llc]; ++#endif ++ + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-10-23-sched-cache-Check-local_group-only-once-in-update_sg_lb_stats.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-10-23-sched-cache-Check-local_group-only-once-in-update_sg_lb_stats.patch.skip new file mode 100644 index 0000000..0bb5b14 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-10-23-sched-cache-Check-local_group-only-once-in-update_sg_lb_stats.patch.skip @@ -0,0 +1,142 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3209E2EDD45 + for ; Wed, 3 Dec 2025 23:01:34 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802897; cv=none; b=AN9K8aiWJQG7HDbeaWXpGDetIW2icpqGbDr6zs/psxf+4ZLm2ceitwFSdlkxUNnHO69aqE5S3Lgw8UXlsXoedmM4Pr7i5RbMpn7L1KrlbpjXV6xeAEYh8XRvFtihZU5ev2z3gpc9wUtfTNoORHKd7LfpH7/RywEIWMBBa/DRGKQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802897; c=relaxed/simple; + bh=p+3h65+/r+G8M/UVKx3C3o18pTa5Qaadr44RFr//JJM=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=MmMfghNG3eQEQnrI1wgmAlkBPcwScfTCOYIB2L9oD0PhxTEQvycV+raEGlUU7tq/cOm1m41tgx1zgYVTnsY1VCpNGnM6slJtSvukwWoNbVbq6sVz9SyOM9hVO35VnfPEJ/kFPYJD7nSsZDAVCSBbwe4MWGUKumJjlC3jPA1Gp5w= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=oK3XGSFi; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="oK3XGSFi" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802894; x=1796338894; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=p+3h65+/r+G8M/UVKx3C3o18pTa5Qaadr44RFr//JJM=; + b=oK3XGSFi2bGDDnHY3Lou8C7HjUQfAlxc1xp5Jsb4tWssOTetEyKk8VhS + xWt++svfjbe9DJCu7kK8NB54Iyuv23cDcsruzAVgtKiHf34SlRWKEmzrW + D+oCFG7YN+VzH5prFgSppmI032uc/cJAJ/qAKAOk+5EqFUqWcIySUNujp + dnKCK0NZsBYY0rnhzU9NpLtzRd0sgBD+P+q/gVsngGR9F8P7Ojt0z+4k+ + FNbn0vTsTTr/tR3CHEUKYnt1XKHxIQth0oKpXgg30ClUCUHrWShO5n1wq + sHaXMI4sp88m3bKftZXPxnzsOaTk5Sy2iUOBeydtIg4kqCpHbvNeeio00 + A==; +X-CSE-ConnectionGUID: eUWbdnCGTbS8UdiOjQqeaw== +X-CSE-MsgGUID: M8ATD04uQSWlmkQNqjCJ+A== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136382" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136382" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:33 -0800 +X-CSE-ConnectionGUID: +B4a0CVGS5aDMise1kmgcw== +X-CSE-MsgGUID: mYFfuf8aQyCTLl73SGkFmQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763810" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:33 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 10/23] sched/cache: Check local_group only once in update_sg_lb_stats() +Date: Wed, 3 Dec 2025 15:07:29 -0800 +Message-Id: <2581fa14a0083bbd22b50837cd86003e59192c00.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +There is no need to check the local group twice for both group_asym_packing +and group_smt_balance. Adjust the code to facilitate future checks for group +types (cache-aware load balancing) as well. + +No functional changes are expected. + +Suggested-by: Peter Zijlstra (Intel) +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + New code cleanup patch. 
(Peter Zijlstra) + + kernel/sched/fair.c | 18 ++++++++++-------- + 1 file changed, 10 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4d7803f69a74..6e4c1ae1bdda 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10984,14 +10984,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_weight = group->group_weight; + +- /* Check if dst CPU is idle and preferred to this group */ +- if (!local_group && env->idle && sgs->sum_h_nr_running && +- sched_group_asym(env, sgs, group)) +- sgs->group_asym_packing = 1; +- +- /* Check for loaded SMT group to be balanced to dst CPU */ +- if (!local_group && smt_balance(env, sgs, group)) +- sgs->group_smt_balance = 1; ++ if (!local_group) { ++ /* Check if dst CPU is idle and preferred to this group */ ++ if (env->idle && sgs->sum_h_nr_running && ++ sched_group_asym(env, sgs, group)) ++ sgs->group_asym_packing = 1; ++ ++ /* Check for loaded SMT group to be balanced to dst CPU */ ++ if (smt_balance(env, sgs, group)) ++ sgs->group_smt_balance = 1; ++ } + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-11-23-sched-cache-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-11-23-sched-cache-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip new file mode 100644 index 0000000..0f73957 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-11-23-sched-cache-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip @@ -0,0 +1,276 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C1A762F6160 + for ; Wed, 3 Dec 2025 23:01:36 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802899; cv=none; b=Y+KSvOIGNo37S4ppd6Zqb+qeXMYg9H7oOVVSwDUONcSmPmmNo+OfFtkUVVLhQy9Kszncjru9WbcIa9UEetZqhMPsmMY2k5fVZ6RAWQZpLFm3o5ZOTcH4gt2vkBWUME5YgLQA3NYdBf+3LQy/lgsvGtAErx6vO+QUxr5PuBX7rAE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802899; c=relaxed/simple; + bh=CwrhaA/K9sEcx5ifxeMnRiF7w0oKVkh5kmhIRkZCI08=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=oku/UCNYCKxtHDLcs7jWCa04/T613otu/fvMOx46pM5Fk461C8jmF88SnvfkaEKbY/tPKG6ssSj+6jJ5qq4aFqOkczxx9qajmomVw1d15n0Nxc/H0Jxmj7YmItsjsTy0cRx3h5fJ6U2M4vg8NuLnjq+H/GqT2czhMHBhUawwwc0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=NQQGq9b9; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="NQQGq9b9" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802896; x=1796338896; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=CwrhaA/K9sEcx5ifxeMnRiF7w0oKVkh5kmhIRkZCI08=; + b=NQQGq9b9vgYmeBpCZdLnkhTURcvp8LDZjz79tucf4QAOjRH6WMoJ7DIc + VCEpH4PZk5+dZi9trvIpapAwsuwYkQegVq+/LDqHzSrIt129SaHxgL94Y + 5nrvAHUr0MUD5UNXllanE0V0Fykum1uE2UTQDl3LnIDioTcTzOYpAO1X1 + 4qycYWShsJLluL7efSyQ+/SgISKYo/HIyxL8OBYx1D4XH6mSLaqEpIaiX + g8GbNG2ofsWe9Fe2YAYpsC9b78PtUUg4W2Vm4/GWu3tuk8/oeCtghHVCm + rv/mHq9+NoDA+NgB2cghgRnsU5NYvBkjZ9v38NvuhidP8frlEkqZR1gb1 + Q==; +X-CSE-ConnectionGUID: p32r4lRGQkiQ0lJdqe2vYQ== +X-CSE-MsgGUID: Aoa+xDJNReuNdKo3ZUU7Lw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136420" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136420" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:36 -0800 +X-CSE-ConnectionGUID: bwFbXM5NTD2aXs2HmVfWRA== +X-CSE-MsgGUID: oV/d19IyQLihLj6NBNyYIA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763827" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:35 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 11/23] sched/cache: Prioritize tasks preferring destination LLC during balancing +Date: Wed, 3 Dec 2025 15:07:30 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, first check for tasks that prefer the +destination LLC and balance them to it before others. + +Mark source sched groups containing tasks preferring non local LLCs +with the group_llc_balance flag. This ensures the load balancer later +pulls or pushes these tasks toward their preferred LLCs. + +The load balancer selects the busiest sched_group and migrates tasks +to less busy groups to distribute load across CPUs. + +With cache-aware scheduling enabled, the busiest sched_group is +the one with most tasks preferring the destination LLC. If +the group has the llc_balance flag set, cache aware load balancing is +triggered. + +Introduce the helper function update_llc_busiest() to identify the +sched_group with the most tasks preferring the destination LLC. + +Suggested-by: K Prateek Nayak +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Fix comparison in can_migrate_llc(), which uses an uninitialized + env->src_cpu. Use the candidate group's first CPU instead. (Aaron Lu) + + Fix a race condition during bootup with build_sched_domains(), + where the per-cpu(sd_llc_id) is reset to -1. (lkp/0day) + Put the set of group_llc_balance and the usage of it into + 1 patch. (Peter Zijlstra) + + Change group_llc_balance priority to be lower than group_overloaded + and embed it into normal load balance path. 
(Peter Zijlstra) + + Remove the sched group's SD_SHARE_LLC check in llc_balance(), because + we should allow tasks migration across NUMA nodes to their preferred LLC, + where the domain does not have SD_SHARE_LLC flag. + + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6e4c1ae1bdda..db555c11b5b8 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9531,6 +9531,11 @@ enum group_type { + * from balancing the load across the system. + */ + group_imbalanced, ++ /* ++ * There are tasks running on non-preferred LLC, possible to move ++ * them to their preferred LLC without creating too much imbalance. ++ */ ++ group_llc_balance, + /* + * The CPU is overloaded and can't provide expected CPU cycles to all + * tasks. +@@ -10440,6 +10445,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10698,6 +10704,9 @@ group_type group_classify(unsigned int imbalance_pct, + if (group_is_overloaded(imbalance_pct, sgs)) + return group_overloaded; + ++ if (sgs->group_llc_balance) ++ return group_llc_balance; ++ + if (sg_imbalanced(group)) + return group_imbalanced; + +@@ -10890,11 +10899,55 @@ static void record_sg_llc_stats(struct lb_env *env, + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); + } ++ ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. ++ */ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ if (sgs->nr_pref_llc && ++ can_migrate_llc(cpumask_first(sched_group_span(group)), ++ env->dst_cpu, 0, true) == mig_llc) ++ return true; ++ ++ return false; ++} ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. 
++ */ ++ return sgs->nr_pref_llc > busiest->nr_pref_llc; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + } ++ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + /** +@@ -10993,6 +11046,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, + /* Check for loaded SMT group to be balanced to dst CPU */ + if (smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; + } + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); +@@ -11056,6 +11113,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, + /* Select the overloaded group with highest avg_load. */ + return sgs->avg_load > busiest->avg_load; + ++ case group_llc_balance: ++ /* Select the group with most tasks preferring dst LLC */ ++ return update_llc_busiest(env, busiest, sgs); ++ + case group_imbalanced: + /* + * Select the 1st imbalanced group as we don't have any way to +@@ -11318,6 +11379,7 @@ static bool update_pick_idlest(struct sched_group *idlest, + return false; + break; + ++ case group_llc_balance: + case group_imbalanced: + case group_asym_packing: + case group_smt_balance: +@@ -11450,6 +11512,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int + return NULL; + break; + ++ case group_llc_balance: + case group_imbalanced: + case group_asym_packing: + case group_smt_balance: +@@ -11949,7 +12012,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + * group's child domain. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_type == group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-12-23-sched-cache-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-12-23-sched-cache-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip new file mode 100644 index 0000000..b5197ec --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-12-23-sched-cache-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip @@ -0,0 +1,191 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8446B2FCBE3 + for ; Wed, 3 Dec 2025 23:01:38 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802901; cv=none; b=AczUsF+ErJIRlmzmMhdwLmi7ZupDah78/dCkfXKoZGQ3XVlhu9qwGaFYSDg3FFQU9754xRJEORkGrcVZU1ssicX++R+V0FXfSTdSUEZWfvt980XcoUhlWnK7J8un6y7YNQXxJBfZVrhj31WyccQPJJevDmK67sgqqF6PsKk29mc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802901; c=relaxed/simple; + bh=da90OiAHbhR9NPA8Ratl9FUXidYv15t1ql0bzkXvmzA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=hCA0ZezNljYOVjtzlpNDPYqpoGKoW7yU4ihuYN4DdplXI8ZjqyOysntDUcfzbne+6CzBonX2R+LOUwUNh5V4ZvlW0NEG+WGaT266Gr89t7EmmUAyb0SQ4i4NDSbCHrELFwlVL45n3XsDuBwIKNxjYMRKZj90lzt9XJuGVK0hJpE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=E/oDxO7e; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="E/oDxO7e" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802898; x=1796338898; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=da90OiAHbhR9NPA8Ratl9FUXidYv15t1ql0bzkXvmzA=; + b=E/oDxO7ef4nI4G5J3jOvjR+X/vFua+P9e3AZXKLJcxFJriNr7Ua944xG + AxkcNTluTudW0fa7LiL2oLSyXQGNm4wxTedztXy+Kb3GNW3m1xItQPgjY + yaKpw+/5zQcwTUlI7cSSe2yq6pGi70PjZnOQeUYqx+6LdidqnzQeT9x0d + oKfUVrBxLwV+bxjJ5X7pfb+amTWF/9P1/Z2cwQnN4MgR4+xZfJ/oQETi0 + OhZkv30WMo989iIGaDW9QOVZENXrnIYuSR0poLGwGoz4vGxEA6oadIK33 + rSOZLBiBoM9ORQbnZoVJ4AxudF9GCXu3fDkCd/li1EhJxcKamQHTatJeP + g==; +X-CSE-ConnectionGUID: Ktrog9qIS3GVBMh0FikIKg== +X-CSE-MsgGUID: wFdH+7CRS02fjEaxSZiyog== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136444" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136444" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 
15:01:37 -0800 +X-CSE-ConnectionGUID: 6HowfZdBQD20KHd0gzJbtg== +X-CSE-MsgGUID: 6WhOzrMuS8+5P3U6JQUdyA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763835" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:37 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 12/23] sched/cache: Add migrate_llc_task migration type for cache-aware balancing +Date: Wed, 3 Dec 2025 15:07:31 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce a new migration type, migrate_llc_task, to support +cache-aware load balancing. + +After identifying the busiest sched_group (having the most tasks +preferring the destination LLC), mark migrations with this type. +During load balancing, each runqueue in the busiest sched_group is +examined, and the runqueue with the highest number of tasks preferring +the destination CPU is selected as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Remove unnecessary cpus_share_cache() check in + sched_balance_find_src_rq() (K Prateek Nayak) + + kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index db555c11b5b8..529adf342ce0 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9547,7 +9547,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10134,6 +10135,10 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11766,6 +11771,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_type == group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12073,6 +12087,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12181,6 +12199,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_id(env->dst_cpu); ++ if (dst_llc >= 0 && ++ busiest_pref_llc < 
rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; +@@ -12363,6 +12391,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-13-23-sched-cache-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-13-23-sched-cache-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip new file mode 100644 index 0000000..35fef7d --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-13-23-sched-cache-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip @@ -0,0 +1,195 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 97BB42FFF98 + for ; Wed, 3 Dec 2025 23:01:39 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802903; cv=none; b=KDFpHcGAhKnHRBZFMFMtHMoRnhc4icrwIxIA8u+Vif5oz7Z18LHjkzu1IOV8tRYJFy4lXDjG6wYe22JV6BPtT9JAf2mUHKRyigHv1MkoPNBeRIKSEJ51iH0zebfyiiIhyx46QCps5MkfKG9xVMGg3N7ENza6Vv2+y6dsL+Zp0lE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802903; c=relaxed/simple; + bh=8SM2jHHpi12dQS+zJornGRPQxkuowwvNXMVhwIeDBGA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=U/JdvWoJw/IU3s2ub70NWLePIaQBRwHwPYibO+bbRJhw5I3xBFJgWmgkN/HfBIb1ABZRWNcUN5ladx9wdRE4q84V9sG4/k/92/pAoHRgP60/SkA1N0lBh+0oNDDaOMmcaJymNEYAB4Y+PlNTanSz07u82e6zrOmPcftrMVq0eQg= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=QFrFmdLP; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="QFrFmdLP" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802900; x=1796338900; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=8SM2jHHpi12dQS+zJornGRPQxkuowwvNXMVhwIeDBGA=; + b=QFrFmdLPstihmD8vLzP896hsrOed6TFf664ZbLxgCKDyVP1ElFu/KxlL + cWka8HAx7lSbtKJIRs2zDLb662V+u3vSkOL/+GmAmBZOGy6YahHgzdZ+w + Cm8JPiAUQ0kzPS2n/rAw++vW0A14d5QX1S2PZ0RvAxgtjOMIEQght4vtw + NlNGyMxSykwrfzzHo/Khc6YFVxKydWs7zQdFb7hjDddawl3rivgSTQ4lM + rXsDbUmw/L0HUCnUtshRY/GabXqs3gMSK3t3UfCRyfscjIhW5T7A4/xG6 + Ul+07Ph3CpTYgJ6hsHVxiRy1rZKIhjL1V7FiHZTJQ8OxeBn2eVIqDAWXn + g==; +X-CSE-ConnectionGUID: Ew56K6WcQq6rz+G+xWa07Q== +X-CSE-MsgGUID: UdKIolArSn+zoD5IfTBGAQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136469" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136469" +Received: from 
fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:39 -0800 +X-CSE-ConnectionGUID: 2oDVFmE5Rvu2YRtXhtrPTg== +X-CSE-MsgGUID: oS68+IQVQH2bxeYLezigJQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763850" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:38 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 13/23] sched/cache: Handle moving single tasks to/from their preferred LLC +Date: Wed, 3 Dec 2025 15:07:32 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If the busiest runqueue has only one task, active balancing may be +invoked to move it. However, before migration, check whether the task +is running on its preferred LLC. + +Do not move a lone task to another LLC if it would move the task +away from its preferred LLC or cause excessive imbalance between LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Remove uneeded preferred LLC migration check from + active_load_balance_cpu_stop(). + + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 50 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 529adf342ce0..aed3fab98d7c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9878,12 +9878,57 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + task_util(p), to_pref); + } + ++/* ++ * Check if active load balance breaks LLC locality in ++ * terms of cache aware load balance. ++ */ ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return false; ++ /* ++ * All tasks prefer to stay on their current CPU. ++ * Do not pull a task from its preferred CPU if: ++ * 1. It is the only task running there; OR ++ * 2. Migrating it away from its preferred LLC would violate ++ * the cache-aware scheduling policy. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { ++ unsigned long util = 0; ++ struct task_struct *cur; ++ ++ if (env->src_rq->nr_running <= 1) ++ return true; ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(env->src_rq->curr); ++ if (cur) ++ util = task_util(cur); ++ rcu_read_unlock(); ++ ++ if (can_migrate_llc(env->src_cpu, env->dst_cpu, ++ util, false) == mig_forbid) ++ return true; ++ } ++ ++ return false; ++} + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) + { + return false; + } ++ ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return false; ++} + #endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
+@@ -12279,6 +12324,9 @@ static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12298,7 +12346,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-14-23-sched-cache-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-14-23-sched-cache-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip new file mode 100644 index 0000000..49a9124 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-14-23-sched-cache-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip @@ -0,0 +1,206 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C1938F513 + for ; Wed, 3 Dec 2025 23:01:41 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802905; cv=none; b=sPmV7aM8SfneES++JSxoAMTpkJxsxkIaVzLucunnA9mKqP6A+4Tm600kyT9VTXTzXq34T39lXTUp9sHWoERIl8w+bTu7J1HC+rfyTlXxwEVQV8C99GFpkkbN1BPFHILnrVb4xczJGDnWK5dD50Ye9FIBTMyihvIerGvjfEsmqNE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802905; c=relaxed/simple; + bh=lHL2pgABc7GHr6ACmg9H32RJUswizn6AHQFobrHrbFw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=G7rXkqjakmupf9n++e5JGAkMIXq3jqgQc6G6Gw5IYyY/VhHNnlVMfdVNOcDomPtYPBMavf9m7Y2bsSMUvQExqTt6CASUZ8aGZ8iX+XoR/Ej28b5EwCnggenbKxXL4Xj0/E38v+KIJD/T8MnOLbFEeGjSREtAQxxgu/2prdjZMw8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=FiUlG+0K; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="FiUlG+0K" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802901; x=1796338901; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=lHL2pgABc7GHr6ACmg9H32RJUswizn6AHQFobrHrbFw=; + b=FiUlG+0K/UC9vVMh/oPWl1WUBZdhy5MrB44PaaHkXUAA4jYHkLTFSSsi + qocTAQQFuheK8JLYpFg2R7aU2iv4GZRGXge93BEc9kS9nTpx4oQOMWekm + +vXMxJj28JhCGkxAcIYAkVQvbks0I4+snX/or9+O6+kLtJoq4VW98lvHt + gsZRnKPvbTAbfB8BLT4mfbZqijYwb7I27I0TW2bqZx35wIeRxh9EeBFyi + ROuei6K/cuomwGMaKK20uTZT8/nP1CIoBiGImBAQQNhK7Hgo6jMMsFX23 + lLTcZHF+7w8PBbIBEKU+iwv08wqwC5Czno4lf4DE3GutioUzRJHIw2uIq + A==; +X-CSE-ConnectionGUID: LC9AWrvJQPSiwhGuWRZBoA== +X-CSE-MsgGUID: vAm1J3bzStCyct17U9yt9Q== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136497" +X-IronPort-AV: 
E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136497" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:40 -0800 +X-CSE-ConnectionGUID: pVhkYbuLQ6qhXibAglfMuQ== +X-CSE-MsgGUID: 3iy0SCYQQeWYaEaxUD8biw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763859" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:40 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 14/23] sched/cache: Consider LLC preference when selecting tasks for load balancing +Date: Wed, 3 Dec 2025 15:07:33 -0800 +Message-Id: <048601436d24f19e84c0a002e1c5897f95853276.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Currently, task selection from the busiest runqueue ignores LLC +preferences. Reorder tasks in the busiest queue to prioritize selection +as follows: + + 1. Tasks preferring the destination CPU's LLC + 2. Tasks with no LLC preference + 3. Tasks preferring an LLC different from their current one + 4. Tasks preferring the LLC they are currently on + +This improves the likelihood that tasks are migrated to their +preferred LLC. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: No change. + + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index aed3fab98d7c..dd09a816670e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10092,6 +10092,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. 
++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_cache_enabled()) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". +@@ -10100,7 +10162,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10119,6 +10181,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-15-23-sched-cache-Respect-LLC-preference-in-task-migration-and-detach.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-15-23-sched-cache-Respect-LLC-preference-in-task-migration-and-detach.patch.skip new file mode 100644 index 0000000..85bcc3b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-15-23-sched-cache-Respect-LLC-preference-in-task-migration-and-detach.patch.skip @@ -0,0 +1,251 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id F15862EC0B3 + for ; Wed, 3 Dec 2025 23:01:43 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802907; cv=none; b=gaatxX9hyfNCQNZuo8e4RU3vaqRhxVWET62DnEKpixJNU5xDEVuougssJt9/6wdKqXoIUOBKaKYsQEEI9+soes2dovmZhy3fGDXwD4VJshA6aArNO/9BRtmRmrSUH+Qeb4uxqCx6TiODM+aPCVtCEwIA755BalFPfrmj7+qULOI= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802907; c=relaxed/simple; + bh=Qq+bxGUfP5y5uzFrPweEIf2ig+fLfO0Fva+8tsaaHnM=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=PMemmZdPDG2ErK7Z6ebePwKSI9cabjQRZi7fOaAynPsVbH0TYAxCQkgG7kmEu1N/+0Kmoqb2iEytzk5b6Y83O56eTuw4wsJTpcQbn5OA5nrv8fwKgYRvMuPqwTWStSC5o/clmWh6Un/rG7VXFCAXnoxf+tadmloUwr1ceD4Iuek= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=K1c6F2rL; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="K1c6F2rL" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802904; x=1796338904; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Qq+bxGUfP5y5uzFrPweEIf2ig+fLfO0Fva+8tsaaHnM=; + b=K1c6F2rLIDMugmXFGo0VPRa3CkwpTWx9IJrRa/hsq4UrL7DnV0pw8ajG + BaGeCuW4iC0q3KpRjUrb5Gjs2+rOB74bBmgvjzvP0Bgae0TPuFdvMjX23 + z6+gGGgG19Wv4ve1vRjEwTT08BRcUINH2YNXiTUVgX6ibcCJComlk0Y6n + quNDMVfwdU0hQZhwOtrSHXPRqMojx8I7m9WQ/PmD1woe8uT6yci0V4u2u + jfnFFUMEbPvj3J6FUSZjuQwGSGo/EqXqp0xk/5KRyXKafHJF8xEhV/udJ + e4v9JDT09EYShziT4Bzd1zuoH2hhzYHA7OeJFLCwdgppCCBWwVA2w3KmJ + w==; +X-CSE-ConnectionGUID: 9y9zHDIITpm9FBwYNTu03A== +X-CSE-MsgGUID: Qrhpktr7Tg2wRc89JHsg/g== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136537" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136537" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:42 -0800 +X-CSE-ConnectionGUID: JansXFpeT5WbVZuKS0jHBA== +X-CSE-MsgGUID: NjtVIxeZSgSD5Yg4dtwXow== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763888" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:42 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 15/23] sched/cache: Respect LLC preference in task migration and detach +Date: Wed, 3 Dec 2025 15:07:34 -0800 +Message-Id: <1c75f54a2e259737eb9b15c98a5c1d1f142fdef6.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During the final step of load balancing, can_migrate_task() now +considers a task's LLC preference before moving it out of its +preferred LLC. + +Additionally, add checks in detach_tasks() to prevent selecting tasks +that prefer their current LLC. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Leave out tasks under core scheduling from the cache aware + load balance. 
(K Prateek Nayak) + + Reduce the degree of honoring preferred_llc in detach_tasks(). + If certain conditions are met, stop migrating tasks that prefer + their current LLC and instead continue load balancing from other + busiest runqueues. (K Prateek Nayak) + + kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 13 +++++++++ + 2 files changed, 74 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index dd09a816670e..580a967efdac 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9852,8 +9852,8 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + * Check if task p can migrate from source LLC to + * destination LLC in terms of cache aware load balance. + */ +-static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, +- struct task_struct *p) ++static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) + { + struct mm_struct *mm; + bool to_pref; +@@ -10025,6 +10025,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled() && ++ can_migrate_llc_task(env->src_cpu, env->dst_cpu, p) == mig_forbid && ++ !task_has_sched_core(p)) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10146,12 +10153,55 @@ static struct list_head + list_splice(&pref_old_llc, tasks); + return tasks; + } ++ ++static bool stop_migrate_src_rq(struct task_struct *p, ++ struct lb_env *env, ++ int detached) ++{ ++ if (!sched_cache_enabled() || p->preferred_llc == -1 || ++ cpus_share_cache(env->src_cpu, env->dst_cpu) || ++ env->sd->nr_balance_failed) ++ return false; ++ ++ /* ++ * Stop migration for the src_rq and pull from a ++ * different busy runqueue in the following cases: ++ * ++ * 1. Trying to migrate task to its preferred ++ * LLC, but the chosen task does not prefer dest ++ * LLC - case 3 in order_tasks_by_llc(). This violates ++ * the goal of migrate_llc_task. However, we should ++ * stop detaching only if some tasks have been detached ++ * and the imbalance has been mitigated. ++ * ++ * 2. Don't detach more tasks if the remaining tasks want ++ * to stay. We know the remaining tasks all prefer the ++ * current LLC, because after order_tasks_by_llc(), the ++ * tasks that prefer the current LLC are the least favored ++ * candidates to be migrated out. ++ */ ++ if (env->migration_type == migrate_llc_task && ++ detached && llc_id(env->dst_cpu) != p->preferred_llc) ++ return true; ++ ++ if (llc_id(env->src_cpu) == p->preferred_llc) ++ return true; ++ ++ return false; ++} + #else + static inline struct list_head + *order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) + { + return tasks; + } ++ ++static bool stop_migrate_src_rq(struct task_struct *p, ++ struct lb_env *env, ++ int detached) ++{ ++ return false; ++} + #endif + + /* +@@ -10205,6 +10255,15 @@ static int detach_tasks(struct lb_env *env) + + p = list_last_entry(tasks, struct task_struct, se.group_node); + ++ /* ++ * Check if detaching current src_rq should be stopped, because ++ * doing so would break cache aware load balance. If we stop ++ * here, the env->flags has LBF_ALL_PINNED, which would cause ++ * the load balance to pull from another busy runqueue. 
++ */ ++ if (stop_migrate_src_rq(p, env, detached)) ++ break; ++ + if (!can_migrate_task(p, env)) + goto next; + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 8f2a779825e4..40798a06e058 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1485,6 +1485,14 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); + extern void sched_core_get(void); + extern void sched_core_put(void); + ++static inline bool task_has_sched_core(struct task_struct *p) ++{ ++ if (sched_core_disabled()) ++ return false; ++ ++ return !!p->core_cookie; ++} ++ + #else /* !CONFIG_SCHED_CORE: */ + + static inline bool sched_core_enabled(struct rq *rq) +@@ -1524,6 +1532,11 @@ static inline bool sched_group_cookie_match(struct rq *rq, + return true; + } + ++static inline bool task_has_sched_core(struct task_struct *p) ++{ ++ return false; ++} ++ + #endif /* !CONFIG_SCHED_CORE */ + + #ifdef CONFIG_RT_GROUP_SCHED +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-16-23-sched-cache-Introduce-sched_cache_present-to-enable-cache-aware-scheduling-for-multi-LLCs-NUMA-node.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-16-23-sched-cache-Introduce-sched_cache_present-to-enable-cache-aware-scheduling-for-multi-LLCs-NUMA-node.patch.skip new file mode 100644 index 0000000..31f6859 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-16-23-sched-cache-Introduce-sched_cache_present-to-enable-cache-aware-scheduling-for-multi-LLCs-NUMA-node.patch.skip @@ -0,0 +1,192 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6E3AB2EFDAD + for ; Wed, 3 Dec 2025 23:01:44 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802907; cv=none; b=DqEqfXaSW0ZZxydgpHnr//9Y+r8Kz4ipcj+CchWbORZ48RCt17FQ2DquLW8sfqca/x+abOrEYIPaq71/GVkzdhR5YktmlcdFPno7ta7IuxETAlghruG+YXcsfmrH3WvfypIFBRxcIK9G7zQ7Meao90BbtEmbg2ZH1AORZqaQMHw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802907; c=relaxed/simple; + bh=UY2I5n5Zb5eoLU5mFytvpnggFlTCSd5WOZCBICo1NK0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ss76k4YY8rB/Z6uAGDFyQbUZ7bARhHHFMR8yOxKyMTjDj6HDUJk3fTrjyBpd8eZwWLWJd6uE+i5j5z2Y9c/kkgK7AnD0FSS5RcyHMwddwez0X8IBpyAwZBkh9Vkri2qy0caEGEQrs66nsLD9/pRtuqh/ensvo0F7AVsRu2xo2+M= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=b/WFlJ1d; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="b/WFlJ1d" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802904; x=1796338904; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=UY2I5n5Zb5eoLU5mFytvpnggFlTCSd5WOZCBICo1NK0=; + 
b=b/WFlJ1dlftZ7EAiu5bb8CTSjdtBeseHX8isQ4Wht5vD1dxWm6RURFOT + R1B3Vg98GKNKQd2LzX3IPnNH9KdzkcCltvIyuRjvzvHEAhFOFxsI/nNCA + UEadn+0Fte3u19UFuKUeR+zfOfQY/nrc24OBpPT4wpQKXE96Ne4Zzhez9 + CGKthr3Nhi0su6EqgFcgXSic3+e2vAZwxOJETpVdCkTcXOxPoH3AQRibc + 89EqfPOQ7c13HxarJn7Y8fuv5oRcK9m2z4cMXZ93jLuPQkW6wM0YzTFzA + la772T94DglzvBNsM6aU73BVVoFLW1MUMY65Xa6wwGE8bwa6iEUdQtZCN + w==; +X-CSE-ConnectionGUID: hV5QNWsDRNeWT6DD4+0r9w== +X-CSE-MsgGUID: zyp/cB/OQI6PxkSBENXT7w== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136566" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136566" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:44 -0800 +X-CSE-ConnectionGUID: iafqWAoBQZGMBBV/tLdIww== +X-CSE-MsgGUID: S+YoPmfDSRiUGxMt3Qjmgw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763904" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:43 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org, + Libo Chen +Subject: [PATCH v2 16/23] sched/cache: Introduce sched_cache_present to enable cache aware scheduling for multi LLCs NUMA node +Date: Wed, 3 Dec 2025 15:07:35 -0800 +Message-Id: <7453e3f901878608959f23dacaa36dfc0432c05b.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Cache-aware load balancing should only be enabled if there are more +than 1 LLCs within 1 NUMA node. sched_cache_present is introduced to +indicate whether this platform supports this topology. + +Suggested-by: Libo Chen +Suggested-by: Adam Li +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Use flag sched_cache_present to indicate whether a platform + supports cache aware scheduling. Change this flag from staic key. + There should be only 1 static key to control the cache aware + scheduling. 
(Peter Zijlstra) + + kernel/sched/topology.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index d583399fc6a1..9799e3a9a609 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -24,6 +24,8 @@ int max_llcs; + + #ifdef CONFIG_SCHED_CACHE + ++static bool sched_cache_present; ++ + static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc) + { + unsigned int *new = NULL; +@@ -54,7 +56,7 @@ static void populate_new_pref_llcs(unsigned int *old, unsigned int *new) + new[i] = old[i]; + } + +-static int resize_llc_pref(void) ++static int resize_llc_pref(bool has_multi_llcs) + { + unsigned int *__percpu *tmp_llc_pref; + int i, ret = 0; +@@ -102,6 +104,11 @@ static int resize_llc_pref(void) + rq_unlock_irqrestore(rq, &rf); + } + ++ if (has_multi_llcs) { ++ sched_cache_present = true; ++ pr_info_once("Cache aware load balance is enabled on the platform.\n"); ++ } ++ + release_old: + /* + * Load balance is done under rcu_lock. +@@ -124,7 +131,7 @@ static int resize_llc_pref(void) + + #else + +-static int resize_llc_pref(void) ++static int resize_llc_pref(bool has_multi_llcs) + { + max_llcs = new_max_llcs; + return 0; +@@ -2644,6 +2651,7 @@ static int + build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) + { + enum s_alloc alloc_state = sa_none; ++ bool has_multi_llcs = false; + struct sched_domain *sd; + struct s_data d; + struct rq *rq = NULL; +@@ -2736,10 +2744,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + * between LLCs and memory channels. + */ + nr_llcs = sd->span_weight / child->span_weight; +- if (nr_llcs == 1) ++ if (nr_llcs == 1) { + imb = sd->span_weight >> 3; +- else ++ } else { + imb = nr_llcs; ++ has_multi_llcs = true; ++ } + imb = max(1U, imb); + sd->imb_numa_nr = imb; + +@@ -2787,7 +2797,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + +- resize_llc_pref(); ++ resize_llc_pref(has_multi_llcs); + + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-17-23-sched-cache-Record-the-number-of-active-threads-per-process-for-cache-aware-scheduling.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-17-23-sched-cache-Record-the-number-of-active-threads-per-process-for-cache-aware-scheduling.patch.skip new file mode 100644 index 0000000..c27b26e --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-17-23-sched-cache-Record-the-number-of-active-threads-per-process-for-cache-aware-scheduling.patch.skip @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2CA91309EF4 + for ; Wed, 3 Dec 2025 23:01:46 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802908; cv=none; b=nzlhLORShQGH6z2OKPCwgPj3fFYQBq0S4kjlB8PdpAMAbRvUDKx69/o9oLg1lRga1/7uLzN7ZJmwClhqm7REccEFVBXjMxnF8O6F1qeXlUxSc5j6wsPAdvgE25W54gtIVxKBjQRnZDVLeIGtXbaxk29EoCqp7pm1fCpS1IY7jQo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + 
s=arc-20240116; t=1764802908; c=relaxed/simple; + bh=7G8GAR73tqFcdrEyXVcfBaeUwRwA82VAe47pEbdUV2w=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Jh1NMZniFEQvMeyAac4yMWESOURMqAUIKW5GcomnPyFPuACvinoSr0dUF9HnUWSFLODn+/4wiWm4ySl8YKMzKSgIL7OQSmo169aanmL/sbmdbfeduyjfscZaBGqL5cQYK99GiDZLKPt44QcYP3KC0gclEaC+Rkd8OiTRxeMU500= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=E0yq8JMN; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="E0yq8JMN" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802906; x=1796338906; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=7G8GAR73tqFcdrEyXVcfBaeUwRwA82VAe47pEbdUV2w=; + b=E0yq8JMN3sNhZ58s1b5iZ/cpqNuM9N0pDevJEvrPce0R2mUndVkmGScN + McHDjEQAdkFny/+9qg6ANdvlFmYlDA/4TibC4Yz5kBPZKGiM/VEgmSwNx + Wv+0fExbPAqEqTORsnJ61vyIc7KAkoB0P/ug+G27y1gOBAwA36EGLI/OA + /yCpUK6WyND+MO1j8Jd+Z6+AKRhUgaidNDGg0GWIIit5s7o17SsHVlDsV + qRWNYanMa3En1ALugyelInfcAx8tLNFNwwlqUz9ZCh6D2uuGRuoBR5fLH + VziKp+AH5f2oXxMZP43VD+u7hWt+ni9sCpFuAa1/qPyus5y+HPClviJWH + w==; +X-CSE-ConnectionGUID: oDqO/ga6T/+BT+4b/VYbEw== +X-CSE-MsgGUID: BsO2ZD3WSAih53lppi2XZQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136597" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136597" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:45 -0800 +X-CSE-ConnectionGUID: vKt+yECETT+2Z5MJs0mW1A== +X-CSE-MsgGUID: XpexGbaTSRGCCth9FMIgbg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763921" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:45 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 17/23] sched/cache: Record the number of active threads per process for cache-aware scheduling +Date: Wed, 3 Dec 2025 15:07:36 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +A performance regression was observed by Prateek when running hackbench +with many threads per process (high fd count). To avoid this, processes +with a large number of active threads are excluded from cache-aware +scheduling. 
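The accounting this change adds (described in the next paragraph and implemented in the fair.c hunk further down) boils down to sampling how many CPUs currently run a thread of the process and smoothing that sample. A minimal stand-alone sketch, with an assumed CPU-to-process table and a simplified floating-point average rather than the kernel's integer helper:

#include <stdio.h>

#define NR_CPUS 8

/* Count how many CPUs are currently running a thread of process 'pid'. */
static int count_running_threads(const int cpu_owner[NR_CPUS], int pid)
{
    int n = 0;

    for (int i = 0; i < NR_CPUS; i++)
        if (cpu_owner[i] == pid)    /* same process running on that CPU */
            n++;
    return n;
}

int main(void)
{
    /* assumed snapshot: which process each CPU is running (0 = idle) */
    int cpu_owner[NR_CPUS] = { 42, 42, 42, 7, 0, 42, 7, 0 };
    double nr_running_avg = 0.0;
    int sample = count_running_threads(cpu_owner, 42);

    /* smooth the sample, moving 1/8 of the way toward it each epoch */
    nr_running_avg += (sample - nr_running_avg) / 8.0;
    printf("sample=%d avg=%.2f\n", sample, nr_running_avg);
    return 0;
}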
+ +With sched_cache enabled, record the number of active threads in each +process during the periodic task_cache_work(). While iterating over +CPUs, if the currently running task belongs to the same process as the +task that launched task_cache_work(), increment the active thread count. + +This number will be used by subsequent patch to inhibit cache aware +load balance. + +Suggested-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: No change. + + include/linux/mm_types.h | 1 + + kernel/sched/fair.c | 11 +++++++++-- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 1ea16ef90566..04743983de4d 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1043,6 +1043,7 @@ struct mm_struct { + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; ++ u64 nr_running_avg ____cacheline_aligned_in_smp; + #endif + + #ifdef CONFIG_MMU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 580a967efdac..2f38ad82688f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1421,11 +1421,11 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + + static void __no_profile task_cache_work(struct callback_head *work) + { +- struct task_struct *p = current; ++ struct task_struct *p = current, *cur; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; +- int cpu, m_a_cpu = -1; ++ int cpu, m_a_cpu = -1, nr_running = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1458,6 +1458,12 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_occ = occ; + m_cpu = i; + } ++ rcu_read_lock(); ++ cur = rcu_dereference(cpu_rq(i)->curr); ++ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && ++ cur->mm == mm) ++ nr_running++; ++ rcu_read_unlock(); + } + + /* +@@ -1501,6 +1507,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + mm->mm_sched_cpu = m_a_cpu; + } + ++ update_avg(&mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-18-23-sched-cache-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-18-23-sched-cache-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip new file mode 100644 index 0000000..d348566 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-18-23-sched-cache-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip @@ -0,0 +1,175 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3BC5B30C376 + for ; Wed, 3 Dec 2025 23:01:48 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802910; cv=none; b=onpA+M+D8g+bB7DNpp5zLpepvUh9w8T9C2/oqeKTUWUlV9lpl/W31aZarTCR7uvwI9r/kkm/FD7MwcDDnX7hNWvSaLIvFHtht8DxsLrUWb3j5NtWoxy2IAV7VHzxT0RxTQbEVmk6ub/tCK+n4V2wt8/jU8sGCZYABu8xUNFmQzE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802910; c=relaxed/simple; + bh=BCBRwLmdA+4IVzADPAWhC/3F5wk90mYr0XsPdVDldug=; + 
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=P22IAZf3pO0DwcaeGaXfPF45reu5KwrXd9udmOhkXnd4XQpVPzlUupze8eBT005FfxLXJRNYY4JgHS7VRdg5qBGX8VhBoX9G0rOKgnTr7U9RHG4jdp1TU4xtGdenBrAxzksuJ/5c09oa/Ni6O8HCwsplWWOi+6exHbX7OKSFqwo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Ty7FUw1A; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Ty7FUw1A" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802908; x=1796338908; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=BCBRwLmdA+4IVzADPAWhC/3F5wk90mYr0XsPdVDldug=; + b=Ty7FUw1AorJFrTn1pShKiLwJJ/bjWAtb7y1krTlw9/SRwaxzgqmmczqo + u3N/1SifTNffuhxC1c0FAisXDHgXvvqPgSL0eykN2kILgw5XGJw02WLu5 + DTsTU9YL6pY9pb/nL5ZARaF9QKCpSpfipEIM2etVGVvo5Q7kFSTOXs+H8 + iIxOD/4oSuYwezAxsdbkRhhzIdd7YfjUSvB9o0XWfU4YnsJl/heMOcJ7B + H3ZduMD5RF+5BphEK1nTa5CXhVJ0S2nzOaIUo5QipmWAbfGExiFD7Dfvc + B8hxG4haeF2aHk7F8TdO+F6bVlL/xt/ae41Mu5pc0GlLavso3K0AzD+Xh + Q==; +X-CSE-ConnectionGUID: 93dV145yReO721FOecTa9w== +X-CSE-MsgGUID: cCQ1dcHHRZCkHxfc9efEaQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136621" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136621" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:47 -0800 +X-CSE-ConnectionGUID: gYbWyA1jQSuPW79ZakwQKg== +X-CSE-MsgGUID: TU95ucBJS6iZ5dz55kzZsQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763946" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:47 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 18/23] sched/cache: Disable cache aware scheduling for processes with high thread counts +Date: Wed, 3 Dec 2025 15:07:37 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +If the number of active threads within the process exceeds the number +of Cores(divided by SMTs number) in the LLC, do not enable cache-aware +scheduling. This is because there is a risk of cache contention within +the preferred LLC when too many threads are present. + +Suggested-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: No change. 
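A minimal stand-alone sketch of the threshold described above: a process is treated as too wide for one LLC when its average running-thread count, scaled by the SMT width, exceeds the number of CPUs in that LLC. The topology numbers are assumptions for the example, not taken from the patch:

#include <stdbool.h>
#include <stdio.h>

/* True when avg_running threads would oversubscribe the cores of one LLC. */
static bool exceeds_llc_cores(unsigned long avg_running,
                              unsigned int smt_per_core,
                              unsigned int llc_cpus)
{
    /* scale by SMT width so the comparison is against physical cores */
    return avg_running * smt_per_core > llc_cpus;
}

int main(void)
{
    unsigned int llc_cpus = 16, smt = 2;    /* assumed: 8 cores x 2 SMT */

    printf("6 threads  -> skip? %d\n", exceeds_llc_cores(6, smt, llc_cpus));
    printf("10 threads -> skip? %d\n", exceeds_llc_cores(10, smt, llc_cpus));
    return 0;
}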
+ + kernel/sched/fair.c | 29 +++++++++++++++++++++++++++-- + 1 file changed, 27 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2f38ad82688f..6afa3f9a4e9b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1223,6 +1223,18 @@ static int llc_id(int cpu) + return llc; + } + ++static bool exceed_llc_nr(struct mm_struct *mm, int cpu) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = cpumask_weight(cpu_smt_mask(cpu)); ++#endif ++ ++ return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { + int pref_llc; +@@ -1365,10 +1377,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + + /* + * If this task hasn't hit task_cache_work() for a while, or it +- * has only 1 thread, invalidate its preferred state. ++ * has only 1 thread, or has too many active threads, invalidate ++ * its preferred state. + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || +- get_nr_threads(p) <= 1) { ++ get_nr_threads(p) <= 1 || ++ exceed_llc_nr(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1435,6 +1449,13 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + ++ if (get_nr_threads(p) <= 1) { ++ if (mm->mm_sched_cpu != -1) ++ mm->mm_sched_cpu = -1; ++ ++ return; ++ } ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9874,6 +9895,10 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + ++ /* skip cache aware load balance for single/too many threads */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ return mig_unrestricted; ++ + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-19-23-sched-cache-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-19-23-sched-cache-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip new file mode 100644 index 0000000..f83468e --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-19-23-sched-cache-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip @@ -0,0 +1,258 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9101E2EC54D + for ; Wed, 3 Dec 2025 23:01:50 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802913; cv=none; b=mZ6zgozB73YTe2Q60NzNJeXcrA6dwd6hmTIv0PKyoFj0ekz5KBJkRG1qM2/BURh0aF7CFHE0sYQDT25Sh/ho6UmSGiIRzP3Vlf26ErGeRZYynNy7Hu4jA7k4JybnWrC09LDy8qEGxsIyAxdcr/3QTceL1Zxm0kxxCEBV46nlDEI= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802913; c=relaxed/simple; + bh=ty+thnKFxG9+3T4ifTVEX04pmBe/l14iXANMioAm72I=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=bowCsa1//bbyzKU9WSJiWQsUHsXrqBQlvs/cAKgMyk/m4Bld010TDYg5UwVzdHKRvlpaid+xFoVz12quGwWlGa5F6HadDbBqKTBPP6/p1CNg91urhPN3p32qxubeGCoBIbuMM7MCO6I/YdFGB6u4/f5TpvPg3YmLnLcjC8/C7Xc= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=bxWe6OeK; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="bxWe6OeK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802910; x=1796338910; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ty+thnKFxG9+3T4ifTVEX04pmBe/l14iXANMioAm72I=; + b=bxWe6OeKUH0dPxqgW1jI5HE2e1z6OmOiyR4hMvqwqKai+AqvYcbOCYwu + JOlPn9ZWYosHECHx5UGnkdTGEzkOmDWCRC2K3ypKwePUhIyD1337RCjJ3 + uixa8Z2lYSQS2J5GJVC48B2f/yhUzBFPqFV4CEHvCoMLsK1cOf7W1aP4l + eQBVHvIxVJB4mpBt3ae1f/13ipHHAFwfwmFLo4k5SToBHKxSAT6nyvK8a + Vm37u8PzhAmKBcxxBJlGGGzpwc2T4MC/PWSin17i5/r/Xk+DaSUzLnxaF + ZlP2B1+lT/NuonQU/h16sWvSe3/WRw4AeV5gKIbsttEfaewPOisfGEd7j + g==; +X-CSE-ConnectionGUID: Jgmht7L1SaW2ul5kAUA6dw== +X-CSE-MsgGUID: 8CE3l3r/SEaFaHk/6vdVRg== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136653" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136653" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:49 -0800 +X-CSE-ConnectionGUID: 88MGzjBCTmOWjRxLdU7vUw== +X-CSE-MsgGUID: Bi68ivGaS76IdMdbGxb19w== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763965" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:49 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 19/23] sched/cache: Avoid cache-aware scheduling for memory-heavy processes +Date: Wed, 3 Dec 2025 15:07:38 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Prateek and Tingyin reported that memory-intensive workloads (such as +stream) can saturate memory bandwidth and caches on the preferred LLC +when sched_cache aggregates too many threads. + +To mitigate this, estimate a process's memory footprint by comparing +its RSS (anonymous and shared pages) to the size of the LLC. If RSS +exceeds the LLC size, skip cache-aware scheduling. + +Note that RSS is only an approximation of the memory footprint. 
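For illustration, a stand-alone sketch of that comparison, with assumed page and cache sizes in place of the kernel's cacheinfo and mm counters:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* True when the resident anon+shmem footprint is at least as large as the LLC. */
static bool exceeds_llc_capacity(unsigned long rss_pages, unsigned long llc_bytes)
{
    return llc_bytes <= rss_pages * PAGE_SIZE;
}

int main(void)
{
    unsigned long llc_bytes = 32UL << 20;   /* assume a 32 MiB LLC */

    printf("4 MiB RSS  -> skip? %d\n", exceeds_llc_capacity(1024, llc_bytes));
    printf("40 MiB RSS -> skip? %d\n", exceeds_llc_capacity(10240, llc_bytes));
    return 0;
}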
+By default, the comparison is strict, but a later patch will allow +users to provide a hint to adjust this threshold. + +According to the test from Adam, some systems do not have shared L3 +but with shared L2 as clusters. In this case, the L2 becomes the LLC[1]. + +Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@os.amperecomputing.com/ + +Co-developed-by: Tim Chen +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Assigned curr_cpu in task_cache_work() before checking + exceed_llc_capacity(mm, curr_cpu) to avoid out-of-bound + access.(lkp/0day) + + include/linux/cacheinfo.h | 21 ++++++++++------- + kernel/sched/fair.c | 49 +++++++++++++++++++++++++++++++++++---- + 2 files changed, 57 insertions(+), 13 deletions(-) + +diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h +index c8f4f0a0b874..82d0d59ca0e1 100644 +--- a/include/linux/cacheinfo.h ++++ b/include/linux/cacheinfo.h +@@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu, + + const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); + +-/* +- * Get the cacheinfo structure for the cache associated with @cpu at +- * level @level. +- * cpuhp lock must be held. +- */ +-static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int level) + { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + +- lockdep_assert_cpus_held(); +- + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) { + if (ci->info_list[i].attributes & CACHE_ID) +@@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) + return NULL; + } + ++/* ++ * Get the cacheinfo structure for the cache associated with @cpu at ++ * level @level. ++ * cpuhp lock must be held. ++ */ ++static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++{ ++ lockdep_assert_cpus_held(); ++ ++ return _get_cpu_cacheinfo_level(cpu, level); ++} ++ + /* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6afa3f9a4e9b..424ec601cfdf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1223,6 +1223,38 @@ static int llc_id(int cpu) + return llc; + } + ++static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) ++{ ++ struct cacheinfo *ci; ++ unsigned long rss; ++ unsigned int llc; ++ ++ /* ++ * get_cpu_cacheinfo_level() can not be used ++ * because it requires the cpu_hotplug_lock ++ * to be held. Use _get_cpu_cacheinfo_level() ++ * directly because the 'cpu' can not be ++ * offlined at the moment. ++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 3); ++ if (!ci) { ++ /* ++ * On system without L3 but with shared L2, ++ * L2 becomes the LLC. 
++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 2); ++ if (!ci) ++ return true; ++ } ++ ++ llc = ci->size; ++ ++ rss = get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ ++ return (llc <= (rss * PAGE_SIZE)); ++} ++ + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { + int smt_nr = 1; +@@ -1382,7 +1414,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + get_nr_threads(p) <= 1 || +- exceed_llc_nr(mm, cpu_of(rq))) { ++ exceed_llc_nr(mm, cpu_of(rq)) || ++ exceed_llc_capacity(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1439,7 +1472,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; +- int cpu, m_a_cpu = -1, nr_running = 0; ++ int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1449,7 +1482,9 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + +- if (get_nr_threads(p) <= 1) { ++ curr_cpu = task_cpu(p); ++ if (get_nr_threads(p) <= 1 || ++ exceed_llc_capacity(mm, curr_cpu)) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + +@@ -9895,8 +9930,12 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + +- /* skip cache aware load balance for single/too many threads */ +- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ /* ++ * Skip cache aware load balance for single/too many threads ++ * or large footprint. ++ */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) || ++ exceed_llc_capacity(mm, dst_cpu)) + return mig_unrestricted; + + if (cpus_share_cache(dst_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-20-23-sched-cache-Add-user-control-to-adjust-the-parameters-of-cache-aware-scheduling.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-20-23-sched-cache-Add-user-control-to-adjust-the-parameters-of-cache-aware-scheduling.patch.skip new file mode 100644 index 0000000..8a42b76 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-20-23-sched-cache-Add-user-control-to-adjust-the-parameters-of-cache-aware-scheduling.patch.skip @@ -0,0 +1,478 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id E85E42F0C6F + for ; Wed, 3 Dec 2025 23:01:52 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802915; cv=none; b=NKB81c5nkJMF1m/c1AQra8pCalQ/VATWqz8ZHIWg0eoz6hnNECnbqY6IjBOdnDBFvVl/b9HVmkECeNM1mHW2uEI8K209dQ6+mwy42BNPEeHaX20qEOS7RazcHKvkjiS5SxHlmYAv1Sx5K4HGlnkZ+3m/wG0/DRyA26pbDpUaoF0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802915; c=relaxed/simple; + bh=j5hfiRZ2EYaCTsQGDmAvNRTgCCnUI1j/ItMFRbl9uzY=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=V2hqbFyqQGneKfxIcpO2Kc5dagTB+TDzJUq23BN2DeHLv/PgsNga9e2rv+hmluwZMbEcHv9RyyZKJ8F8TwCiuK0Z3yMm4l1RIXSG3p6TYCnyj/3zsuh7jcDOrc/cJgzZvLgpTBDOt79ulEa8r4q4GzHG4PsV4tL2S7Y8MOiS1eo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=aHHISq0g; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="aHHISq0g" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802913; x=1796338913; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=j5hfiRZ2EYaCTsQGDmAvNRTgCCnUI1j/ItMFRbl9uzY=; + b=aHHISq0gwB38J2pv7w+1lfXdj3ALD4Re5eBGYwuwYbgrSTS87mzWr9d9 + 6z8UE8JAD8ovVTi9HPH2Dj4nm47BQyJFWTB7aSIByFBZvHQDMif8JcxQo + YN44mNhAEn4CrrZXow3MjME9dhVbGveKvuIPn5IfCupOo2V/UomJWHR8v + dtkYFqLnVw3S3bkna5BsUdpRh9ZBimaMuGq/+WwGF2nx4rrzpNdxn0j5U + 3rhoVYZ01bV7elVPmaWw/ckqsd0iILZe0x+W0mSMx9qrnSVEtbw4rvo6z + M5hLadE9a+KUPXiCE/w4A03eCnExBDNTMSqLbTk/r37NYHjbU70zyE3SM + g==; +X-CSE-ConnectionGUID: EZWPyiB6S9KiFT6DKfxjxA== +X-CSE-MsgGUID: 07XoWa+5TBOCIV3mZenWgw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136682" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136682" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:51 -0800 +X-CSE-ConnectionGUID: MiEptcrPQgi3rw/P5nNNDA== +X-CSE-MsgGUID: DrdTMc52RGuwpeHC+Js9gg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763975" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:51 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 20/23] sched/cache: Add user control to adjust the parameters of cache-aware scheduling +Date: Wed, 3 Dec 2025 15:07:39 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Introduce a set of debugfs knobs to control the enabling of +and parameters for cache-aware load balancing. + +(1) llc_enabled +llc_enabled acts as the primary switch - users can toggle it to +enable or disable cache aware load balancing. + +(2) llc_aggr_tolerance +With sched_cache enabled, the scheduler uses a process's RSS as a +proxy for its LLC footprint to determine if aggregating tasks on the +preferred LLC could cause cache contention. If RSS exceeds the LLC +size, aggregation is skipped. 
Some workloads with large RSS but small +actual memory footprints may still benefit from aggregation. Since +the kernel cannot efficiently track per-task cache usage (resctrl is +user-space only), userspace can provide a more accurate hint. + +Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let +users control how strictly RSS limits aggregation. Values range from +0 to 100: + + - 0: Cache-aware scheduling is disabled. + - 1: Strict; tasks with RSS larger than LLC size are skipped. + - 100: Aggressive; tasks are aggregated regardless of RSS. + +For example, with a 32MB L3 cache: + + - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped. + - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped + (784GB = (1 + (99 - 1) * 256) * 32MB). + +Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls +how strictly the number of active threads is considered when doing +cache aware load balance. The number of SMTs is also considered. +High SMT counts reduce the aggregation capacity, preventing excessive +task aggregation on SMT-heavy systems like Power10/Power11. + +For example, with 8 Cores/16 CPUs in a L3: + + - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped. + - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped + 785 = (1 + (99 - 1) * 8). + +(3) llc_epoch_period/llc_epoch_affinity_timeout +Besides, llc_epoch_period and llc_epoch_affinity_timeout are also turned +into tunable. + +Suggested-by: K Prateek Nayak +Suggested-by: Madadi Vineeth Reddy +Suggested-by: Shrikanth Hegde +Suggested-by: Tingyin Duan +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + +Notes: + v1->v2: Remove the smt_nr check in fits_llc_capacity(). + (Aaron Lu) + + include/linux/sched.h | 4 ++- + kernel/sched/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++ + kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++----- + kernel/sched/sched.h | 5 ++++ + kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++-- + 5 files changed, 178 insertions(+), 10 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 466ba8b7398c..95bf080bbbf0 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2436,9 +2436,11 @@ extern void migrate_enable(void); + DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + + #ifdef CONFIG_SCHED_CACHE ++DECLARE_STATIC_KEY_FALSE(sched_cache_on); ++ + static inline bool sched_cache_enabled(void) + { +- return false; ++ return static_branch_unlikely(&sched_cache_on); + } + #endif + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 02e16b70a790..cde324672103 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_CACHE ++#define SCHED_CACHE_CREATE_CONTROL(name, max) \ ++static ssize_t sched_cache_write_##name(struct file *filp, \ ++ const char __user *ubuf, \ ++ size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int val; \ ++ if (cnt > 15) \ ++ cnt = 15; \ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++ if (kstrtouint(buf, 10, &val)) \ ++ return -EINVAL; \ ++ if (val > (max)) \ ++ return -EINVAL; \ ++ llc_##name = val; \ ++ if (!strcmp(#name, "enabled")) \ ++ sched_cache_set(false); \ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++static int sched_cache_show_##name(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", llc_##name); \ ++ 
return 0; \ ++} \ ++static int sched_cache_open_##name(struct inode *inode, \ ++ struct file *filp) \ ++{ \ ++ return single_open(filp, sched_cache_show_##name, NULL); \ ++} \ ++static const struct file_operations sched_cache_fops_##name = { \ ++ .open = sched_cache_open_##name, \ ++ .write = sched_cache_write_##name, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++ ++SCHED_CACHE_CREATE_CONTROL(overload_pct, 100); ++SCHED_CACHE_CREATE_CONTROL(imb_pct, 100); ++SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100); ++SCHED_CACHE_CREATE_CONTROL(enabled, 1); ++#endif /* SCHED_CACHE */ ++ + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -523,6 +570,21 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_overload_pct); ++ debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_imb_pct); ++ debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_aggr_tolerance); ++ debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_enabled); ++ debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched, ++ &llc_epoch_period); ++ debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched, ++ &llc_epoch_affinity_timeout); ++#endif ++ + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 424ec601cfdf..a2e2d6742481 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; ++__read_mostly unsigned int llc_aggr_tolerance = 1; ++__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; ++__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; + + static int llc_id(int cpu) + { +@@ -1223,11 +1226,22 @@ static int llc_id(int cpu) + return llc; + } + ++static inline int get_sched_cache_scale(int mul) ++{ ++ if (!llc_aggr_tolerance) ++ return 0; ++ ++ if (llc_aggr_tolerance == 100) ++ return INT_MAX; ++ ++ return (1 + (llc_aggr_tolerance - 1) * mul); ++} ++ + static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + { ++ unsigned int llc, scale; + struct cacheinfo *ci; + unsigned long rss; +- unsigned int llc; + + /* + * get_cpu_cacheinfo_level() can not be used +@@ -1252,19 +1266,54 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + rss = get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + +- return (llc <= (rss * PAGE_SIZE)); ++ /* ++ * Scale the LLC size by 256*llc_aggr_tolerance ++ * and compare it to the task's RSS size. ++ * ++ * Suppose the L3 size is 32MB. If the ++ * llc_aggr_tolerance is 1: ++ * When the RSS is larger than 32MB, the process ++ * is regarded as exceeding the LLC capacity. 
If ++ * the llc_aggr_tolerance is 99: ++ * When the RSS is larger than 784GB, the process ++ * is regarded as exceeding the LLC capacity because: ++ * 784GB = (1 + (99 - 1) * 256) * 32MB ++ */ ++ scale = get_sched_cache_scale(256); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((llc * scale) <= (rss * PAGE_SIZE)); + } + + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { +- int smt_nr = 1; ++ int smt_nr = 1, scale; + + #ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) + smt_nr = cpumask_weight(cpu_smt_mask(cpu)); + #endif ++ /* ++ * Scale the Core number in a LLC by llc_aggr_tolerance ++ * and compare it to the task's active threads. ++ * ++ * Suppose the number of Cores in LLC is 8. ++ * Every core has 2 SMTs. ++ * If the llc_aggr_tolerance is 1: When the ++ * nr_running is larger than 8, the process ++ * is regarded as exceeding the LLC capacity. ++ * If the llc_aggr_tolerance is 99: ++ * When the nr_running is larger than 785, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 785 = 1 + (99 - 1) * 8 ++ */ ++ scale = get_sched_cache_scale(1); ++ if (scale == INT_MAX) ++ return false; + +- return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++ return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu))); + } + + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +@@ -1350,9 +1399,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { +- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ n = (delta + llc_epoch_period - 1) / llc_epoch_period; + rq->cpu_epoch += n; +- rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ rq->cpu_epoch_next += n * llc_epoch_period; + __shr_u64(&rq->cpu_runtime, n); + } + +@@ -1412,7 +1461,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * has only 1 thread, or has too many active threads, invalidate + * its preferred state. + */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout || + get_nr_threads(p) <= 1 || + exceed_llc_nr(mm, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 40798a06e058..15d126bd3728 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2852,6 +2852,11 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern unsigned int llc_aggr_tolerance; ++extern unsigned int llc_epoch_period; ++extern unsigned int llc_epoch_affinity_timeout; ++extern unsigned int llc_enabled; ++void sched_cache_set(bool locked); + #endif + + #ifdef CONFIG_SCHED_HRTICK +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 9799e3a9a609..818599ddaaef 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -26,6 +26,49 @@ int max_llcs; + + static bool sched_cache_present; + ++unsigned int llc_enabled = 1; ++DEFINE_STATIC_KEY_FALSE(sched_cache_on); ++ ++/* ++ * Enable/disable cache aware scheduling according to ++ * user input and the presence of hardware support. 
++ */ ++static void _sched_cache_set(bool enable, bool locked) ++{ ++ if (enable) { ++ if (locked) ++ static_branch_enable_cpuslocked(&sched_cache_on); ++ else ++ static_branch_enable(&sched_cache_on); ++ } else { ++ if (locked) ++ static_branch_disable_cpuslocked(&sched_cache_on); ++ else ++ static_branch_disable(&sched_cache_on); ++ } ++} ++ ++void sched_cache_set(bool locked) ++{ ++ /* hardware does not support */ ++ if (!sched_cache_present) { ++ if (static_branch_likely(&sched_cache_on)) ++ _sched_cache_set(false, locked); ++ ++ return; ++ } ++ ++ /* user wants it or not ?*/ ++ if (llc_enabled) { ++ if (!static_branch_likely(&sched_cache_on)) ++ _sched_cache_set(true, locked); ++ ++ } else { ++ if (static_branch_likely(&sched_cache_on)) ++ _sched_cache_set(false, locked); ++ } ++} ++ + static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc) + { + unsigned int *new = NULL; +@@ -70,8 +113,12 @@ static int resize_llc_pref(bool has_multi_llcs) + * new buffer. + */ + tmp_llc_pref = alloc_percpu_noprof(unsigned int *); +- if (!tmp_llc_pref) +- return -ENOMEM; ++ if (!tmp_llc_pref) { ++ sched_cache_present = false; ++ ret = -ENOMEM; ++ ++ goto out; ++ } + + for_each_present_cpu(i) + *per_cpu_ptr(tmp_llc_pref, i) = NULL; +@@ -89,6 +136,7 @@ static int resize_llc_pref(bool has_multi_llcs) + new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i)); + if (!new) { + ret = -ENOMEM; ++ sched_cache_present = false; + + goto release_old; + } +@@ -126,6 +174,8 @@ static int resize_llc_pref(bool has_multi_llcs) + if (!ret) + max_llcs = new_max_llcs; + ++out: ++ sched_cache_set(true); + return ret; + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-21-23-DO-NOT-APPLY-sched-cache-stats-Add-schedstat-for-cache-aware-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-21-23-DO-NOT-APPLY-sched-cache-stats-Add-schedstat-for-cache-aware-load-balancing.patch.skip new file mode 100644 index 0000000..91cb19a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-21-23-DO-NOT-APPLY-sched-cache-stats-Add-schedstat-for-cache-aware-load-balancing.patch.skip @@ -0,0 +1,174 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id D10542F12DD + for ; Wed, 3 Dec 2025 23:01:53 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802915; cv=none; b=AmWzQXbFY2sN5heLcp4s9rWoLO7pjURsg464nsA8jjoqA5nJagwpJv9G+UJULof1tTaFgz2GmAr0hHkABofj6ydnfXE2fd4hRRYb7GE+M+4gERnZr5wAJOQw/zTEmxBeWSSE5iNgbAWmM054GBUn6MCdpITYzuKbb1BP7b3L3sk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802915; c=relaxed/simple; + bh=heewblv8+VUSifHzkX3W2P+i26TuBbpse5E1oodIHu8=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Xpw0JLbgiphYf3Sab645eHcm9Luo+Mx2FuFXrjcPsXJxnYfglU5zHbY1C3nGcYUTlQht3caQEhhC7tRceDrXIkNZHUg5zn5pvhgic99RbM9RtmxCAUWRJKHEvQHILxmwPExiCxVB0m/pqwl8+stVV67Gqhqd6Lhw1hT41ldDB9s= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Cxg4oTl4; arc=none smtp.client-ip=198.175.65.11 
+Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Cxg4oTl4" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802913; x=1796338913; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=heewblv8+VUSifHzkX3W2P+i26TuBbpse5E1oodIHu8=; + b=Cxg4oTl4FXmQXHOKywDD1PXh0TwFbaiKduxzegiGnyiEGbaHQGeStB45 + heDXhCr5sdgqIhbxUFp1vM0glTwn0l4/6ZiEL/dgHN9LNlGjaYsII9jc1 + 2qGZ9JRhqrUWqdc8Jm6fWF0Wuz16A6ncwR05z1/osHOGjbKNCnVNF9Y0l + 4FSdn5Pg7wz/0mo5Tfd9kz21TLqYSS8tlCVsn5MnhfbvMVKYOtZOb0WKR + 3KiZKcH2I7DsvpgO/euP9zAwOTpRdP8eIGES5K1LCg7I6oiUiavAKbHWR + nP3xATAIJENhZb+rdETusA0Fs1MIUcnKK88Vr8NJIw3yCIQUWh4CdT9qz + A==; +X-CSE-ConnectionGUID: 32GrqILbQRmayJvwXZJ8Bg== +X-CSE-MsgGUID: F77VqvbYTl+X7/J43cXgzw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136713" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136713" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:53 -0800 +X-CSE-ConnectionGUID: U2qPsSdSRnej2tO9wNUi2g== +X-CSE-MsgGUID: YiZKZfnpSMaNZy2O6pI9Xg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763990" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:52 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 21/23] -- DO NOT APPLY!!! -- sched/cache/stats: Add schedstat for cache aware load balancing +Date: Wed, 3 Dec 2025 15:07:40 -0800 +Message-Id: <71b94a7547f7843230270e20b84ecb0a540ab604.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Debug patch only. + +With cache-aware load balancing enabled, statistics related to its activity +are exposed via /proc/schedstat and debugfs. For instance, if users want to +verify metrics like the number of exceeding RSS and nr_running limits, they +can filter the output of /sys/kernel/debug/sched/debug and compute the required +statistics manually: + +llc_exceed_cap SUM: 6 +llc_exceed_nr SUM: 4531 + +Furthermore, these statistics exposed in /proc/schedstats can be queried manually +or via perf sched stats[1] with minor modifications. 
+ +Link: https://lore.kernel.org/all/20250909114227.58802-1-swapnil.sapkal@amd.com #1 + +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/sched/topology.h | 1 + + kernel/sched/fair.c | 1 + + kernel/sched/stats.c | 5 +++-- + 3 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 0ba4697d74ba..8702c1e731a0 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -108,6 +108,7 @@ struct sched_domain { + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_imbalance_llc[CPU_MAX_IDLE_TYPES]; + unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a2e2d6742481..742e455b093e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -12684,6 +12684,7 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + case migrate_llc_task: ++ __schedstat_add(sd->lb_imbalance_llc[idle], env->imbalance); + break; + } + } +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index d1c9429a4ac5..3736f6102261 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -104,7 +104,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + * Bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +-#define SCHEDSTAT_VERSION 17 ++#define SCHEDSTAT_VERSION 18 + + static int show_schedstat(struct seq_file *seq, void *v) + { +@@ -139,7 +139,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name, + cpumask_pr_args(sched_domain_span(sd))); + for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { +- seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", ++ seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u %u", + sd->lb_count[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], +@@ -147,6 +147,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->lb_imbalance_util[itype], + sd->lb_imbalance_task[itype], + sd->lb_imbalance_misfit[itype], ++ sd->lb_imbalance_llc[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-22-23-DO-NOT-APPLY-sched-cache-debug-Add-ftrace-to-track-the-load-balance-statistics.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-22-23-DO-NOT-APPLY-sched-cache-debug-Add-ftrace-to-track-the-load-balance-statistics.patch.skip new file mode 100644 index 0000000..434c1b6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-22-23-DO-NOT-APPLY-sched-cache-debug-Add-ftrace-to-track-the-load-balance-statistics.patch.skip @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B53DB2EDD63 + for ; Wed, 3 Dec 2025 23:01:55 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; 
s=arc-20240116; + t=1764802917; cv=none; b=Sv2g8yh/ssOUkCxGvmjgju6aonEWXYABCuXTb+U7pmXY6LV36x4JKu1MuMeuYO1vCluXZy/7Ay7i1yE6FtkBqXrqbYaDn/USnq7xKePL08B+Z5erY6PuyaIsHhWqUANVdUR5D6Behj/PK8qsySaRT1rgt6AitMIk8lP+NbOAHCE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802917; c=relaxed/simple; + bh=Z1RTLO9XI8wzi8HuLfkDHGWZXGFVHLiwaXN3uD70l1Y=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=CATqSWfOo+6YE9nXLVWZJO6JnMOLrl52x8cVMx1zwPuSpCTUr3IN5JnkiXN2GyKQ26mCPXBWcWxBHdzMY7E9cxtAmJLxGzbXdU2Fg+4DSuAYi1K0o6tozFHYiuKS+6QKbzMtYuK8+ri9bLYJjOu4P79WeHsP8FgYaIsrRFSRu2w= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=mJ2c+rcP; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="mJ2c+rcP" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802915; x=1796338915; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Z1RTLO9XI8wzi8HuLfkDHGWZXGFVHLiwaXN3uD70l1Y=; + b=mJ2c+rcP1UOBgGP4yRYC4G9oY4qxvoF1rz/E8g2VluXVhdaKym+KKeiM + 98QozNlJsgm6c2psR2Mp1UJhkz/Z+hMiEVNErwajLDcIdLXPKWwrmkhgP + CWKO4YFSmv7sZsGBLUL6MPnqDCpqzgPQvR5FKXPgi7m3I3rXLqAaZgLzM + bfubfkiwaBvcluOfyoYhJ37GeqSNPw53SP+PU0pGAu+cSL5BeyuIN+g+r + dRFzsYKK0wBWGsqYyMy6aje2lH7qKav3U/83YEE1h0WkyFF5hAmr4RJRT + /HIg5gjIb43mMeVrXXMSuFG2ajgVo7HXw1utNSmLiOQiREq43MfL2zw8y + w==; +X-CSE-ConnectionGUID: 9oIPtAybQ2qo8rrXonYwDQ== +X-CSE-MsgGUID: zYiFP9hSSKCIjoTpSYg3HA== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136743" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136743" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:54 -0800 +X-CSE-ConnectionGUID: WESeCoKDRrGN0u/wU3Xx1Q== +X-CSE-MsgGUID: 8VfHl2BFSC60lwjXMiz/LQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199764003" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:54 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 22/23] -- DO NOT APPLY!!! 
-- sched/cache/debug: Add ftrace to track the load balance statistics +Date: Wed, 3 Dec 2025 15:07:41 -0800 +Message-Id: <445303c70d8d464c35c97f33d4be7b752e8db5ae.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Debug patch only. + +The user leverages this trace event (via bpftrace, etc)to monitor the cache +aware load balance activity - whether the tasks are moved to their preferred +LLC, or moved out of their preferred LLC. + +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/trace/events/sched.h | 31 +++++++++++++++++++++++++++++++ + kernel/sched/fair.c | 10 ++++++++++ + 2 files changed, 41 insertions(+) + +diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h +index 7b2645b50e78..bd03f49f7e3c 100644 +--- a/include/trace/events/sched.h ++++ b/include/trace/events/sched.h +@@ -10,6 +10,37 @@ + #include + #include + ++TRACE_EVENT(sched_attach_task, ++ ++ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc, ++ int attach_cpu, int attach_llc), ++ ++ TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, pref_cpu ) ++ __field( int, pref_llc ) ++ __field( int, attach_cpu ) ++ __field( int, attach_llc ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ __entry->pref_cpu = pref_cpu; ++ __entry->pref_llc = pref_llc; ++ __entry->attach_cpu = attach_cpu; ++ __entry->attach_llc = attach_llc; ++ ), ++ ++ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d", ++ __entry->comm, __entry->pid, ++ __entry->pref_cpu, __entry->pref_llc, ++ __entry->attach_cpu, __entry->attach_llc) ++); ++ + /* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 742e455b093e..e47b4096f0a6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10487,6 +10487,16 @@ static void attach_task(struct rq *rq, struct task_struct *p) + { + lockdep_assert_rq_held(rq); + ++#ifdef CONFIG_SCHED_CACHE ++ if (p->mm) { ++ int pref_cpu = p->mm->mm_sched_cpu; ++ ++ trace_sched_attach_task(p, ++ pref_cpu, ++ pref_cpu != -1 ? 
llc_id(pref_cpu) : -1, ++ cpu_of(rq), llc_id(cpu_of(rq))); ++ } ++#endif + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); + wakeup_preempt(rq, p, 0); +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-23-23-DO-NOT-APPLY-sched-cache-debug-Display-the-per-LLC-occupancy-for-each-process-via-proc-fs.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-23-23-DO-NOT-APPLY-sched-cache-debug-Display-the-per-LLC-occupancy-for-each-process-via-proc-fs.patch.skip new file mode 100644 index 0000000..6969f82 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-23-23-DO-NOT-APPLY-sched-cache-debug-Display-the-per-LLC-occupancy-for-each-process-via-proc-fs.patch.skip @@ -0,0 +1,323 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0CB492EBBB7 + for ; Wed, 3 Dec 2025 23:01:56 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802919; cv=none; b=m/i8AM9jez30fmSC1ThjI0YmAYEwTjLN0aX4/W91cI/xJdwDY3yhTCxjuRQMXmg8XAbCVHRL4AColOXBfQy71E1URs7aT+GLFscw7WH4+OFmIN9YsDx0KaMus5WdBjhF8tzszL6TEZ12kmt42mlqOXQoE5Z3dqzJYmLCcriio58= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802919; c=relaxed/simple; + bh=wLrJr/SuamOWjFO9gpHP5B2k8lcK+6x8dlASnUWXGe8=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=eJbxWPHDUsl7XKuqPrYe829WccTGNXVp007ecq2JrHaVKwuvPh4j19TPROJM5V4vppIdkk1U3AT26iFdDx2qrmsewZCkwqlDeBPDJqbvbZbY+3Vimkg2ojZhH8CLl94yalOO4ZSXRjWefBovmf2taUbRtFOEBHGk1S0e1XvQ7G4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=jEU0XIYU; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="jEU0XIYU" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802917; x=1796338917; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=wLrJr/SuamOWjFO9gpHP5B2k8lcK+6x8dlASnUWXGe8=; + b=jEU0XIYUmUP1w1odUpoZztkux4d2T4uFzSQDEoeQkO6AEZ1yfHcuVfq9 + YwImXDzBWY46rQh33rL3qoP+4HJZhnOXgjU9/vwFZtLvGkGs5rHvI8YBx + jDLfActh0h/lcktc8ZNAWUhHLuPaktpxkehHuTNiQ+/PYiyL7+Hj8Xdrd + 41rYFhxJEN7aCEKecsCgMgtV2kyKG5rxF89kVp/FA/73jNvUXDa5pRoN7 + yqtdT/I+zUDFwYL0JDyMdCOZxceWrOHrciU5DroHkoBLTkvVc7oA5oIMh + KkFun1mmeV+tcvGf8EXfa3CUEmb0TvEhrDlTxbkcFqltiq0sEOiCw8NXE + w==; +X-CSE-ConnectionGUID: 7s0dCQLrSayFkNv254nlIw== +X-CSE-MsgGUID: oFf+c8koRFSSrN6sf+Ly3g== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136770" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136770" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:56 -0800 +X-CSE-ConnectionGUID: F/ChnV0DRm2XulsnpHzSUQ== 
+X-CSE-MsgGUID: VHoDeeBRRb2BuWXAH6c04Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199764012" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:55 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 23/23] -- DO NOT APPLY!!! -- sched/cache/debug: Display the per LLC occupancy for each process via proc fs +Date: Wed, 3 Dec 2025 15:07:42 -0800 +Message-Id: <0eaf9b9f89f0d97dbf46b760421f65aee3ffe063.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Debug patch only. + +Show the per-LLC occupancy in /proc/{PID}/schedstat, with each column +corresponding to one LLC. This can be used to verify if the cache-aware +load balancer works as expected by aggregating threads onto dedicated LLCs. + +Suppose there are 2 LLCs and the sampling duration is 10 seconds: + +Enable the cache aware load balance: +0 12281 <--- LLC0 residency delta is 0, LLC1 is 12 seconds +0 18881 +0 16217 + +disable the cache aware load balance: +6497 15802 +9299 5435 +17811 8278 + +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + fs/proc/base.c | 22 ++++++++++++++++++++++ + include/linux/mm_types.h | 19 +++++++++++++++++-- + include/linux/sched.h | 3 +++ + kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++++-- + 4 files changed, 80 insertions(+), 4 deletions(-) + +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 6299878e3d97..f4be96f4bd01 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -518,6 +518,28 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled()) { ++ struct mm_struct *mm = task->mm; ++ u64 *llc_runtime; ++ ++ if (!mm) ++ return 0; ++ ++ llc_runtime = kcalloc(max_llcs, sizeof(u64), GFP_KERNEL); ++ if (!llc_runtime) ++ return 0; ++ ++ if (get_mm_per_llc_runtime(task, llc_runtime)) ++ goto out; ++ ++ for (int i = 0; i < max_llcs; i++) ++ seq_printf(m, "%llu ", llc_runtime[i]); ++ seq_puts(m, "\n"); ++out: ++ kfree(llc_runtime); ++ } ++#endif + + return 0; + } +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 04743983de4d..255c22be7312 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -944,6 +944,10 @@ struct mm_sched { + unsigned long epoch; + }; + ++struct mm_time { ++ u64 runtime_ns; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1040,6 +1044,7 @@ struct mm_struct { + * See account_mm_sched() and ... 
+ */ + struct mm_sched __percpu *pcpu_sched; ++ struct mm_time __percpu *pcpu_time; + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; +@@ -1505,16 +1510,24 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas + #endif /* CONFIG_SCHED_MM_CID */ + + #ifdef CONFIG_SCHED_CACHE +-void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched, ++ struct mm_time __percpu *pcpu_time); + + static inline int mm_alloc_sched_noprof(struct mm_struct *mm) + { + struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ struct mm_time __percpu *pcpu_time; + + if (!pcpu_sched) + return -ENOMEM; + +- mm_init_sched(mm, pcpu_sched); ++ pcpu_time = alloc_percpu_noprof(struct mm_time); ++ if (!pcpu_time) { ++ free_percpu(mm->pcpu_sched); ++ return -ENOMEM; ++ } ++ ++ mm_init_sched(mm, pcpu_sched, pcpu_time); + return 0; + } + +@@ -1523,7 +1536,9 @@ static inline int mm_alloc_sched_noprof(struct mm_struct *mm) + static inline void mm_destroy_sched(struct mm_struct *mm) + { + free_percpu(mm->pcpu_sched); ++ free_percpu(mm->pcpu_time); + mm->pcpu_sched = NULL; ++ mm->pcpu_time = NULL; + } + #else /* !CONFIG_SCHED_CACHE */ + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 95bf080bbbf0..875ac3f4208b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2442,6 +2442,9 @@ static inline bool sched_cache_enabled(void) + { + return static_branch_unlikely(&sched_cache_on); + } ++ ++int get_mm_per_llc_runtime(struct task_struct *p, u64 *buf); ++extern int max_llcs; + #endif + + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e47b4096f0a6..205208f061bb 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1355,16 +1355,19 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + p->sched_llc_active = false; + } + +-void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched, ++ struct mm_time __percpu *_pcpu_time) + { + unsigned long epoch; + int i; + + for_each_possible_cpu(i) { + struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct mm_time *pcpu_time = per_cpu_ptr(_pcpu_time, i); + struct rq *rq = cpu_rq(i); + + pcpu_sched->runtime = 0; ++ pcpu_time->runtime_ns = 0; + pcpu_sched->epoch = rq->cpu_epoch; + epoch = rq->cpu_epoch; + } +@@ -1379,6 +1382,8 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + * the readers may get invalid mm_sched_epoch, etc. 
+ */ + smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++ /* same as above */ ++ smp_store_release(&mm->pcpu_time, _pcpu_time); + } + + /* because why would C be fully specified */ +@@ -1428,11 +1433,39 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + + static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); + ++/* p->pi_lock is hold */ ++int get_mm_per_llc_runtime(struct task_struct *p, u64 *buf) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_time *pcpu_time; ++ int cpu; ++ ++ if (!mm) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ for_each_online_cpu(cpu) { ++ int llc = llc_id(cpu); ++ u64 runtime_ms; ++ ++ if (llc < 0) ++ continue; ++ ++ pcpu_time = per_cpu_ptr(mm->pcpu_time, cpu); ++ runtime_ms = div_u64(pcpu_time->runtime_ns, NSEC_PER_MSEC); ++ buf[llc] += runtime_ms; ++ } ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; ++ struct mm_time *pcpu_time; + unsigned long epoch; + int mm_sched_llc = -1; + +@@ -1444,14 +1477,17 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + /* + * init_task and kthreads don't having mm + */ +- if (!mm || !mm->pcpu_sched) ++ if (!mm || !mm->pcpu_sched || !mm->pcpu_time) + return; + + pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); ++ pcpu_time = per_cpu_ptr(p->mm->pcpu_time, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); + pcpu_sched->runtime += delta_exec; ++ /* pure runtime without decay */ ++ pcpu_time->runtime_ns += delta_exec; + rq->cpu_runtime += delta_exec; + epoch = rq->cpu_epoch; + } +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/mm_slab_introduce_kvfree_rcu_barrier_on_cache.patch.skip b/sys-kernel/gentoo-sources-6.18/mm_slab_introduce_kvfree_rcu_barrier_on_cache.patch.skip new file mode 100644 index 0000000..d5645f0 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/mm_slab_introduce_kvfree_rcu_barrier_on_cache.patch.skip @@ -0,0 +1,259 @@ +From 0f35040de59371ad542b915d7b91176c9910dadc Mon Sep 17 00:00:00 2001 +From: Harry Yoo +Date: Mon, 8 Dec 2025 00:41:47 +0900 +Subject: mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache + destruction + +Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab +caches when a cache is destroyed. This is unnecessary; only the RCU +sheaves belonging to the cache being destroyed need to be flushed. + +As suggested by Vlastimil Babka, introduce a weaker form of +kvfree_rcu_barrier() that operates on a specific slab cache. + +Factor out flush_rcu_sheaves_on_cache() from flush_all_rcu_sheaves() and +call it from flush_all_rcu_sheaves() and kvfree_rcu_barrier_on_cache(). + +Call kvfree_rcu_barrier_on_cache() instead of kvfree_rcu_barrier() on +cache destruction. + +The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen +5900X machine (1 socket), by loading slub_kunit module. + +Before: + Total calls: 19 + Average latency (us): 18127 + Total time (us): 344414 + +After: + Total calls: 19 + Average latency (us): 10066 + Total time (us): 191264 + +Two performance regression have been reported: + - stress module loader test's runtime increases by 50-60% (Daniel) + - internal graphics test's runtime on Tegra234 increases by 35% (Jon) + +They are fixed by this change. 
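+
+In effect (a condensed sketch of the hunks below, not the applied diff
+itself; all names come from this patch), cache destruction now flushes
+only the dying cache's percpu sheaves before running the common barrier:
+
+	void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+	{
+		if (s->cpu_sheaves)
+			flush_rcu_sheaves_on_cache(s);	/* this cache only */
+		__kvfree_rcu_barrier();			/* common batch drain */
+	}
+
+and kmem_cache_destroy(s) calls it in place of the global
+kvfree_rcu_barrier().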
+ +Suggested-by: Vlastimil Babka +Fixes: ec66e0d59952 ("slab: add sheaf support for batching kfree_rcu() operations") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz +Reported-and-tested-by: Daniel Gomez +Closes: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org +Reported-and-tested-by: Jon Hunter +Closes: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com +Signed-off-by: Harry Yoo +Link: https://patch.msgid.link/20251207154148.117723-1-harry.yoo@oracle.com +Signed-off-by: Vlastimil Babka +--- + include/linux/slab.h | 7 +++++++ + mm/slab.h | 1 + + mm/slab_common.c | 52 +++++++++++++++++++++++++++++++++++-------------- + mm/slub.c | 55 ++++++++++++++++++++++++++++------------------------ + 4 files changed, 75 insertions(+), 40 deletions(-) + +diff --git a/include/linux/slab.h b/include/linux/slab.h +index cf443f064a667e..2482992248dc9c 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -1150,10 +1150,17 @@ static inline void kvfree_rcu_barrier(void) + rcu_barrier(); + } + ++static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) ++{ ++ rcu_barrier(); ++} ++ + static inline void kfree_rcu_scheduler_running(void) { } + #else + void kvfree_rcu_barrier(void); + ++void kvfree_rcu_barrier_on_cache(struct kmem_cache *s); ++ + void kfree_rcu_scheduler_running(void); + #endif + +diff --git a/mm/slab.h b/mm/slab.h +index f730e012553ccd..e767aa7e91b098 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) + + bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj); + void flush_all_rcu_sheaves(void); ++void flush_rcu_sheaves_on_cache(struct kmem_cache *s); + + #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ +diff --git a/mm/slab_common.c b/mm/slab_common.c +index 84dfff4f7b1fce..dd8a49d6f9cc3d 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s) + return; + + /* in-flight kfree_rcu()'s may include objects from our cache */ +- kvfree_rcu_barrier(); ++ kvfree_rcu_barrier_on_cache(s); + + if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) && + (s->flags & SLAB_TYPESAFE_BY_RCU)) { +@@ -2038,25 +2038,13 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) + } + EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +-/** +- * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. +- * +- * Note that a single argument of kvfree_rcu() call has a slow path that +- * triggers synchronize_rcu() following by freeing a pointer. It is done +- * before the return from the function. Therefore for any single-argument +- * call that will result in a kfree() to a cache that is to be destroyed +- * during module exit, it is developer's responsibility to ensure that all +- * such calls have returned before the call to kmem_cache_destroy(). +- */ +-void kvfree_rcu_barrier(void) ++static inline void __kvfree_rcu_barrier(void) + { + struct kfree_rcu_cpu_work *krwp; + struct kfree_rcu_cpu *krcp; + bool queued; + int i, cpu; + +- flush_all_rcu_sheaves(); +- + /* + * Firstly we detach objects and queue them over an RCU-batch + * for all CPUs. Finally queued works are flushed for each CPU. +@@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void) + } + } + } ++ ++/** ++ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. 
++ * ++ * Note that a single argument of kvfree_rcu() call has a slow path that ++ * triggers synchronize_rcu() following by freeing a pointer. It is done ++ * before the return from the function. Therefore for any single-argument ++ * call that will result in a kfree() to a cache that is to be destroyed ++ * during module exit, it is developer's responsibility to ensure that all ++ * such calls have returned before the call to kmem_cache_destroy(). ++ */ ++void kvfree_rcu_barrier(void) ++{ ++ flush_all_rcu_sheaves(); ++ __kvfree_rcu_barrier(); ++} + EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); + ++/** ++ * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a ++ * specific slab cache. ++ * @s: slab cache to wait for ++ * ++ * See the description of kvfree_rcu_barrier() for details. ++ */ ++void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) ++{ ++ if (s->cpu_sheaves) ++ flush_rcu_sheaves_on_cache(s); ++ /* ++ * TODO: Introduce a version of __kvfree_rcu_barrier() that works ++ * on a specific slab cache. ++ */ ++ __kvfree_rcu_barrier(); ++} ++EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache); ++ + static unsigned long + kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) + { +@@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void) + } + + #endif /* CONFIG_KVFREE_RCU_BATCHED */ +- +diff --git a/mm/slub.c b/mm/slub.c +index 2acce22590f846..f22ba8be29e060 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -4122,42 +4122,47 @@ static void flush_rcu_sheaf(struct work_struct *w) + + + /* needed for kvfree_rcu_barrier() */ +-void flush_all_rcu_sheaves(void) ++void flush_rcu_sheaves_on_cache(struct kmem_cache *s) + { + struct slub_flush_work *sfw; +- struct kmem_cache *s; + unsigned int cpu; + +- cpus_read_lock(); +- mutex_lock(&slab_mutex); ++ mutex_lock(&flush_lock); + +- list_for_each_entry(s, &slab_caches, list) { +- if (!s->cpu_sheaves) +- continue; ++ for_each_online_cpu(cpu) { ++ sfw = &per_cpu(slub_flush, cpu); + +- mutex_lock(&flush_lock); ++ /* ++ * we don't check if rcu_free sheaf exists - racing ++ * __kfree_rcu_sheaf() might have just removed it. ++ * by executing flush_rcu_sheaf() on the cpu we make ++ * sure the __kfree_rcu_sheaf() finished its call_rcu() ++ */ + +- for_each_online_cpu(cpu) { +- sfw = &per_cpu(slub_flush, cpu); ++ INIT_WORK(&sfw->work, flush_rcu_sheaf); ++ sfw->s = s; ++ queue_work_on(cpu, flushwq, &sfw->work); ++ } + +- /* +- * we don't check if rcu_free sheaf exists - racing +- * __kfree_rcu_sheaf() might have just removed it. 
+- * by executing flush_rcu_sheaf() on the cpu we make +- * sure the __kfree_rcu_sheaf() finished its call_rcu() +- */ ++ for_each_online_cpu(cpu) { ++ sfw = &per_cpu(slub_flush, cpu); ++ flush_work(&sfw->work); ++ } + +- INIT_WORK(&sfw->work, flush_rcu_sheaf); +- sfw->s = s; +- queue_work_on(cpu, flushwq, &sfw->work); +- } ++ mutex_unlock(&flush_lock); ++} + +- for_each_online_cpu(cpu) { +- sfw = &per_cpu(slub_flush, cpu); +- flush_work(&sfw->work); +- } ++void flush_all_rcu_sheaves(void) ++{ ++ struct kmem_cache *s; ++ ++ cpus_read_lock(); ++ mutex_lock(&slab_mutex); + +- mutex_unlock(&flush_lock); ++ list_for_each_entry(s, &slab_caches, list) { ++ if (!s->cpu_sheaves) ++ continue; ++ flush_rcu_sheaves_on_cache(s); + } + + mutex_unlock(&slab_mutex); +-- +cgit 1.2.3-korg + diff --git a/sys-kernel/gentoo-sources-6.19/0002-bbr3.patch b/sys-kernel/gentoo-sources-6.19/0002-bbr3.patch new file mode 100644 index 0000000..a2d49fc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0002-bbr3.patch @@ -0,0 +1,3395 @@ +From 185514200e2848a5af6dc9e6165096ed34ee9d38 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 27 Feb 2026 09:11:53 +0100 +Subject: [PATCH 2/8] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 71 +- + include/net/tcp_ecn.h | 6 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2233 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 42 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 17 files changed, 1938 insertions(+), 554 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 20b8c6e21fef..e334b7a7aac2 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -236,7 +236,8 @@ struct tcp_sock { + tcp_usec_ts : 1, /* TSval values in usec */ + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ +- recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_txrx); + + /* RX read-mostly hotpath cache lines */ +@@ -292,7 +293,8 @@ struct tcp_sock { + * 0x5?10 << 16 + snd_wnd in net byte order + */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? 
*/ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index ecb362025c4e..9de884b7fe01 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index e0a5cf2f7818..6a4a5f38c072 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -406,6 +406,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) +@@ -851,6 +853,15 @@ static inline unsigned long tcp_reqsk_timeout(struct request_sock *req) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -956,6 +967,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1066,9 +1082,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1181,6 +1202,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1209,9 +1231,12 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_ECT_1_NEGOTIATION BIT(3) + /* Cannot fallback to RFC3168 during AccECN negotiation */ + #define TCP_CONG_NO_FALLBACK_RFC3168 BIT(4) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(5) + #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN | \ + TCP_CONG_NEEDS_ACCECN | TCP_CONG_ECT_1_NEGOTIATION | \ +- TCP_CONG_NO_FALLBACK_RFC3168) ++ TCP_CONG_NO_FALLBACK_RFC3168 | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1231,10 +1256,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1245,7 +1273,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1269,8 +1299,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1336,6 +1369,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1376,6 +1417,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1388,6 +1430,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. 
In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2562,7 +2619,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h +index a709fb1756eb..5b2f85419201 100644 +--- a/include/net/tcp_ecn.h ++++ b/include/net/tcp_ecn.h +@@ -613,10 +613,9 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -634,6 +633,9 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + } +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dce3113787a7..6efba4f74f6f 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -185,6 +185,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ + #define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ ++#define TCPI_OPT_ECN_LOW 256 /* Low-latency ECN enabled at conn init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index b71c22475c51..85d95a59708e 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 81666571ecfb..86d1a689b41a 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3471,6 +3471,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4251,6 +4252,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..9279be755c16 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,123 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return tcp_ecn_mode_any(tp) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +384,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +411,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +435,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +458,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +475,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +536,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +549,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +581,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +601,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +672,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +683,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +712,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +741,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +797,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +805,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +851,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +860,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +888,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +925,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +948,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +973,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2362,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2399,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index e9f6c77e0631..8e5e77a77e91 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -238,6 +238,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 0d080a3e27d6..bdc0cdda875d 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -358,7 +358,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && +@@ -376,7 +376,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; +@@ -1305,7 +1305,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1670,6 +1675,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3905,7 +3921,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985 + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3922,6 +3939,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3932,6 +3950,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -4058,6 +4081,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4130,7 +4154,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4155,6 +4179,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4180,7 +4205,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5909,13 +5934,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 9776c921d1bb..990df5f9e6c4 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -498,6 +498,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 479afb714bdf..a9eb14d0cf47 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -348,7 +348,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1762,7 +1763,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1837,6 +1838,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2193,13 +2218,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2940,6 +2964,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -3152,6 +3177,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 160080c9021d..06ee74f2c01d 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -702,6 +702,7 @@ void tcp_write_timer_handler(struct sock *sk) + tcp_timeout_expires(sk)); + return; + } ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.19/0005-hdmi.patch b/sys-kernel/gentoo-sources-6.19/0005-hdmi.patch new file mode 100644 index 0000000..47563fe --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0005-hdmi.patch @@ -0,0 +1,1729 @@ +From 663014be05bfb67ae7852cbd651afec0db18995c Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 27 Feb 2026 09:09:13 +0100 +Subject: [PATCH 5/8] hdmi + +Signed-off-by: Peter Jung +--- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 304 +++++++++++++---- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 4 + + .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 49 ++- + drivers/gpu/drm/amd/display/dc/core/dc.c | 3 + + .../gpu/drm/amd/display/dc/core/dc_resource.c | 2 +- + drivers/gpu/drm/amd/display/dc/dc.h | 1 + + drivers/gpu/drm/amd/display/dc/dc_stream.h | 2 + + drivers/gpu/drm/amd/display/dc/dc_types.h | 7 +- + drivers/gpu/drm/amd/display/dc/dm_helpers.h | 2 +- + .../amd/display/include/ddc_service_types.h | 1 + + .../amd/display/modules/freesync/freesync.c | 4 + + .../amd/display/modules/inc/mod_info_packet.h | 17 +- + .../display/modules/info_packet/info_packet.c | 307 ++++++++++++------ + drivers/gpu/drm/amd/include/amd_shared.h | 6 + + drivers/gpu/drm/drm_atomic_uapi.c | 8 + + drivers/gpu/drm/drm_connector.c | 188 +++++++++++ + drivers/gpu/drm/drm_crtc.c | 2 + + drivers/gpu/drm/drm_edid.c | 41 ++- + drivers/gpu/drm/drm_mode_config.c | 6 + + 
include/drm/drm_connector.h | 99 ++++++ + include/drm/drm_crtc.h | 9 + + include/drm/drm_mode_config.h | 6 + + 22 files changed, 872 insertions(+), 196 deletions(-) + +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index bc4d6d5009bf..bc9aca604aa0 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -2069,6 +2069,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) + if (amdgpu_dc_debug_mask & DC_SKIP_DETECTION_LT) + adev->dm.dc->debug.skip_detection_link_training = true; + ++ if (amdgpu_dc_debug_mask & DC_OVERRIDE_PCON_VRR_ID_CHECK) ++ adev->dm.dc->debug.override_pcon_vrr_id_check = true; ++ + adev->dm.dc->debug.visual_confirm = amdgpu_dc_visual_confirm; + + /* TODO: Remove after DP2 receiver gets proper support of Cable ID feature */ +@@ -7370,7 +7373,7 @@ create_stream_for_sink(struct drm_connector *connector, + update_stream_signal(stream, sink); + + if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) +- mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket); ++ mod_build_hf_vsif_infopacket(stream, &stream->hfvsif_infopacket); + + if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT || + stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST || +@@ -7829,6 +7832,8 @@ amdgpu_dm_connector_atomic_duplicate_state(struct drm_connector *connector) + __drm_atomic_helper_connector_duplicate_state(connector, &new_state->base); + + new_state->freesync_capable = state->freesync_capable; ++ new_state->freesync_on_desktop_capable = ++ state->freesync_on_desktop_capable; + new_state->abm_level = state->abm_level; + new_state->scaling = state->scaling; + new_state->underscan_enable = state->underscan_enable; +@@ -8945,6 +8950,7 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + aconnector->audio_inst = -1; + aconnector->pack_sdp_v1_3 = false; + aconnector->as_type = ADAPTIVE_SYNC_TYPE_NONE; ++ aconnector->hdmi_allm_capable = false; + memset(&aconnector->vsdb_info, 0, sizeof(aconnector->vsdb_info)); + mutex_init(&aconnector->hpd_lock); + mutex_init(&aconnector->handle_mst_msg_ready); +@@ -9035,8 +9041,10 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + connector_type == DRM_MODE_CONNECTOR_eDP) { + drm_connector_attach_hdr_output_metadata_property(&aconnector->base); + +- if (!aconnector->mst_root) ++ if (!aconnector->mst_root) { + drm_connector_attach_vrr_capable_property(&aconnector->base); ++ drm_connector_attach_passive_vrr_capable_property(&aconnector->base); ++ } + + if (adev->dm.hdcp_workqueue) + drm_connector_attach_content_protection_property(&aconnector->base, true); +@@ -9140,6 +9148,10 @@ int amdgpu_dm_initialize_hdmi_connector(struct amdgpu_dm_connector *aconnector) + struct drm_device *ddev = aconnector->base.dev; + struct device *hdmi_dev = ddev->dev; + ++ /* ALLM */ ++ drm_connector_attach_allm_capable_property(&aconnector->base); ++ drm_connector_attach_allm_mode_property(&aconnector->base); ++ + if (amdgpu_dc_debug_mask & DC_DISABLE_HDMI_CEC) { + drm_info(ddev, "HDMI-CEC feature masked\n"); + return -EINVAL; +@@ -9607,7 +9619,11 @@ static void update_freesync_state_on_stream( + + aconn = (struct amdgpu_dm_connector *)new_stream->dm_stream_context; + +- if (aconn && (aconn->as_type == FREESYNC_TYPE_PCON_IN_WHITELIST || aconn->vsdb_info.replay_mode)) { ++ if (aconn && aconn->as_type == ADAPTIVE_SYNC_TYPE_HDMI) ++ packet_type = PACKET_TYPE_VTEM; ++ ++ else if (aconn && (aconn->as_type == 
ADAPTIVE_SYNC_TYPE_PCON_ALLOWED || ++ aconn->vsdb_info.replay_mode)) { + pack_sdp_v1_3 = aconn->pack_sdp_v1_3; + + if (aconn->vsdb_info.amd_vsdb_version == 1) +@@ -10826,6 +10842,31 @@ static int amdgpu_dm_atomic_setup_commit(struct drm_atomic_state *state) + return 0; + } + ++static void update_allm_state_on_crtc_stream(struct dm_crtc_state *new_crtc_state, ++ const struct drm_connector_state *new_conn) ++{ ++ struct mod_freesync_config *config = &new_crtc_state->freesync_config; ++ struct dc_stream_state *new_stream = new_crtc_state->stream; ++ bool allm_active = false; ++ ++ switch (new_conn->allm_mode) { ++ case DRM_ALLM_MODE_ENABLED_DYNAMIC: ++ allm_active = config->state == VRR_STATE_ACTIVE_VARIABLE || ++ new_stream->content_type == DISPLAY_CONTENT_TYPE_GAME; ++ break; ++ ++ case DRM_ALLM_MODE_ENABLED_FORCED: ++ allm_active = true; ++ break; ++ ++ case DRM_ALLM_MODE_DISABLED: ++ default: ++ allm_active = false; ++ } ++ ++ new_stream->hdmi_allm_active = allm_active; ++} ++ + /** + * amdgpu_dm_atomic_commit_tail() - AMDgpu DM's commit tail implementation. + * @state: The atomic state to commit +@@ -10868,12 +10909,14 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) + for_each_oldnew_connector_in_state(state, connector, old_con_state, new_con_state, i) { + struct dm_connector_state *dm_new_con_state = to_dm_connector_state(new_con_state); + struct dm_connector_state *dm_old_con_state = to_dm_connector_state(old_con_state); ++ struct amdgpu_dm_connector *dm_conn = to_amdgpu_dm_connector(connector); + struct amdgpu_crtc *acrtc = to_amdgpu_crtc(dm_new_con_state->base.crtc); + struct dc_surface_update *dummy_updates; + struct dc_stream_update stream_update; + struct dc_info_packet hdr_packet; + struct dc_stream_status *status = NULL; + bool abm_changed, hdr_changed, scaling_changed, output_color_space_changed = false; ++ bool allm_changed = false; + + memset(&stream_update, 0, sizeof(stream_update)); + +@@ -10903,7 +10946,11 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) + hdr_changed = + !drm_connector_atomic_hdr_metadata_equal(old_con_state, new_con_state); + +- if (!scaling_changed && !abm_changed && !hdr_changed && !output_color_space_changed) ++ allm_changed = dm_conn->hdmi_allm_capable && ++ (new_con_state->allm_mode != old_con_state->allm_mode); ++ ++ if (!scaling_changed && !abm_changed && !hdr_changed && ++ !output_color_space_changed && !allm_changed) + continue; + + stream_update.stream = dm_new_crtc_state->stream; +@@ -10933,6 +10980,17 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) + stream_update.hdr_static_metadata = &hdr_packet; + } + ++ if (allm_changed) { ++ update_allm_state_on_crtc_stream(dm_new_crtc_state, new_con_state); ++ mod_build_hf_vsif_infopacket(dm_new_crtc_state->stream, ++ &dm_new_crtc_state->stream->hfvsif_infopacket); ++ ++ stream_update.hdmi_allm_active = ++ &dm_new_crtc_state->stream->hdmi_allm_active; ++ stream_update.hfvsif_infopacket = ++ &dm_new_crtc_state->stream->hfvsif_infopacket; ++ } ++ + status = dc_stream_get_status(dm_new_crtc_state->stream); + + if (WARN_ON(!status)) +@@ -11312,6 +11370,12 @@ static void get_freesync_config_for_crtc( + config.vsif_supported = true; + config.btr = true; + ++ if (new_con_state->freesync_on_desktop_capable) ++ new_crtc_state->stream->freesync_on_desktop = ++ !new_crtc_state->base.passive_vrr_disabled; ++ else ++ new_crtc_state->stream->freesync_on_desktop = false; ++ + if (fs_vid_mode) { + config.state = VRR_STATE_ACTIVE_FIXED; + 
config.fixed_refresh_in_uhz = new_crtc_state->freesync_config.fixed_refresh_in_uhz; +@@ -11323,6 +11387,7 @@ static void get_freesync_config_for_crtc( + } + } else { + config.state = VRR_STATE_UNSUPPORTED; ++ new_crtc_state->stream->freesync_on_desktop = false; + } + out: + new_crtc_state->freesync_config = config; +@@ -13114,8 +13179,8 @@ static void parse_edid_displayid_vrr(struct drm_connector *connector, + } + } + +-static int parse_amd_vsdb(struct amdgpu_dm_connector *aconnector, +- const struct edid *edid, struct amdgpu_hdmi_vsdb_info *vsdb_info) ++static int parse_amd_vsdb_did(struct amdgpu_dm_connector *aconnector, ++ const struct edid *edid, struct amdgpu_hdmi_vsdb_info *vsdb_info) + { + u8 *edid_ext = NULL; + int i; +@@ -13131,6 +13196,9 @@ static int parse_amd_vsdb(struct amdgpu_dm_connector *aconnector, + break; + } + ++ if (i == edid->extensions) ++ return false; ++ + while (j < EDID_LENGTH - sizeof(struct amd_vsdb_block)) { + struct amd_vsdb_block *amd_vsdb = (struct amd_vsdb_block *)&edid_ext[j]; + unsigned int ieeeId = (amd_vsdb->ieee_id[2] << 16) | (amd_vsdb->ieee_id[1] << 8) | (amd_vsdb->ieee_id[0]); +@@ -13149,13 +13217,13 @@ static int parse_amd_vsdb(struct amdgpu_dm_connector *aconnector, + return false; + } + +-static int parse_hdmi_amd_vsdb(struct amdgpu_dm_connector *aconnector, ++static bool parse_amd_vsdb_cea(struct amdgpu_dm_connector *aconnector, + const struct edid *edid, + struct amdgpu_hdmi_vsdb_info *vsdb_info) + { ++ struct amdgpu_hdmi_vsdb_info vsdb_local = {0}; + u8 *edid_ext = NULL; + int i; +- bool valid_vsdb_found = false; + + /*----- drm_find_cea_extension() -----*/ + /* No EDID or EDID extensions */ +@@ -13176,9 +13244,99 @@ static int parse_hdmi_amd_vsdb(struct amdgpu_dm_connector *aconnector, + if (edid_ext[0] != CEA_EXT) + return -ENODEV; + +- valid_vsdb_found = parse_edid_cea(aconnector, edid_ext, EDID_LENGTH, vsdb_info); ++ if (!parse_edid_cea(aconnector, edid_ext, EDID_LENGTH, &vsdb_local)) ++ return -ENODEV; ++ ++ *vsdb_info = vsdb_local; ++ return false; ++} ++ ++static bool is_monitor_range_invalid(const struct drm_connector *conn) ++{ ++ return conn->display_info.monitor_range.min_vfreq == 0 || ++ conn->display_info.monitor_range.max_vfreq == 0; ++} ++ ++/* ++ * Returns true if (max_vfreq - min_vfreq) > 10 ++ */ ++static bool is_freesync_capable(const struct drm_monitor_range_info *range) ++{ ++ return (range->max_vfreq - range->min_vfreq) > 10; ++} ++ ++static void monitor_range_from_vsdb(struct drm_display_info *display, ++ const struct amdgpu_hdmi_vsdb_info *vsdb) ++{ ++ display->monitor_range.min_vfreq = vsdb->min_refresh_rate_hz; ++ display->monitor_range.max_vfreq = vsdb->max_refresh_rate_hz; ++} ++ ++/** ++ * Get VRR range from HDMI VRR info in EDID. If VRRmax == 0, ++ * try getting upper bound from AMD vsdb. ++ * ++ * @conn: drm_connector with HDMI VRR info ++ * @vsdb: AMD vsdb from CAE ++ */ ++static void monitor_range_from_hdmi(struct drm_display_info *display, ++ const struct amdgpu_hdmi_vsdb_info *vsdb) ++{ ++ u16 vrr_max = display->hdmi.vrr_cap.vrr_max; ++ ++ /* Try getting upper vrr bound from AMD vsdb */ ++ if (vrr_max == 0) ++ vrr_max = vsdb->max_refresh_rate_hz; ++ ++ /* Use max possible BRR value as a last resort */ ++ if (vrr_max == 0) ++ vrr_max = VTEM_BRR_MAX; + +- return valid_vsdb_found ? 
i : -ENODEV; ++ display->monitor_range.min_vfreq = display->hdmi.vrr_cap.vrr_min; ++ display->monitor_range.max_vfreq = vrr_max; ++} ++ ++/* ++ * Returns true if connector is capable of freesync ++ * Optionally, can fetch the range from AMD vsdb ++ */ ++static bool copy_range_to_amdgpu_connector(struct drm_connector *conn) ++{ ++ struct amdgpu_dm_connector *aconn = to_amdgpu_dm_connector(conn); ++ struct drm_monitor_range_info *range = &conn->display_info.monitor_range; ++ ++ aconn->min_vfreq = range->min_vfreq; ++ aconn->max_vfreq = range->max_vfreq; ++ ++ return is_freesync_capable(range); ++} ++ ++static void extend_range_from_vsdb(struct drm_display_info *display, ++ const struct amdgpu_hdmi_vsdb_info *vsdb) ++{ ++ u16 vrr_min = display->monitor_range.min_vfreq; ++ u16 vrr_max = display->monitor_range.max_vfreq; ++ ++ /* Always extend upper limit */ ++ if (vsdb->max_refresh_rate_hz > vrr_max) ++ vrr_max = vsdb->max_refresh_rate_hz; ++ ++ /* ++ * Only extend lower limit if current one disables LFC. ++ ++ * During widespread testing, we found that some manufacturers probably ++ * had issues with their monitors' lower VRR boundaries and adjusted ++ * them up (Gigabyte X34GS with official range 48 - 180, AMD vsdb 48 - ++ * 180 yet Monitor Ranges 55 - 180). After setting the lower boundary ++ * from AMD vsdb, such monitors start having blanking issues. ++ * ++ * Work around that by not touching VRR min if it still supports LFC. ++ */ ++ if (vsdb->min_refresh_rate_hz < vrr_min && (vrr_min * 2 >= vrr_max)) ++ vrr_min = vsdb->min_refresh_rate_hz; ++ ++ display->monitor_range.min_vfreq = vrr_min; ++ display->monitor_range.max_vfreq = vrr_max; + } + + /** +@@ -13195,16 +13353,20 @@ static int parse_hdmi_amd_vsdb(struct amdgpu_dm_connector *aconnector, + void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, + const struct drm_edid *drm_edid) + { +- int i = 0; + struct amdgpu_dm_connector *amdgpu_dm_connector = + to_amdgpu_dm_connector(connector); + struct dm_connector_state *dm_con_state = NULL; + struct dc_sink *sink; + struct amdgpu_device *adev = drm_to_adev(connector->dev); + struct amdgpu_hdmi_vsdb_info vsdb_info = {0}; ++ struct amdgpu_hdmi_vsdb_info vsdb_did = {0}; ++ struct drm_hdmi_vrr_cap hdmi_vrr = {0}; ++ struct dpcd_caps dpcd_caps = {0}; + const struct edid *edid; ++ bool freesync_on_desktop = false; + bool freesync_capable = false; +- enum adaptive_sync_type as_type = ADAPTIVE_SYNC_TYPE_NONE; ++ bool pcon_allowed = false; ++ bool is_pcon = false; + + if (!connector->state) { + drm_err(adev_to_drm(adev), "%s - Connector has no state", __func__); +@@ -13232,68 +13394,77 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, + if (!adev->dm.freesync_module || !dc_supports_vrr(sink->ctx->dce_version)) + goto update; + ++ /* Gather all data */ + edid = drm_edid_raw(drm_edid); // FIXME: Get rid of drm_edid_raw() ++ parse_amd_vsdb_cea(amdgpu_dm_connector, edid, &vsdb_info); ++ hdmi_vrr = connector->display_info.hdmi.vrr_cap; ++ ++ if (amdgpu_dm_connector->dc_link) { ++ dpcd_caps = amdgpu_dm_connector->dc_link->dpcd_caps; ++ is_pcon = dpcd_caps.dongle_type == DISPLAY_DONGLE_DP_HDMI_CONVERTER; ++ pcon_allowed = dm_helpers_is_vrr_pcon_allowed( ++ amdgpu_dm_connector->dc_link, connector->dev); ++ } ++ ++ /* DP & eDP excluding PCONs */ ++ if ((sink->sink_signal == SIGNAL_TYPE_EDP || ++ sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT) && !is_pcon) { ++ /* Some eDP panels only have the refresh rate range info in DisplayID */ ++ if 
(is_monitor_range_invalid(connector)) ++ parse_edid_displayid_vrr(connector, edid); ++ /* ++ * Many monitors expose AMD vsdb in CAE even for DP and their ++ * monitor ranges do not contain Range Limits Only flag ++ */ ++ if (is_monitor_range_invalid(connector)) ++ monitor_range_from_vsdb(&connector->display_info, &vsdb_info); + +- /* Some eDP panels only have the refresh rate range info in DisplayID */ +- if ((connector->display_info.monitor_range.min_vfreq == 0 || +- connector->display_info.monitor_range.max_vfreq == 0)) +- parse_edid_displayid_vrr(connector, edid); +- +- if (edid && (sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT || +- sink->sink_signal == SIGNAL_TYPE_EDP)) { +- if (amdgpu_dm_connector->dc_link && +- amdgpu_dm_connector->dc_link->dpcd_caps.allow_invalid_MSA_timing_param) { +- amdgpu_dm_connector->min_vfreq = connector->display_info.monitor_range.min_vfreq; +- amdgpu_dm_connector->max_vfreq = connector->display_info.monitor_range.max_vfreq; +- if (amdgpu_dm_connector->max_vfreq - amdgpu_dm_connector->min_vfreq > 10) +- freesync_capable = true; +- } ++ /* Try extending range if found in AMD vsdb */ ++ extend_range_from_vsdb(&connector->display_info, &vsdb_info); + +- parse_amd_vsdb(amdgpu_dm_connector, edid, &vsdb_info); ++ if (dpcd_caps.allow_invalid_MSA_timing_param) ++ freesync_capable = copy_range_to_amdgpu_connector(connector); + +- if (vsdb_info.replay_mode) { +- amdgpu_dm_connector->vsdb_info.replay_mode = vsdb_info.replay_mode; +- amdgpu_dm_connector->vsdb_info.amd_vsdb_version = vsdb_info.amd_vsdb_version; ++ /* eDP */ ++ parse_amd_vsdb_did(amdgpu_dm_connector, edid, &vsdb_did); ++ if (vsdb_did.replay_mode) { ++ amdgpu_dm_connector->vsdb_info.replay_mode = vsdb_did.replay_mode; ++ amdgpu_dm_connector->vsdb_info.amd_vsdb_version = vsdb_did.amd_vsdb_version; + amdgpu_dm_connector->as_type = ADAPTIVE_SYNC_TYPE_EDP; + } + +- } else if (drm_edid && sink->sink_signal == SIGNAL_TYPE_HDMI_TYPE_A) { +- i = parse_hdmi_amd_vsdb(amdgpu_dm_connector, edid, &vsdb_info); +- if (i >= 0 && vsdb_info.freesync_supported) { +- amdgpu_dm_connector->min_vfreq = vsdb_info.min_refresh_rate_hz; +- amdgpu_dm_connector->max_vfreq = vsdb_info.max_refresh_rate_hz; +- if (amdgpu_dm_connector->max_vfreq - amdgpu_dm_connector->min_vfreq > 10) +- freesync_capable = true; +- +- connector->display_info.monitor_range.min_vfreq = vsdb_info.min_refresh_rate_hz; +- connector->display_info.monitor_range.max_vfreq = vsdb_info.max_refresh_rate_hz; +- } +- } +- +- if (amdgpu_dm_connector->dc_link) +- as_type = dm_get_adaptive_sync_support_type(amdgpu_dm_connector->dc_link); +- +- if (as_type == FREESYNC_TYPE_PCON_IN_WHITELIST) { +- i = parse_hdmi_amd_vsdb(amdgpu_dm_connector, edid, &vsdb_info); +- if (i >= 0 && vsdb_info.freesync_supported && vsdb_info.amd_vsdb_version > 0) { +- +- amdgpu_dm_connector->pack_sdp_v1_3 = true; +- amdgpu_dm_connector->as_type = as_type; ++ /* HDMI */ ++ } else if (sink->sink_signal == SIGNAL_TYPE_HDMI_TYPE_A) { ++ /* Prefer HDMI VRR */ ++ if (hdmi_vrr.supported) { ++ amdgpu_dm_connector->as_type = ADAPTIVE_SYNC_TYPE_HDMI; ++ monitor_range_from_hdmi(&connector->display_info, &vsdb_info); ++ } else if (vsdb_info.freesync_supported) ++ monitor_range_from_vsdb(&connector->display_info, &vsdb_info); ++ ++ freesync_capable = copy_range_to_amdgpu_connector(connector); ++ freesync_on_desktop = freesync_capable; ++ ++ /* DP -> HDMI PCON */ ++ } else if (pcon_allowed) { ++ /* Prefer HDMI VRR */ ++ if (hdmi_vrr.supported) ++ monitor_range_from_hdmi(&connector->display_info, &vsdb_info); 
++ else if (vsdb_info.freesync_supported) { + amdgpu_dm_connector->vsdb_info = vsdb_info; +- +- amdgpu_dm_connector->min_vfreq = vsdb_info.min_refresh_rate_hz; +- amdgpu_dm_connector->max_vfreq = vsdb_info.max_refresh_rate_hz; +- if (amdgpu_dm_connector->max_vfreq - amdgpu_dm_connector->min_vfreq > 10) +- freesync_capable = true; +- +- connector->display_info.monitor_range.min_vfreq = vsdb_info.min_refresh_rate_hz; +- connector->display_info.monitor_range.max_vfreq = vsdb_info.max_refresh_rate_hz; ++ monitor_range_from_vsdb(&connector->display_info, &vsdb_info); + } ++ ++ amdgpu_dm_connector->pack_sdp_v1_3 = true; ++ amdgpu_dm_connector->as_type = ADAPTIVE_SYNC_TYPE_PCON_ALLOWED; ++ freesync_capable = copy_range_to_amdgpu_connector(connector); + } + + update: +- if (dm_con_state) ++ if (dm_con_state) { + dm_con_state->freesync_capable = freesync_capable; ++ dm_con_state->freesync_on_desktop_capable = freesync_on_desktop; ++ } + + if (connector->state && amdgpu_dm_connector->dc_link && !freesync_capable && + amdgpu_dm_connector->dc_link->replay_settings.config.replay_supported) { +@@ -13302,8 +13473,15 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, + } + + if (connector->vrr_capable_property) +- drm_connector_set_vrr_capable_property(connector, +- freesync_capable); ++ drm_connector_set_vrr_capable_property(connector, freesync_capable); ++ ++ if (connector->passive_vrr_capable_property) ++ drm_connector_set_passive_vrr_capable_property(connector, freesync_on_desktop); ++ ++ amdgpu_dm_connector->hdmi_allm_capable = connector->display_info.hdmi.allm; ++ if (connector->allm_capable_property) ++ drm_connector_set_allm_capable_property( ++ connector, connector->display_info.hdmi.allm); + } + + void amdgpu_dm_trigger_timing_sync(struct drm_device *dev) +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +index beb0d04d3e68..6376d12acb72 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +@@ -828,6 +828,9 @@ struct amdgpu_dm_connector { + unsigned int hdmi_hpd_debounce_delay_ms; + struct delayed_work hdmi_hpd_debounce_work; + struct dc_sink *hdmi_prev_sink; ++ ++ /* HDMI ALLM */ ++ bool hdmi_allm_capable; + }; + + static inline void amdgpu_dm_set_mst_status(uint8_t *status, +@@ -1001,6 +1004,7 @@ struct dm_connector_state { + uint8_t underscan_hborder; + bool underscan_enable; + bool freesync_capable; ++ bool freesync_on_desktop_capable; + bool update_hdcp; + bool abm_sysfs_forbidden; + uint8_t abm_level; +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +index e5e993d3ef74..6413f2a587d5 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +@@ -137,7 +137,12 @@ enum dc_edid_status dm_helpers_parse_edid_caps( + edid_caps->display_name, + AUDIO_INFO_DISPLAY_NAME_SIZE_IN_CHARS); + +- edid_caps->edid_hdmi = connector->display_info.is_hdmi; ++ if (connector->display_info.is_hdmi) { ++ edid_caps->edid_hdmi = true; ++ edid_caps->allm = connector->display_info.hdmi.allm; ++ edid_caps->fva = connector->display_info.hdmi.vrr_cap.fva; ++ edid_caps->hdmi_vrr = connector->display_info.hdmi.vrr_cap.supported; ++ } + + if (edid_caps->edid_hdmi) + populate_hdmi_info_from_connector(&connector->display_info.hdmi, edid_caps); +@@ -1375,40 +1380,32 @@ void dm_helpers_dp_mst_update_branch_bandwidth( 
+ // TODO + } + +-static bool dm_is_freesync_pcon_whitelist(const uint32_t branch_dev_id) ++bool dm_helpers_is_vrr_pcon_allowed(const struct dc_link *link, const struct drm_device *dev) + { +- bool ret_val = false; ++ if (link->dpcd_caps.dongle_type != DISPLAY_DONGLE_DP_HDMI_CONVERTER) ++ return false; + +- switch (branch_dev_id) { ++ if (!link->dpcd_caps.allow_invalid_MSA_timing_param) ++ return false; ++ ++ if (!link->dpcd_caps.adaptive_sync_caps.dp_adap_sync_caps.bits.ADAPTIVE_SYNC_SDP_SUPPORT) ++ return false; ++ ++ switch (link->dpcd_caps.branch_dev_id) { + case DP_BRANCH_DEVICE_ID_0060AD: + case DP_BRANCH_DEVICE_ID_00E04C: + case DP_BRANCH_DEVICE_ID_90CC24: +- ret_val = true; +- break; +- default: +- break; ++ case DP_BRANCH_DEVICE_ID_2B02F0: ++ return true; + } + +- return ret_val; +-} +- +-enum adaptive_sync_type dm_get_adaptive_sync_support_type(struct dc_link *link) +-{ +- struct dpcd_caps *dpcd_caps = &link->dpcd_caps; +- enum adaptive_sync_type as_type = ADAPTIVE_SYNC_TYPE_NONE; +- +- switch (dpcd_caps->dongle_type) { +- case DISPLAY_DONGLE_DP_HDMI_CONVERTER: +- if (dpcd_caps->adaptive_sync_caps.dp_adap_sync_caps.bits.ADAPTIVE_SYNC_SDP_SUPPORT == true && +- dpcd_caps->allow_invalid_MSA_timing_param == true && +- dm_is_freesync_pcon_whitelist(dpcd_caps->branch_dev_id)) +- as_type = FREESYNC_TYPE_PCON_IN_WHITELIST; +- break; +- default: +- break; ++ if (link->dc->debug.override_pcon_vrr_id_check) { ++ drm_info(dev, "Overriding VRR PCON check for ID: 0x%06x\n", ++ link->dpcd_caps.branch_dev_id); ++ return true; + } + +- return as_type; ++ return false; + } + + bool dm_helpers_is_fullscreen(struct dc_context *ctx, struct dc_stream_state *stream) +diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c +index 8be9cbd43e18..b1db19175928 100644 +--- a/drivers/gpu/drm/amd/display/dc/core/dc.c ++++ b/drivers/gpu/drm/amd/display/dc/core/dc.c +@@ -3287,6 +3287,9 @@ static void copy_stream_update_to_stream(struct dc *dc, + if (update->vrr_active_fixed) + stream->vrr_active_fixed = *update->vrr_active_fixed; + ++ if (update->hdmi_allm_active) ++ stream->hdmi_allm_active = *update->hdmi_allm_active; ++ + if (update->crtc_timing_adjust) { + if (stream->adjust.v_total_min != update->crtc_timing_adjust->v_total_min || + stream->adjust.v_total_max != update->crtc_timing_adjust->v_total_max || +diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +index 848c267ef11e..230ada389e3a 100644 +--- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c ++++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +@@ -4659,7 +4659,7 @@ static void set_avi_info_frame( + vic = 0; + format = stream->timing.timing_3d_format; + /*todo, add 3DStereo support*/ +- if (format != TIMING_3D_FORMAT_NONE) { ++ if (format != TIMING_3D_FORMAT_NONE || stream->hdmi_allm_active) { + // Based on HDMI specs hdmi vic needs to be converted to cea vic when 3D is enabled + switch (pipe_ctx->stream->timing.hdmi_vic) { + case 1: +diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h +index 0a9758a04258..f120dd5c05c6 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc.h ++++ b/drivers/gpu/drm/amd/display/dc/dc.h +@@ -1039,6 +1039,7 @@ struct dc_debug_options { + bool scl_reset_length10; + bool hdmi20_disable; + bool skip_detection_link_training; ++ bool override_pcon_vrr_id_check; + uint32_t edid_read_retry_times; + unsigned int force_odm_combine; //bit vector based on otg inst + unsigned int 
seamless_boot_odm_combine; +diff --git a/drivers/gpu/drm/amd/display/dc/dc_stream.h b/drivers/gpu/drm/amd/display/dc/dc_stream.h +index 321cfe92d799..e69c17413835 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_stream.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_stream.h +@@ -242,6 +242,7 @@ struct dc_stream_state { + bool vrr_active_variable; + bool freesync_on_desktop; + bool vrr_active_fixed; ++ bool hdmi_allm_active; + + bool converter_disable_audio; + uint8_t qs_bit; +@@ -343,6 +344,7 @@ struct dc_stream_update { + bool *allow_freesync; + bool *vrr_active_variable; + bool *vrr_active_fixed; ++ bool *hdmi_allm_active; + + struct colorspace_transform *gamut_remap; + enum dc_color_space *output_color_space; +diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h +index 3e63d7bda166..57811bc85071 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_types.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_types.h +@@ -210,9 +210,14 @@ struct dc_edid_caps { + + uint32_t max_tmds_clk_mhz; + +- /*HDMI 2.0 caps*/ ++ /* HDMI 2.0 caps */ + bool lte_340mcsc_scramble; + ++ /* HDMI 2.1 caps */ ++ bool allm; ++ bool fva; ++ bool hdmi_vrr; ++ + bool edid_hdmi; + bool hdr_supported; + bool rr_capable; +diff --git a/drivers/gpu/drm/amd/display/dc/dm_helpers.h b/drivers/gpu/drm/amd/display/dc/dm_helpers.h +index 9d160b39e8c5..ea94c52d2b87 100644 +--- a/drivers/gpu/drm/amd/display/dc/dm_helpers.h ++++ b/drivers/gpu/drm/amd/display/dc/dm_helpers.h +@@ -219,10 +219,10 @@ int dm_helpers_dmub_set_config_sync(struct dc_context *ctx, + const struct dc_link *link, + struct set_config_cmd_payload *payload, + enum set_config_status *operation_result); +-enum adaptive_sync_type dm_get_adaptive_sync_support_type(struct dc_link *link); + + enum dc_edid_status dm_helpers_get_sbios_edid(struct dc_link *link, struct dc_edid *edid); + ++bool dm_helpers_is_vrr_pcon_allowed(const struct dc_link *link, const struct drm_device *dev); + bool dm_helpers_is_fullscreen(struct dc_context *ctx, struct dc_stream_state *stream); + bool dm_helpers_is_hdr_on(struct dc_context *ctx, struct dc_stream_state *stream); + +diff --git a/drivers/gpu/drm/amd/display/include/ddc_service_types.h b/drivers/gpu/drm/amd/display/include/ddc_service_types.h +index 1c603b12957f..e838f7c1269c 100644 +--- a/drivers/gpu/drm/amd/display/include/ddc_service_types.h ++++ b/drivers/gpu/drm/amd/display/include/ddc_service_types.h +@@ -36,6 +36,7 @@ + #define DP_BRANCH_DEVICE_ID_006037 0x006037 + #define DP_BRANCH_DEVICE_ID_001CF8 0x001CF8 + #define DP_BRANCH_DEVICE_ID_0060AD 0x0060AD ++#define DP_BRANCH_DEVICE_ID_2B02F0 0x2B02F0 /* Chrontel CH7218 */ + #define DP_BRANCH_HW_REV_10 0x10 + #define DP_BRANCH_HW_REV_20 0x20 + +diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +index 1aae46d703ba..db197cf048e1 100644 +--- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c ++++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +@@ -27,6 +27,7 @@ + #include "dc.h" + #include "mod_freesync.h" + #include "core_types.h" ++#include "mod_info_packet.h" + + #define MOD_FREESYNC_MAX_CONCURRENT_STREAMS 32 + +@@ -955,6 +956,9 @@ void mod_freesync_build_vrr_infopacket(struct mod_freesync *mod_freesync, + return; + + switch (packet_type) { ++ case PACKET_TYPE_VTEM: ++ mod_build_vtem_infopacket(stream, vrr, infopacket); ++ break; + case PACKET_TYPE_FS_V3: + build_vrr_infopacket_v3(stream->signal, vrr, app_tf, infopacket, stream->freesync_on_desktop); + break; 
+diff --git a/drivers/gpu/drm/amd/display/modules/inc/mod_info_packet.h b/drivers/gpu/drm/amd/display/modules/inc/mod_info_packet.h +index 66dc9a19aebe..89d412772d16 100644 +--- a/drivers/gpu/drm/amd/display/modules/inc/mod_info_packet.h ++++ b/drivers/gpu/drm/amd/display/modules/inc/mod_info_packet.h +@@ -33,6 +33,8 @@ struct dc_stream_state; + struct dc_info_packet; + struct mod_vrr_params; + ++#define VTEM_BRR_MAX 1023 ++ + void mod_build_vsc_infopacket(const struct dc_stream_state *stream, + struct dc_info_packet *info_packet, + enum dc_color_space cs, +@@ -41,12 +43,17 @@ void mod_build_vsc_infopacket(const struct dc_stream_state *stream, + void mod_build_hf_vsif_infopacket(const struct dc_stream_state *stream, + struct dc_info_packet *info_packet); + ++void mod_build_vtem_infopacket(const struct dc_stream_state *stream, ++ const struct mod_vrr_params *vrr, ++ struct dc_info_packet *infopacket); ++ + enum adaptive_sync_type { +- ADAPTIVE_SYNC_TYPE_NONE = 0, +- ADAPTIVE_SYNC_TYPE_DP = 1, +- FREESYNC_TYPE_PCON_IN_WHITELIST = 2, +- FREESYNC_TYPE_PCON_NOT_IN_WHITELIST = 3, +- ADAPTIVE_SYNC_TYPE_EDP = 4, ++ ADAPTIVE_SYNC_TYPE_NONE = 0, ++ ADAPTIVE_SYNC_TYPE_DP = 1, ++ ADAPTIVE_SYNC_TYPE_PCON_ALLOWED = 2, ++ ADAPTIVE_SYNC_TYPE_PCON_NOT_ALLOWED = 3, ++ ADAPTIVE_SYNC_TYPE_EDP = 4, ++ ADAPTIVE_SYNC_TYPE_HDMI = 5, + }; + + enum adaptive_sync_sdp_version { +diff --git a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c +index b3d55cac3569..a16a94dffa8d 100644 +--- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c ++++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c +@@ -44,8 +44,12 @@ enum vsc_packet_revision { + vsc_packet_rev5 = 5, + }; + ++#define HDMI_INFOFRAME_TYPE_EMP 0x7F + #define HDMI_INFOFRAME_TYPE_VENDOR 0x81 +-#define HF_VSIF_VERSION 1 ++#define HDMI_INFOFRAME_LENGTH_MASK 0x1F ++#define HF_VSIF_VERSION 1 ++#define HF_VSIF_3D_BIT 0 ++#define HF_VSIF_ALLM_BIT 1 + + // VTEM Byte Offset + #define VTEM_PB0 0 +@@ -56,64 +60,51 @@ enum vsc_packet_revision { + #define VTEM_PB5 5 + #define VTEM_PB6 6 + +-#define VTEM_MD0 7 +-#define VTEM_MD1 8 +-#define VTEM_MD2 9 +-#define VTEM_MD3 10 +- +- +-// VTEM Byte Masks +-//PB0 +-#define MASK_VTEM_PB0__RESERVED0 0x01 +-#define MASK_VTEM_PB0__SYNC 0x02 +-#define MASK_VTEM_PB0__VFR 0x04 +-#define MASK_VTEM_PB0__AFR 0x08 +-#define MASK_VTEM_PB0__DS_TYPE 0x30 +- //0: Periodic pseudo-static EM Data Set +- //1: Periodic dynamic EM Data Set +- //2: Unique EM Data Set +- //3: Reserved +-#define MASK_VTEM_PB0__END 0x40 +-#define MASK_VTEM_PB0__NEW 0x80 +- +-//PB1 +-#define MASK_VTEM_PB1__RESERVED1 0xFF +- +-//PB2 +-#define MASK_VTEM_PB2__ORGANIZATION_ID 0xFF +- //0: This is a Vendor Specific EM Data Set +- //1: This EM Data Set is defined by This Specification (HDMI 2.1 r102.clean) +- //2: This EM Data Set is defined by CTA-861-G +- //3: This EM Data Set is defined by VESA +-//PB3 +-#define MASK_VTEM_PB3__DATA_SET_TAG_MSB 0xFF +-//PB4 +-#define MASK_VTEM_PB4__DATA_SET_TAG_LSB 0xFF +-//PB5 +-#define MASK_VTEM_PB5__DATA_SET_LENGTH_MSB 0xFF +-//PB6 +-#define MASK_VTEM_PB6__DATA_SET_LENGTH_LSB 0xFF +- +- +- +-//PB7-27 (20 bytes): +-//PB7 = MD0 +-#define MASK_VTEM_MD0__VRR_EN 0x01 +-#define MASK_VTEM_MD0__M_CONST 0x02 +-#define MASK_VTEM_MD0__QMS_EN 0x04 +-#define MASK_VTEM_MD0__RESERVED2 0x08 +-#define MASK_VTEM_MD0__FVA_FACTOR_M1 0xF0 +- +-//MD1 +-#define MASK_VTEM_MD1__BASE_VFRONT 0xFF +- +-//MD2 +-#define MASK_VTEM_MD2__BASE_REFRESH_RATE_98 0x03 +-#define 
MASK_VTEM_MD2__RB 0x04 +-#define MASK_VTEM_MD2__NEXT_TFR 0xF8 +- +-//MD3 +-#define MASK_VTEM_MD3__BASE_REFRESH_RATE_07 0xFF ++#define VTEM_ORG_ID 1 ++#define VTEM_DATA_SET_TAG 1 ++#define VTEM_DATA_SET_LENGTH 4 ++ ++#define VTEM_M_CONST 0 ++#define VTEM_FVA_FACTOR 0 ++ ++#define VTEM_BRR_MASK_UPPER 0x03 ++#define VTEM_BRR_MASK_LOWER 0xFF ++ ++/* VTEM Byte Offset */ ++#define VTEM_PB0 0 ++#define VTEM_PB1 1 ++#define VTEM_PB2 2 ++#define VTEM_PB3 3 ++#define VTEM_PB4 4 ++#define VTEM_PB5 5 ++#define VTEM_PB6 6 ++ ++#define VTEM_MD0 7 ++#define VTEM_MD1 8 ++#define VTEM_MD2 9 ++#define VTEM_MD3 10 ++ ++/* Extended Metadata Packet */ ++/* Header */ ++#define EMP_LAST_BIT 6 ++#define EMP_FIRST_BIT 7 ++/* PB0 */ ++#define EMP_SNC_BIT 1 ++#define EMP_VFR_BIT 2 ++#define EMP_AFR_BIT 3 ++#define EMP_DST_BIT 4 ++#define EMP_END_BIT 6 ++#define EMP_NEW_BIT 7 ++/* PB7 = MD0 */ ++#define VTEM_VRR_BIT 0 ++#define VTEM_M_CONST_BIT 1 ++#define VTEM_FVA_BIT 4 ++/* MD1 Base_Vfront */ ++/* MD2 */ ++#define VTEM_BRR_UPPER_BIT 0 ++#define VTEM_RB_BIT 2 ++/* MD3 BRR Lower */ ++ + + enum ColorimetryRGBDP { + ColorimetryRGB_DP_sRGB = 0, +@@ -441,9 +432,29 @@ void mod_build_vsc_infopacket(const struct dc_stream_state *stream, + } + } + ++static bool is_hdmi_vic_mode(const struct dc_stream_state *stream) ++{ ++ if (stream->timing.hdmi_vic == 0) ++ return false; ++ ++ if (stream->timing.h_total < 3840 || ++ stream->timing.v_total < 2160) ++ return false; ++ ++ /* 3D/ALLM forces HDMI VIC -> CTA VIC translation */ ++ if (stream->view_format != VIEW_3D_FORMAT_NONE) ++ return false; ++ ++ if (stream->hdmi_allm_active) ++ return false; ++ ++ return true; ++} ++ + /** + * mod_build_hf_vsif_infopacket - Prepare HDMI Vendor Specific info frame. + * Follows HDMI Spec to build up Vendor Specific info frame ++ * Conforms to h14b-vsif or hf-vsif based on the capabilities + * + * @stream: contains data we may need to construct VSIF (i.e. timing_3d_format, etc.) + * @info_packet: output structure where to store VSIF +@@ -451,63 +462,76 @@ void mod_build_vsc_infopacket(const struct dc_stream_state *stream, + void mod_build_hf_vsif_infopacket(const struct dc_stream_state *stream, + struct dc_info_packet *info_packet) + { +- unsigned int length = 5; + bool hdmi_vic_mode = false; ++ bool allm = false; ++ bool stereo = false; + uint8_t checksum = 0; +- uint32_t i = 0; ++ uint8_t offset = 0; ++ uint8_t i = 0; ++ uint8_t length = 5; ++ uint32_t oui = HDMI_IEEE_OUI; + enum dc_timing_3d_format format; + + info_packet->valid = false; +- format = stream->timing.timing_3d_format; +- if (stream->view_format == VIEW_3D_FORMAT_NONE) +- format = TIMING_3D_FORMAT_NONE; + +- if (stream->timing.hdmi_vic != 0 +- && stream->timing.h_total >= 3840 +- && stream->timing.v_total >= 2160 +- && format == TIMING_3D_FORMAT_NONE) +- hdmi_vic_mode = true; ++ allm = stream->hdmi_allm_active; ++ format = stream->view_format == VIEW_3D_FORMAT_NONE ? 
++ TIMING_3D_FORMAT_NONE : ++ stream->timing.timing_3d_format; ++ stereo = format != TIMING_3D_FORMAT_NONE; ++ hdmi_vic_mode = is_hdmi_vic_mode(stream); + +- if ((format == TIMING_3D_FORMAT_NONE) && !hdmi_vic_mode) ++ if (!stereo && !hdmi_vic_mode && !allm) + return; + +- info_packet->sb[1] = 0x03; +- info_packet->sb[2] = 0x0C; +- info_packet->sb[3] = 0x00; ++ if (allm) ++ oui = HDMI_FORUM_IEEE_OUI; + +- if (format != TIMING_3D_FORMAT_NONE) +- info_packet->sb[4] = (2 << 5); ++ info_packet->sb[1] = oui & 0xFF; ++ info_packet->sb[2] = (oui >> 8) & 0xFF; ++ info_packet->sb[3] = (oui >> 16) & 0xFF; + +- else if (hdmi_vic_mode) +- info_packet->sb[4] = (1 << 5); ++ if (oui == HDMI_FORUM_IEEE_OUI) { ++ offset = 2; ++ length += 2; ++ info_packet->sb[4] = HF_VSIF_VERSION; ++ info_packet->sb[5] = stereo << HF_VSIF_3D_BIT; ++ info_packet->sb[5] |= allm << HF_VSIF_ALLM_BIT; ++ } + +- switch (format) { +- case TIMING_3D_FORMAT_HW_FRAME_PACKING: +- case TIMING_3D_FORMAT_SW_FRAME_PACKING: +- info_packet->sb[5] = (0x0 << 4); +- break; ++ if (stereo) { ++ info_packet->sb[4 + offset] = (2 << 5); + +- case TIMING_3D_FORMAT_SIDE_BY_SIDE: +- case TIMING_3D_FORMAT_SBS_SW_PACKED: +- info_packet->sb[5] = (0x8 << 4); +- length = 6; +- break; ++ switch (format) { ++ case TIMING_3D_FORMAT_HW_FRAME_PACKING: ++ case TIMING_3D_FORMAT_SW_FRAME_PACKING: ++ info_packet->sb[5 + offset] = (0x0 << 4); ++ break; + +- case TIMING_3D_FORMAT_TOP_AND_BOTTOM: +- case TIMING_3D_FORMAT_TB_SW_PACKED: +- info_packet->sb[5] = (0x6 << 4); +- break; ++ case TIMING_3D_FORMAT_SIDE_BY_SIDE: ++ case TIMING_3D_FORMAT_SBS_SW_PACKED: ++ info_packet->sb[5 + offset] = (0x8 << 4); ++ ++length; ++ break; + +- default: +- break; +- } ++ case TIMING_3D_FORMAT_TOP_AND_BOTTOM: ++ case TIMING_3D_FORMAT_TB_SW_PACKED: ++ info_packet->sb[5 + offset] = (0x6 << 4); ++ break; ++ ++ default: ++ break; ++ } + +- if (hdmi_vic_mode) ++ /* Doesn't need the offset as it can't be used with hf-vsif */ ++ } else if (hdmi_vic_mode) { ++ info_packet->sb[4] = (1 << 5); + info_packet->sb[5] = stream->timing.hdmi_vic; ++ } + + info_packet->hb0 = HDMI_INFOFRAME_TYPE_VENDOR; + info_packet->hb1 = 0x01; +- info_packet->hb2 = (uint8_t) (length); ++ info_packet->hb2 = length & HDMI_INFOFRAME_LENGTH_MASK; + + checksum += info_packet->hb0; + checksum += info_packet->hb1; +@@ -521,6 +545,92 @@ void mod_build_hf_vsif_infopacket(const struct dc_stream_state *stream, + info_packet->valid = true; + } + ++static void build_vtem_infopacket_header(struct dc_info_packet *infopacket) ++{ ++ uint8_t pb0 = 0; ++ ++ /* might need logic in the future */ ++ pb0 |= 0 << EMP_SNC_BIT; ++ pb0 |= 1 << EMP_VFR_BIT; ++ pb0 |= 0 << EMP_AFR_BIT; ++ pb0 |= 0 << EMP_DST_BIT; ++ pb0 |= 0 << EMP_END_BIT; ++ pb0 |= 1 << EMP_NEW_BIT; ++ ++ infopacket->hb0 = HDMI_INFOFRAME_TYPE_EMP; ++ infopacket->hb1 = (1 << EMP_FIRST_BIT) | (1 << EMP_LAST_BIT); ++ infopacket->hb2 = 0; // sequence ++ ++ infopacket->sb[VTEM_PB0] = pb0; ++ infopacket->sb[VTEM_PB2] = VTEM_ORG_ID; ++ infopacket->sb[VTEM_PB4] = VTEM_DATA_SET_TAG; ++ infopacket->sb[VTEM_PB6] = VTEM_DATA_SET_LENGTH; ++} ++ ++static void build_vtem_infopacket_data(const struct dc_stream_state *stream, ++ const struct mod_vrr_params *vrr, ++ struct dc_info_packet *infopacket) ++{ ++ unsigned int hblank = 0; ++ unsigned int brr = 0; ++ bool vrr_active = false; ++ bool rb = false; ++ ++ /* ++ * Enables FreeSync-like behavior by keeping HDMI VRR signalling active ++ * in fixed refresh rate conditions like normal desktop work/web browsing. 
++ * Functionally behaves like non-VRR mode by keeping the actual refresh
++ * rate fixed.
++ */
++ if (stream->freesync_on_desktop) {
++ vrr_active = vrr->state != VRR_STATE_DISABLED &&
++ vrr->state != VRR_STATE_UNSUPPORTED;
++ } else {
++ vrr_active = vrr->state == VRR_STATE_ACTIVE_VARIABLE ||
++ vrr->state == VRR_STATE_ACTIVE_FIXED;
++ }
++
++ infopacket->sb[VTEM_MD0] = VTEM_M_CONST << VTEM_M_CONST_BIT;
++ infopacket->sb[VTEM_MD0] |= VTEM_FVA_FACTOR << VTEM_FVA_BIT;
++ infopacket->sb[VTEM_MD0] |= vrr_active << VTEM_VRR_BIT;
++
++ infopacket->sb[VTEM_MD1] = 0;
++ infopacket->sb[VTEM_MD2] = 0;
++ infopacket->sb[VTEM_MD3] = 0;
++
++ if (!vrr_active || is_hdmi_vic_mode(stream))
++ return;
++ /*
++ * In accordance with CVT 1.2 and CVT 2.1:
++ * Reduced Blanking standard defines a fixed value of
++ * 160 for hblank, further reduced to 80 in RB2. RB3 uses
++ * fixed hblank of 80 pixels + up to 120 additional pixels
++ * in 8-pixel steps.
++ */
++ hblank = stream->timing.h_total - stream->timing.h_addressable;
++ rb = (hblank >= 80 && hblank <= 200 && hblank % 8 == 0);
++ brr = div_u64(mod_freesync_calc_nominal_field_rate(stream), 1000000);
++
++ if (brr > VTEM_BRR_MAX) {
++ infopacket->valid = false;
++ return;
++ }
++
++ infopacket->sb[VTEM_MD1] = (uint8_t) stream->timing.v_front_porch;
++ infopacket->sb[VTEM_MD2] = rb << VTEM_RB_BIT;
++ infopacket->sb[VTEM_MD2] |= (brr >> 8) & VTEM_BRR_MASK_UPPER;
++ infopacket->sb[VTEM_MD3] = brr & VTEM_BRR_MASK_LOWER;
++}
++
++void mod_build_vtem_infopacket(const struct dc_stream_state *stream,
++ const struct mod_vrr_params *vrr,
++ struct dc_info_packet *infopacket)
++{
++ infopacket->valid = true;
++ build_vtem_infopacket_header(infopacket);
++ build_vtem_infopacket_data(stream, vrr, infopacket);
++}
++
+ void mod_build_adaptive_sync_infopacket(const struct dc_stream_state *stream,
+ enum adaptive_sync_type asType,
+ const struct AS_Df_params *param,
+@@ -535,12 +645,13 @@ void mod_build_adaptive_sync_infopacket(const struct dc_stream_state *stream,
+ if (stream != NULL)
+ mod_build_adaptive_sync_infopacket_v2(stream, param, info_packet);
+ break;
+- case FREESYNC_TYPE_PCON_IN_WHITELIST:
++ case ADAPTIVE_SYNC_TYPE_PCON_ALLOWED:
+ case ADAPTIVE_SYNC_TYPE_EDP:
+ mod_build_adaptive_sync_infopacket_v1(info_packet);
+ break;
+ case ADAPTIVE_SYNC_TYPE_NONE:
+- case FREESYNC_TYPE_PCON_NOT_IN_WHITELIST:
++ case ADAPTIVE_SYNC_TYPE_PCON_NOT_ALLOWED:
++ case ADAPTIVE_SYNC_TYPE_HDMI:
+ default:
+ break;
+ }
+diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
+index ac2d3701e2bd..894e1e738ce0 100644
+--- a/drivers/gpu/drm/amd/include/amd_shared.h
++++ b/drivers/gpu/drm/amd/include/amd_shared.h
+@@ -412,6 +412,12 @@ enum DC_DEBUG_MASK {
+ * @DC_SKIP_DETECTION_LT: (0x200000) If set, skip detection link training
+ */
+ DC_SKIP_DETECTION_LT = 0x200000,
++
++ /**
++ * @DC_OVERRIDE_PCON_VRR_ID_CHECK: (0x400000) If set, always return true when checking for
++ * PCON VRR compatibility and print its ID in the kernel log.
++ */ ++ DC_OVERRIDE_PCON_VRR_ID_CHECK = 0x400000, + }; + + enum amd_dpm_forced_level; +diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c +index 7320db4b8489..94e1b7eb65f7 100644 +--- a/drivers/gpu/drm/drm_atomic_uapi.c ++++ b/drivers/gpu/drm/drm_atomic_uapi.c +@@ -412,6 +412,8 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + return ret; + } else if (property == config->prop_vrr_enabled) { + state->vrr_enabled = val; ++ } else if (property == config->prop_passive_vrr_disabled) { ++ state->passive_vrr_disabled = val; + } else if (property == config->degamma_lut_property) { + ret = drm_property_replace_blob_from_id(dev, + &state->degamma_lut, +@@ -477,6 +479,8 @@ drm_atomic_crtc_get_property(struct drm_crtc *crtc, + *val = (state->mode_blob) ? state->mode_blob->base.id : 0; + else if (property == config->prop_vrr_enabled) + *val = state->vrr_enabled; ++ else if (property == config->prop_passive_vrr_disabled) ++ *val = state->passive_vrr_disabled; + else if (property == config->degamma_lut_property) + *val = (state->degamma_lut) ? state->degamma_lut->base.id : 0; + else if (property == config->ctm_property) +@@ -885,6 +889,8 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector, + state->content_type = val; + } else if (property == connector->scaling_mode_property) { + state->scaling_mode = val; ++ } else if (property == connector->allm_mode_property) { ++ state->allm_mode = val; + } else if (property == config->content_protection_property) { + if (val == DRM_MODE_CONTENT_PROTECTION_ENABLED) { + drm_dbg_kms(dev, "only drivers can set CP Enabled\n"); +@@ -982,6 +988,8 @@ drm_atomic_connector_get_property(struct drm_connector *connector, + *val = state->colorspace; + } else if (property == connector->scaling_mode_property) { + *val = state->scaling_mode; ++ } else if (property == connector->allm_mode_property) { ++ *val = state->allm_mode; + } else if (property == config->hdr_output_metadata_property) { + *val = state->hdr_output_metadata ? + state->hdr_output_metadata->base.id : 0; +diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c +index 4d6dc9ebfdb5..bdf49bb3c38e 100644 +--- a/drivers/gpu/drm/drm_connector.c ++++ b/drivers/gpu/drm/drm_connector.c +@@ -1226,6 +1226,12 @@ static const struct drm_prop_enum_list drm_content_type_enum_list[] = { + { DRM_MODE_CONTENT_TYPE_GAME, "Game" }, + }; + ++static const struct drm_prop_enum_list drm_allm_mode_enum_list[] = { ++ { DRM_ALLM_MODE_DISABLED, "Disabled" }, ++ { DRM_ALLM_MODE_ENABLED_DYNAMIC, "Dynamic" }, ++ { DRM_ALLM_MODE_ENABLED_FORCED, "Always On" }, ++}; ++ + static const struct drm_prop_enum_list drm_panel_orientation_enum_list[] = { + { DRM_MODE_PANEL_ORIENTATION_NORMAL, "Normal" }, + { DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP, "Upside Down" }, +@@ -2346,6 +2352,16 @@ EXPORT_SYMBOL(drm_mode_create_scaling_mode_property); + * + * Absence of the property should indicate absence of support. + * ++ * "passive_vrr_capable": ++ * Optional &drm_connector boolean property that drivers should attach ++ * with drm_connector_attach_passive_vrr_capable_property() on ++ * connectors that could support keeping variable refresh rate signalling ++ * in fixed-refresh rate scenarios like desktop work. Drivers should update ++ * the property value by calling ++ * drm_connector_set_passive_vrr_capable_property(). ++ * ++ * Absence of the property should indicate absence of support. 
++ *
+ * "VRR_ENABLED":
+ * Default &drm_crtc boolean property that notifies the driver that the
+ * content on the CRTC is suitable for variable refresh rate presentation.
+@@ -2364,6 +2380,17 @@ EXPORT_SYMBOL(drm_mode_create_scaling_mode_property);
+ *
+ * The driver may place further restrictions within these minimum
+ * and maximum bounds.
++ *
++ * "PASSIVE_VRR_DISABLED":
++ * Default &drm_crtc boolean property that notifies the driver that the
++ * VRR signalling should be disabled in fixed refresh rate scenarios.
++ * Functionally, passive VRR works the same as VRR_ENABLED == false
++ * but works around displays blanking (mainly HDMI) that do not support
++ * seamless VRR transitions. Also helps with brightness flickering during
++ * VRR transitions.
++ *
++ * Passive VRR mode is not that useful for DP/eDP sinks where seamless VRR
++ * transitions are enforced by the standard.
+ */
+
+ /**
+@@ -2397,6 +2424,125 @@ int drm_connector_attach_vrr_capable_property(
+ }
+ EXPORT_SYMBOL(drm_connector_attach_vrr_capable_property);
+
++/**
++ * drm_connector_attach_passive_vrr_capable_property - creates the
++ * passive_vrr_capable property
++ * @connector: connector to create the passive_vrr_capable property on.
++ *
++ * This is used by atomic drivers to add support for querying
++ * variable refresh rate on desktop capability for a connector.
++ *
++ * Returns:
++ * Zero on success, negative errno on failure.
++ */
++int drm_connector_attach_passive_vrr_capable_property(
++ struct drm_connector *connector)
++{
++ struct drm_device *dev = connector->dev;
++ struct drm_property *prop;
++
++ if (!connector->passive_vrr_capable_property) {
++ prop = drm_property_create_bool(dev, DRM_MODE_PROP_IMMUTABLE,
++ "passive_vrr_capable");
++ if (!prop)
++ return -ENOMEM;
++
++ connector->passive_vrr_capable_property = prop;
++ drm_object_attach_property(&connector->base, prop, 0);
++ }
++
++ return 0;
++}
++EXPORT_SYMBOL(drm_connector_attach_passive_vrr_capable_property);
++
++/**
++ * DOC: Auto Low Latency Mode properties
++ *
++ * Auto Low Latency capable HDMI displays (be it PC monitors or TVs)
++ * can automatically enter a "low latency" mode, usually named "Game Mode", by
++ * receiving specific data in the HDMI Forum vendor-specific info frame.
++ *
++ * This is usually the best mode for PC usage but disables as much processing as
++ * possible, which might not be desirable on lower end devices, causing them to
++ * produce an image that's unsatisfactory to some users.
++ *
++ * "allm_capable":
++ * Optional &drm_connector boolean property that drivers should attach
++ * with drm_connector_attach_allm_capable_property() on connectors that
++ * could support Auto Low Latency Mode. Drivers should update the
++ * property value by calling drm_connector_set_allm_capable_property().
++ *
++ * Absence of the property should indicate absence of support.
++ *
++ * "ALLM_MODE":
++ * Optional &drm_connector enum property that enables compositors to control and
++ * expose ALLM triggering behavior modes to the end user where:
++ *
++ * - ALLM_MODE_DISABLED: completely disabled ALLM signalling.
++ * - ALLM_MODE_ENABLED_DYNAMIC: triggers ALLM based on current needs,
++ * preferably display content type hint being set to Game by the compositor
++ * or VRR being enabled and active.
++ * - ALLM_MODE_ENABLED_FORCED: always-on ALLM triggering.
++ *
++ * ALLM_MODE_ENABLED_DYNAMIC should behave like gaming devices such as
++ * consoles where ALLM is only triggered when needed. Its main purpose is
++ * gaming (part of so-called HDMI gaming features).
++ *
++ * If compositors wish to control ALLM completely on their own, they can
++ * switch between disabled and enabled_forced modes.
++ */
++
++/**
++ * drm_connector_attach_allm_capable_property - creates the
++ * allm_capable property
++ * @connector: connector to create the allm_capable property on.
++ *
++ * This is used by atomic drivers to add support for querying
++ * Auto Low Latency Mode capability for a connector.
++ *
++ * Returns:
++ * Zero on success, negative errno on failure.
++ */
++int drm_connector_attach_allm_capable_property(struct drm_connector *connector)
++{
++ struct drm_device *dev = connector->dev;
++ struct drm_property *prop;
++
++ if (!connector->allm_capable_property) {
++ prop = drm_property_create_bool(dev, DRM_MODE_PROP_IMMUTABLE,
++ "allm_capable");
++ if (!prop)
++ return -ENOMEM;
++
++ connector->allm_capable_property = prop;
++ drm_object_attach_property(&connector->base, prop, 0);
++ }
++
++ return 0;
++}
++EXPORT_SYMBOL(drm_connector_attach_allm_capable_property);
++
++int drm_connector_attach_allm_mode_property(struct drm_connector *connector)
++{
++ struct drm_property *prop;
++
++ if (connector->allm_mode_property)
++ return 0;
++
++ prop = drm_property_create_enum(connector->dev, 0, "allm_mode",
++ drm_allm_mode_enum_list,
++ ARRAY_SIZE(drm_allm_mode_enum_list));
++ if (!prop)
++ return -ENOMEM;
++
++ connector->allm_mode_property = prop;
++ drm_object_attach_property(&connector->base, prop,
++ DRM_ALLM_MODE_DISABLED);
++
++ return 0;
++}
++EXPORT_SYMBOL(drm_connector_attach_allm_mode_property);
++
+ /**
+ * drm_connector_attach_scaling_mode_property - attach atomic scaling mode property
+ * @connector: connector to attach scaling mode property on.
+@@ -2968,6 +3114,48 @@ void drm_connector_set_vrr_capable_property(
+ }
+ EXPORT_SYMBOL(drm_connector_set_vrr_capable_property);
+
++/**
++ * drm_connector_set_passive_vrr_capable_property - sets the variable refresh
++ * rate on desktop capable property for a connector
++ * @connector: drm connector
++ * @capable: True if the connector is variable refresh rate on desktop capable
++ *
++ * Should be used by atomic drivers to update the indicated support for
++ * variable refresh rate on desktop over a connector.
++ */
++void drm_connector_set_passive_vrr_capable_property(
++ struct drm_connector *connector, bool capable)
++{
++ if (!connector->passive_vrr_capable_property)
++ return;
++
++ drm_object_property_set_value(&connector->base,
++ connector->passive_vrr_capable_property,
++ capable);
++}
++EXPORT_SYMBOL(drm_connector_set_passive_vrr_capable_property);
++
++/**
++ * drm_connector_set_allm_capable_property - sets Auto Low Latency Mode
++ * capable property for a connector
++ * @connector: drm connector
++ * @capable: True if the connector is ALLM capable
++ *
++ * Should be used by atomic drivers to update the indicated support for
++ * Auto Low Latency Mode over a connector.
++ */
++void drm_connector_set_allm_capable_property(
++ struct drm_connector *connector, bool capable)
++{
++ if (!connector->allm_capable_property)
++ return;
++
++ drm_object_property_set_value(&connector->base,
++ connector->allm_capable_property,
++ capable);
++}
++EXPORT_SYMBOL(drm_connector_set_allm_capable_property);
++
+ /**
+ * drm_connector_set_panel_orientation - sets the connector's panel_orientation
+ * @connector: connector for which to set the panel-orientation property.
+diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c +index a7797d260f1e..4f2c871552e5 100644 +--- a/drivers/gpu/drm/drm_crtc.c ++++ b/drivers/gpu/drm/drm_crtc.c +@@ -322,6 +322,8 @@ static int __drm_crtc_init_with_planes(struct drm_device *dev, struct drm_crtc * + config->prop_out_fence_ptr, 0); + drm_object_attach_property(&crtc->base, + config->prop_vrr_enabled, 0); ++ drm_object_attach_property(&crtc->base, ++ config->prop_passive_vrr_disabled, 0); + } + + return 0; +diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c +index 26bb7710a462..056eff8cbd1a 100644 +--- a/drivers/gpu/drm/drm_edid.c ++++ b/drivers/gpu/drm/drm_edid.c +@@ -6152,6 +6152,33 @@ static void drm_parse_ycbcr420_deep_color_info(struct drm_connector *connector, + hdmi->y420_dc_modes = dc_mask; + } + ++static void drm_parse_hdmi_gaming_info(struct drm_hdmi_info *hdmi, const u8 *db) ++{ ++ struct drm_hdmi_vrr_cap *vrr = &hdmi->vrr_cap; ++ ++ if (cea_db_payload_len(db) < 8) ++ return; ++ ++ hdmi->fapa_start_location = db[8] & DRM_EDID_FAPA_START_LOCATION; ++ hdmi->allm = db[8] & DRM_EDID_ALLM; ++ vrr->fva = db[8] & DRM_EDID_FVA; ++ vrr->cnmvrr = db[8] & DRM_EDID_CNMVRR; ++ vrr->cinema_vrr = db[8] & DRM_EDID_CINEMA_VRR; ++ vrr->mdelta = db[8] & DRM_EDID_MDELTA; ++ ++ if (cea_db_payload_len(db) < 9) ++ return; ++ ++ vrr->vrr_min = db[9] & DRM_EDID_VRR_MIN_MASK; ++ vrr->supported = (vrr->vrr_min > 0 && vrr->vrr_min <= 48); ++ ++ if (cea_db_payload_len(db) < 10) ++ return; ++ ++ vrr->vrr_max = (db[9] & DRM_EDID_VRR_MAX_UPPER_MASK) << 2 | db[10]; ++ vrr->supported &= (vrr->vrr_max == 0 || vrr->vrr_max >= 100); ++} ++ + static void drm_parse_dsc_info(struct drm_hdmi_dsc_cap *hdmi_dsc, + const u8 *hf_scds) + { +@@ -6277,7 +6304,7 @@ static void drm_parse_hdmi_forum_scds(struct drm_connector *connector, + } + + drm_parse_ycbcr420_deep_color_info(connector, hf_scds); +- ++ drm_parse_hdmi_gaming_info(&connector->display_info.hdmi, hf_scds); + if (cea_db_payload_len(hf_scds) >= 11 && hf_scds[11]) { + drm_parse_dsc_info(hdmi_dsc, hf_scds); + dsc_support = true; +@@ -6287,6 +6314,18 @@ static void drm_parse_hdmi_forum_scds(struct drm_connector *connector, + "[CONNECTOR:%d:%s] HF-VSDB: max TMDS clock: %d KHz, HDMI 2.1 support: %s, DSC 1.2 support: %s\n", + connector->base.id, connector->name, + max_tmds_clock, str_yes_no(max_frl_rate), str_yes_no(dsc_support)); ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] FAPA in blanking: %s, ALLM support: %s, Fast Vactive support: %s\n", ++ connector->base.id, connector->name, str_yes_no(hdmi->fapa_start_location), ++ str_yes_no(hdmi->allm), str_yes_no(hdmi->vrr_cap.fva)); ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] Negative M VRR support: %s, CinemaVRR support: %s, Mdelta: %d\n", ++ connector->base.id, connector->name, str_yes_no(hdmi->vrr_cap.cnmvrr), ++ str_yes_no(hdmi->vrr_cap.cinema_vrr), hdmi->vrr_cap.mdelta); ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] VRRmin: %u, VRRmax: %u, VRR supported: %s\n", ++ connector->base.id, connector->name, hdmi->vrr_cap.vrr_min, ++ hdmi->vrr_cap.vrr_max, str_yes_no(hdmi->vrr_cap.supported)); + } + + static void drm_parse_hdmi_deep_color_info(struct drm_connector *connector, +diff --git a/drivers/gpu/drm/drm_mode_config.c b/drivers/gpu/drm/drm_mode_config.c +index d12db9b0bab8..231f54ba66f8 100644 +--- a/drivers/gpu/drm/drm_mode_config.c ++++ b/drivers/gpu/drm/drm_mode_config.c +@@ -345,6 +345,12 @@ static int drm_mode_create_standard_properties(struct drm_device *dev) + return -ENOMEM; + 
dev->mode_config.prop_vrr_enabled = prop;
+
++ prop = drm_property_create_bool(dev, 0,
++ "PASSIVE_VRR_DISABLED");
++ if (!prop)
++ return -ENOMEM;
++ dev->mode_config.prop_passive_vrr_disabled = prop;
++
+ prop = drm_property_create(dev,
+ DRM_MODE_PROP_BLOB,
+ "DEGAMMA_LUT", 0);
+diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
+index 8f34f4b8183d..fa4abfe8971e 100644
+--- a/include/drm/drm_connector.h
++++ b/include/drm/drm_connector.h
+@@ -58,6 +58,12 @@ enum drm_connector_force {
+ DRM_FORCE_ON_DIGITAL, /* for DVI-I use digital connector */
+ };
+
++enum drm_allm_mode {
++ DRM_ALLM_MODE_DISABLED,
++ DRM_ALLM_MODE_ENABLED_DYNAMIC,
++ DRM_ALLM_MODE_ENABLED_FORCED,
++};
++
+ /**
+ * enum drm_connector_status - status for a &drm_connector
+ *
+@@ -254,6 +260,44 @@ struct drm_scdc {
+ struct drm_scrambling scrambling;
+ };
+
++/**
++ * struct drm_hdmi_vrr_cap - Information about VRR capabilities of an HDMI sink
++ *
++ * Describes the VRR support provided by an HDMI 2.1 sink. The information is
++ * fetched from additional HFVSDB blocks defined for HDMI 2.1.
++ */
++struct drm_hdmi_vrr_cap {
++ /** @fva: flag for Fast VActive (Quick Frame Transport) support */
++ bool fva;
++
++ /** @cnmvrr: flag for Negative M VRR support */
++ bool cnmvrr;
++
++ /** @cinema_vrr: flag for Cinema VRR support */
++ bool cinema_vrr;
++
++ /** @mdelta: flag for limited frame-to-frame compensation support */
++ bool mdelta;
++
++ /**
++ * @vrr_min: minimum supported variable refresh rate in Hz.
++ * Valid values are only inside the 1 - 48 range
++ */
++ u16 vrr_min;
++
++ /**
++ * @vrr_max: maximum supported variable refresh rate in Hz (optional).
++ * Valid values are either 0 (max based on video mode) or >= 100
++ */
++ u16 vrr_max;
++
++ /**
++ * @supported: flag for VRR support, based on VRRmin and VRRmax
++ * having valid values.
++ */
++ bool supported;
++};
++
+ /**
+ * struct drm_hdmi_dsc_cap - DSC capabilities of HDMI sink
+ *
+@@ -330,6 +374,15 @@ struct drm_hdmi_info {
+ /** @max_lanes: supported by sink */
+ u8 max_lanes;
+
++ /** @fapa_start_location: flag for the FAPA in blanking support */
++ bool fapa_start_location;
++
++ /** @allm: flag for Auto Low Latency Mode support by sink */
++ bool allm;
++
++ /** @vrr_cap: VRR capabilities of the sink */
++ struct drm_hdmi_vrr_cap vrr_cap;
++
+ /** @dsc_cap: DSC capabilities of the sink */
+ struct drm_hdmi_dsc_cap dsc_cap;
+ };
+@@ -1100,6 +1153,13 @@ struct drm_connector_state {
+ */
+ unsigned int content_protection;
+
++ /**
++ * @allm_mode: Connector property to control the
++ * HDMI Auto Low Latency Mode trigger setting.
++ * Valid values are the %DRM_ALLM_MODE_\* enum values.
++ */
++ enum drm_allm_mode allm_mode;
++
+ /**
+ * @colorspace: State variable for Connector property to request
+ * colorspace change on Sink. This is most commonly used to switch
+@@ -2054,6 +2114,37 @@ struct drm_connector {
+ */
+ struct drm_property *vrr_capable_property;
+
++ /**
++ * @passive_vrr_capable_property: Optional property to help userspace
++ * query hardware support for passive variable refresh rate on a
++ * connector. Drivers can add the property to a connector by
++ * calling drm_connector_attach_passive_vrr_capable_property().
++ *
++ * This should be updated only by calling
++ * drm_connector_set_passive_vrr_capable_property().
++ */
++ struct drm_property *passive_vrr_capable_property;
++
++ /**
++ * @allm_capable_property: Optional property to help userspace
++ * query hardware support for HDMI Auto Low Latency Mode on a connector.
++ * Drivers can add the property to a connector by calling
++ * drm_connector_attach_allm_capable_property().
++ *
++ * This should be updated only by calling
++ * drm_connector_set_allm_capable_property().
++ */
++ struct drm_property *allm_capable_property;
++
++ /**
++ * @allm_mode_property:
++ *
++ * Indicates HDMI Auto Low Latency Mode triggering mode for the connector.
++ * Support for the requested state will depend on driver and hardware
++ * capability - lacking support is not treated as failure.
++ */
++ struct drm_property *allm_mode_property;
++
+ /**
+ * @colorspace_property: Connector property to set the suitable
+ * colorspace supported by the sink.
+@@ -2448,6 +2539,10 @@ int drm_connector_attach_scaling_mode_property(struct drm_connector *connector,
+ u32 scaling_mode_mask);
+ int drm_connector_attach_vrr_capable_property(
+ struct drm_connector *connector);
++int drm_connector_attach_passive_vrr_capable_property(
++ struct drm_connector *connector);
++int drm_connector_attach_allm_capable_property(struct drm_connector *connector);
++int drm_connector_attach_allm_mode_property(struct drm_connector *connector);
+ int drm_connector_attach_broadcast_rgb_property(struct drm_connector *connector);
+ int drm_connector_attach_colorspace_property(struct drm_connector *connector);
+ int drm_connector_attach_hdr_output_metadata_property(struct drm_connector *connector);
+@@ -2470,6 +2565,10 @@ void drm_connector_set_link_status_property(struct drm_connector *connector,
+ uint64_t link_status);
+ void drm_connector_set_vrr_capable_property(
+ struct drm_connector *connector, bool capable);
++void drm_connector_set_passive_vrr_capable_property(
++ struct drm_connector *connector, bool capable);
++void drm_connector_set_allm_capable_property(
++ struct drm_connector *connector, bool capable);
+ int drm_connector_set_panel_orientation(
+ struct drm_connector *connector,
+ enum drm_panel_orientation panel_orientation);
+diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
+index 66278ffeebd6..59dbb7ce1358 100644
+--- a/include/drm/drm_crtc.h
++++ b/include/drm/drm_crtc.h
+@@ -299,6 +299,15 @@ struct drm_crtc_state {
+ */
+ bool vrr_enabled;
+
++ /**
++ * @passive_vrr_disabled:
++ *
++ * Indicates if variable refresh rate on desktop should be enabled for
++ * the CRTC. Support for the requested state will depend on driver and
++ * hardware capability - lacking support is not treated as failure.
++ */
++ bool passive_vrr_disabled;
++
+ /**
+ * @self_refresh_active:
+ *
+diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
+index 895fb820dba0..23ce744b233b 100644
+--- a/include/drm/drm_mode_config.h
++++ b/include/drm/drm_mode_config.h
+@@ -697,6 +697,12 @@ struct drm_mode_config {
+ * whether variable refresh rate should be enabled on the CRTC.
+ */
+ struct drm_property *prop_vrr_enabled;
++ /**
++ * @prop_passive_vrr_disabled: Default atomic CRTC property to indicate
++ * whether passive variable refresh rate should be disabled
++ * on the CRTC.
++ */ ++ struct drm_property *prop_passive_vrr_disabled; + + /** + * @dvi_i_subconnector_property: Optional DVI-I property to +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.19/0006-r8125.patch b/sys-kernel/gentoo-sources-6.19/0006-r8125.patch new file mode 100644 index 0000000..ae00052 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0006-r8125.patch @@ -0,0 +1,29360 @@ +From 739c942b8335f00091b9c255370d5b27448af308 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 13 Feb 2026 16:53:25 +0100 +Subject: [PATCH 6/9] r8125 + +Signed-off-by: Peter Jung +--- + drivers/net/ethernet/realtek/Kconfig | 15 + + drivers/net/ethernet/realtek/Makefile | 2 + + drivers/net/ethernet/realtek/r8125.h | 3059 +++ + drivers/net/ethernet/realtek/r8125_dash.c | 573 + + drivers/net/ethernet/realtek/r8125_dash.h | 196 + + drivers/net/ethernet/realtek/r8125_fiber.c | 464 + + drivers/net/ethernet/realtek/r8125_fiber.h | 63 + + drivers/net/ethernet/realtek/r8125_firmware.c | 264 + + drivers/net/ethernet/realtek/r8125_firmware.h | 68 + + drivers/net/ethernet/realtek/r8125_n.c | 21312 ++++++++++++++++ + drivers/net/ethernet/realtek/r8125_ptp.c | 1472 ++ + drivers/net/ethernet/realtek/r8125_ptp.h | 159 + + drivers/net/ethernet/realtek/r8125_realwow.h | 118 + + drivers/net/ethernet/realtek/r8125_rss.c | 583 + + drivers/net/ethernet/realtek/r8125_rss.h | 76 + + drivers/net/ethernet/realtek/r8169_main.c | 6 +- + drivers/net/ethernet/realtek/rtl_eeprom.c | 284 + + drivers/net/ethernet/realtek/rtl_eeprom.h | 53 + + drivers/net/ethernet/realtek/rtltool.c | 312 + + drivers/net/ethernet/realtek/rtltool.h | 89 + + 20 files changed, 29166 insertions(+), 2 deletions(-) + create mode 100755 drivers/net/ethernet/realtek/r8125.h + create mode 100755 drivers/net/ethernet/realtek/r8125_dash.c + create mode 100755 drivers/net/ethernet/realtek/r8125_dash.h + create mode 100755 drivers/net/ethernet/realtek/r8125_fiber.c + create mode 100755 drivers/net/ethernet/realtek/r8125_fiber.h + create mode 100755 drivers/net/ethernet/realtek/r8125_firmware.c + create mode 100755 drivers/net/ethernet/realtek/r8125_firmware.h + create mode 100755 drivers/net/ethernet/realtek/r8125_n.c + create mode 100755 drivers/net/ethernet/realtek/r8125_ptp.c + create mode 100755 drivers/net/ethernet/realtek/r8125_ptp.h + create mode 100755 drivers/net/ethernet/realtek/r8125_realwow.h + create mode 100755 drivers/net/ethernet/realtek/r8125_rss.c + create mode 100755 drivers/net/ethernet/realtek/r8125_rss.h + create mode 100755 drivers/net/ethernet/realtek/rtl_eeprom.c + create mode 100755 drivers/net/ethernet/realtek/rtl_eeprom.h + create mode 100755 drivers/net/ethernet/realtek/rtltool.c + create mode 100755 drivers/net/ethernet/realtek/rtltool.h + +diff --git a/drivers/net/ethernet/realtek/Kconfig b/drivers/net/ethernet/realtek/Kconfig +index 272c83bfdc6c..dc7cf96add0c 100644 +--- a/drivers/net/ethernet/realtek/Kconfig ++++ b/drivers/net/ethernet/realtek/Kconfig +@@ -95,6 +95,21 @@ config 8139_OLD_RX_RESET + experience problems, you can enable this option to restore the + old RX-reset behavior. If unsure, say N. + ++config R8125 ++ tristate "Realtek 8125/8162 ethernet support" ++ depends on PCI ++ select FW_LOADER ++ select CRC32 ++ select PHYLIB ++ select REALTEK_PHY ++ help ++ Say Y here if you have a Realtek Ethernet adapter belonging to ++ the following families: ++ RTL8125 2.5GBit Ethernet ++ ++ To compile this driver as a module, choose M here: the module ++ will be called r8125. This is recommended. 
++ + config R8169 + tristate "Realtek 8169/8168/8101/8125 ethernet support" + depends on PCI +diff --git a/drivers/net/ethernet/realtek/Makefile b/drivers/net/ethernet/realtek/Makefile +index 046adf503ff4..dee73dfd003f 100644 +--- a/drivers/net/ethernet/realtek/Makefile ++++ b/drivers/net/ethernet/realtek/Makefile +@@ -9,4 +9,6 @@ obj-$(CONFIG_ATP) += atp.o + r8169-y += r8169_main.o r8169_firmware.o r8169_phy_config.o + r8169-$(CONFIG_R8169_LEDS) += r8169_leds.o + obj-$(CONFIG_R8169) += r8169.o ++r8125-y += r8125_n.o rtl_eeprom.o rtltool.o ++obj-$(CONFIG_R8125) += r8125.o + obj-$(CONFIG_RTASE) += rtase/ +diff --git a/drivers/net/ethernet/realtek/r8125.h b/drivers/net/ethernet/realtek/r8125.h +new file mode 100755 +index 000000000000..57b2b94872fd +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125.h +@@ -0,0 +1,3059 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#ifndef __R8125_H ++#define __R8125_H ++ ++#define CONFIG_SOC_LAN ++#define CONFIG_ASPM ++#define ENABLE_S5WOL ++#define ENABLE_EEE ++#define ENABLE_TX_NO_CLOSE ++#define ENABLE_GIGA_LITE ++ ++//#include ++#include ++#include ++#include ++#include "r8125_dash.h" ++#include "r8125_realwow.h" ++#ifdef ENABLE_FIBER_SUPPORT ++#include "r8125_fiber.h" ++#endif /* ENABLE_FIBER_SUPPORT */ ++#ifdef ENABLE_PTP_SUPPORT ++#include "r8125_ptp.h" ++#endif ++#include "r8125_rss.h" ++#ifdef ENABLE_LIB_SUPPORT ++#include "r8125_lib.h" ++#endif ++ ++#ifndef fallthrough ++#define fallthrough ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++#define netif_xmit_stopped netif_tx_queue_stopped ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0) ++#ifndef MDIO_AN_EEE_ADV_100TX ++#define MDIO_AN_EEE_ADV_100TX 0x0002 /* Advertise 100TX EEE cap */ ++#endif ++#ifndef MDIO_AN_EEE_ADV_1000T ++#define MDIO_AN_EEE_ADV_1000T 0x0004 /* Advertise 1000T EEE cap */ ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ++#define MDIO_EEE_100TX MDIO_AN_EEE_ADV_100TX /* 100TX EEE cap */ ++#define MDIO_EEE_1000T MDIO_AN_EEE_ADV_1000T /* 1000T EEE cap */ ++#define MDIO_EEE_10GT 0x0008 /* 10GT EEE cap */ ++#define MDIO_EEE_1000KX 0x0010 /* 1000KX EEE cap */ ++#define MDIO_EEE_10GKX4 0x0020 /* 10G KX4 EEE cap */ ++#define MDIO_EEE_10GKR 0x0040 /* 10G KR EEE cap */ ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) */ ++ ++static inline u32 mmd_eee_adv_to_ethtool_adv_t(u16 eee_adv) ++{ ++ u32 adv = 0; ++ ++ if (eee_adv & MDIO_EEE_100TX) ++ adv |= ADVERTISED_100baseT_Full; ++ if (eee_adv & MDIO_EEE_1000T) ++ adv |= ADVERTISED_1000baseT_Full; ++ if (eee_adv & MDIO_EEE_10GT) ++ adv |= ADVERTISED_10000baseT_Full; ++ if (eee_adv & MDIO_EEE_1000KX) ++ adv |= ADVERTISED_1000baseKX_Full; ++ if (eee_adv & MDIO_EEE_10GKX4) ++ adv |= ADVERTISED_10000baseKX4_Full; ++ if (eee_adv & MDIO_EEE_10GKR) ++ adv |= ADVERTISED_10000baseKR_Full; ++ ++ return adv; ++} ++ ++static inline u16 ethtool_adv_to_mmd_eee_adv_t(u32 adv) ++{ ++ u16 reg = 0; ++ ++ if (adv & ADVERTISED_100baseT_Full) ++ reg |= MDIO_EEE_100TX; ++ if (adv & ADVERTISED_1000baseT_Full) ++ reg |= MDIO_EEE_1000T; ++ if (adv & ADVERTISED_10000baseT_Full) ++ reg |= MDIO_EEE_10GT; ++ if (adv & ADVERTISED_1000baseKX_Full) ++ reg |= MDIO_EEE_1000KX; ++ if (adv & ADVERTISED_10000baseKX4_Full) ++ reg |= MDIO_EEE_10GKX4; ++ if (adv & ADVERTISED_10000baseKR_Full) ++ reg |= MDIO_EEE_10GKR; ++ ++ return reg; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) ++static inline bool skb_transport_header_was_set(const struct sk_buff *skb) ++{ ++ return skb->transport_header != ~0U; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0) ++static inline ++ssize_t strscpy(char *dest, const char *src, size_t count) ++{ ++ long res = 0; ++ ++ if (count == 0) ++ return -E2BIG; ++ ++ while (count) { ++ char c; ++ ++ c = src[res]; ++ dest[res] = c; ++ if (!c) ++ return res; ++ res++; ++ count--; ++ } ++ ++ /* Hit buffer length without finding a NUL; force NUL-termination. 
*/ ++ if (res) ++ dest[res-1] = '\0'; ++ ++ return -E2BIG; ++} ++#endif ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)) ++static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) ++{ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)) ++ return skb->head + skb->csum_start; ++#else /* < 2.6.22 */ ++ return skb_transport_header(skb); ++#endif ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, ++ unsigned int bytes) ++{} ++static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, ++ unsigned int pkts, ++ unsigned int bytes) ++{} ++static inline void netdev_tx_reset_queue(struct netdev_queue *q) {} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) ++static inline void fsleep(unsigned long usecs) ++{ ++ if (usecs <= 10) ++ udelay(usecs); ++ else if (usecs <= 20000) ++ usleep_range(usecs, 2 * usecs); ++ else ++ msleep(DIV_ROUND_UP(usecs, 1000)); ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,2,0) ++#define netdev_xmit_more() (0) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) ++#define netif_testing_on(dev) ++#define netif_testing_off(dev) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0) ++#define netdev_sw_irq_coalesce_default_on(dev) ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) ++typedef int netdev_tx_t; ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,12,0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,9) ++static inline bool page_is_pfmemalloc(struct page *page) ++{ ++ /* ++ * Page index cannot be this large so this must be ++ * a pfmemalloc page. ++ */ ++ return page->index == -1UL; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4,1,9) */ ++static inline bool dev_page_is_reusable(struct page *page) ++{ ++ return likely(page_to_nid(page) == numa_mem_id() && ++ !page_is_pfmemalloc(page)); ++} ++#endif ++ ++/* ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0)&& !defined(ENABLE_LIB_SUPPORT) ++#define RTL_USE_NEW_INTR_API ++#endif ++*/ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0) ++#define dma_map_page_attrs(dev, page, offset, size, dir, attrs) \ ++ dma_map_page(dev, page, offset, size, dir) ++#define dma_unmap_page_attrs(dev, page, size, dir, attrs) \ ++ dma_unmap_page(dev, page, size, dir) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++#define page_ref_inc(page) atomic_inc(&page->_count) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,4,216) ++#define page_ref_count(page) atomic_read(&page->_count) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,4,216) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++#define skb_transport_offset(skb) (skb->h.raw - skb->data) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) ++#define device_set_wakeup_enable(dev, val) do {} while (0) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) ++static inline void ether_addr_copy(u8 *dst, const u8 *src) ++{ ++ u16 *a = (u16 *)dst; ++ const u16 *b = (const u16 *)src; ++ ++ a[0] = b[0]; ++ a[1] = b[1]; ++ a[2] = b[2]; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0) ++#define IS_ERR_OR_NULL(ptr) (!ptr) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) ++#define reinit_completion(x) ((x)->done = 0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) ++#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) ++#define pm_runtime_mark_last_busy(x) ++#define pm_runtime_put_autosuspend(x) pm_runtime_put(x) ++#define pm_runtime_put_sync_autosuspend(x) pm_runtime_put_sync(x) ++ ++static inline bool pm_runtime_suspended(struct device *dev) ++{ ++ return dev->power.runtime_status == RPM_SUSPENDED ++ && !dev->power.disable_depth; ++} ++ ++static inline bool pm_runtime_active(struct device *dev) ++{ ++ return dev->power.runtime_status == RPM_ACTIVE ++ || dev->power.disable_depth; ++} ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) ++#define queue_delayed_work(long_wq, work, delay) schedule_delayed_work(work, delay) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34) ++#define netif_printk(priv, type, level, netdev, fmt, args...) \ ++ do { \ ++ if (netif_msg_##type(priv)) \ ++ printk(level "%s: " fmt,(netdev)->name , ##args); \ ++ } while (0) ++ ++#define netif_emerg(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_EMERG, netdev, fmt, ##args) ++#define netif_alert(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_ALERT, netdev, fmt, ##args) ++#define netif_crit(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_CRIT, netdev, fmt, ##args) ++#define netif_err(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_ERR, netdev, fmt, ##args) ++#define netif_warn(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_WARNING, netdev, fmt, ##args) ++#define netif_notice(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_NOTICE, netdev, fmt, ##args) ++#define netif_info(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_INFO, (netdev), fmt, ##args) ++#endif ++#endif ++#endif ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) ++#define setup_timer(_timer, _function, _data) \ ++do { \ ++ (_timer)->function = _function; \ ++ (_timer)->data = _data; \ ++ init_timer(_timer); \ ++} while (0) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) ++#if defined(skb_vlan_tag_present) && !defined(vlan_tx_tag_present) ++#define vlan_tx_tag_present skb_vlan_tag_present ++#endif ++#if defined(skb_vlan_tag_get) && !defined(vlan_tx_tag_get) ++#define vlan_tx_tag_get skb_vlan_tag_get ++#endif ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) ++ ++#define RTL_ALLOC_SKB_INTR(napi, length) dev_alloc_skb(length) ++#define R8125_USE_NAPI_ALLOC_SKB 0 ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ++#undef RTL_ALLOC_SKB_INTR ++#define RTL_ALLOC_SKB_INTR(napi, length) napi_alloc_skb(napi, length) ++#undef R8125_USE_NAPI_ALLOC_SKB ++#define R8125_USE_NAPI_ALLOC_SKB 1 ++#endif ++#endif ++ ++#define RTL_BUILD_SKB_INTR(data, frag_size) build_skb(data, frag_size) ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,12,0) ++#undef RTL_BUILD_SKB_INTR ++#define RTL_BUILD_SKB_INTR(data, frag_size) napi_build_skb(data, frag_size) ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ++#define eth_random_addr(addr) random_ether_addr(addr) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++#define netdev_features_t u32 ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0) ++#define NETIF_F_ALL_CSUM NETIF_F_CSUM_MASK ++#else ++#ifndef NETIF_F_ALL_CSUM ++#define NETIF_F_ALL_CSUM NETIF_F_CSUM_MASK ++#endif ++#endif 
++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37) ++#define ENABLE_R8125_PROCFS ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) ++#define ENABLE_R8125_SYSFS ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++#define NETIF_F_HW_VLAN_RX NETIF_F_HW_VLAN_CTAG_RX ++#define NETIF_F_HW_VLAN_TX NETIF_F_HW_VLAN_CTAG_TX ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) ++#define __devinit ++#define __devexit ++#define __devexit_p(func) func ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++#define CHECKSUM_PARTIAL CHECKSUM_HW ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++#define irqreturn_t void ++#define IRQ_HANDLED 1 ++#define IRQ_NONE 0 ++#define IRQ_RETVAL(x) ++#endif ++ ++#ifndef NETIF_F_RXALL ++#define NETIF_F_RXALL 0 ++#endif ++ ++#ifndef NETIF_F_RXFCS ++#define NETIF_F_RXFCS 0 ++#endif ++ ++#if !defined(HAVE_FREE_NETDEV) && (LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0)) ++#define free_netdev(x) kfree(x) ++#endif ++ ++#ifndef SET_NETDEV_DEV ++#define SET_NETDEV_DEV(net, pdev) ++#endif ++ ++#ifndef SET_MODULE_OWNER ++#define SET_MODULE_OWNER(dev) ++#endif ++ ++#ifndef SA_SHIRQ ++#define SA_SHIRQ IRQF_SHARED ++#endif ++ ++#ifndef NETIF_F_GSO ++#define gso_size tso_size ++#define gso_segs tso_segs ++#endif ++ ++#ifndef PCI_VENDOR_ID_DLINK ++#define PCI_VENDOR_ID_DLINK 0x1186 ++#endif ++ ++#ifndef dma_mapping_error ++#define dma_mapping_error(a,b) 0 ++#endif ++ ++#ifndef netif_err ++#define netif_err(a,b,c,d) ++#endif ++ ++#ifndef AUTONEG_DISABLE ++#define AUTONEG_DISABLE 0x00 ++#endif ++ ++#ifndef AUTONEG_ENABLE ++#define AUTONEG_ENABLE 0x01 ++#endif ++ ++#ifndef BMCR_SPEED1000 ++#define BMCR_SPEED1000 0x0040 ++#endif ++ ++#ifndef BMCR_SPEED100 ++#define BMCR_SPEED100 0x2000 ++#endif ++ ++#ifndef BMCR_SPEED10 ++#define BMCR_SPEED10 0x0000 ++#endif ++ ++#ifndef SPEED_UNKNOWN ++#define SPEED_UNKNOWN -1 ++#endif ++ ++#ifndef DUPLEX_UNKNOWN ++#define DUPLEX_UNKNOWN 0xff ++#endif ++ ++#ifndef SUPPORTED_Pause ++#define SUPPORTED_Pause (1 << 13) ++#endif ++ ++#ifndef SUPPORTED_Asym_Pause ++#define SUPPORTED_Asym_Pause (1 << 14) ++#endif ++ ++#ifndef MDIO_EEE_100TX ++#define MDIO_EEE_100TX 0x0002 ++#endif ++ ++#ifndef MDIO_EEE_1000T ++#define MDIO_EEE_1000T 0x0004 ++#endif ++ ++#ifndef MDIO_EEE_2_5GT ++#define MDIO_EEE_2_5GT 0x0001 ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,9,0) ++#define ethtool_keee ethtool_eee ++#define rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t ethtool_adv_to_mmd_eee_adv_t ++static inline u32 rtl8125_ethtool_adv_to_mmd_eee_adv_cap2_t(u32 adv) ++{ ++ u32 result = 0; ++ ++ if (adv & SUPPORTED_2500baseX_Full) ++ result |= MDIO_EEE_2_5GT; ++ ++ return result; ++} ++#else ++#define rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t linkmode_to_mii_eee_cap1_t ++#define rtl8125_ethtool_adv_to_mmd_eee_adv_cap2_t linkmode_to_mii_eee_cap2_t ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,9,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++#ifdef CONFIG_NET_POLL_CONTROLLER ++#define RTL_NET_POLL_CONTROLLER dev->poll_controller=rtl8125_netpoll ++#else ++#define RTL_NET_POLL_CONTROLLER ++#endif ++ ++#ifdef CONFIG_R8125_VLAN ++#define RTL_SET_VLAN dev->vlan_rx_register=rtl8125_vlan_rx_register ++#else ++#define RTL_SET_VLAN ++#endif ++ ++#define RTL_NET_DEVICE_OPS(ops) dev->open=rtl8125_open; \ ++ dev->hard_start_xmit=rtl8125_start_xmit; \ ++ dev->get_stats=rtl8125_get_stats; \ ++ dev->stop=rtl8125_close; \ ++ dev->tx_timeout=rtl8125_tx_timeout; \ ++ dev->set_multicast_list=rtl8125_set_rx_mode; \ ++ 
dev->change_mtu=rtl8125_change_mtu; \ ++ dev->set_mac_address=rtl8125_set_mac_address; \ ++ dev->do_ioctl=rtl8125_do_ioctl; \ ++ RTL_NET_POLL_CONTROLLER; \ ++ RTL_SET_VLAN; ++#else ++#define RTL_NET_DEVICE_OPS(ops) dev->netdev_ops=&ops ++#endif ++ ++#ifndef FALSE ++#define FALSE 0 ++#endif ++ ++#ifndef TRUE ++#define TRUE 1 ++#endif ++ ++#ifndef false ++#define false 0 ++#endif ++ ++#ifndef true ++#define true 1 ++#endif ++ ++//Hardware will continue interrupt 10 times after interrupt finished. ++#define RTK_KEEP_INTERRUPT_COUNT (10) ++ ++//the low 32 bit address of receive buffer must be 8-byte alignment. ++#ifndef NET_IP_ALIGN ++#define NET_IP_ALIGN 2 ++#endif ++#define R8125_RX_ALIGN NET_IP_ALIGN ++ ++#ifdef CONFIG_R8125_NAPI ++#define NAPI_SUFFIX "-NAPI" ++#else ++#define NAPI_SUFFIX "" ++#endif ++ ++#if defined(ENABLE_REALWOW_SUPPORT) ++#define REALWOW_SUFFIX "-REALWOW" ++#else ++#define REALWOW_SUFFIX "" ++#endif ++ ++#if defined(ENABLE_DASH_SUPPORT) ++#define DASH_SUFFIX "-DASH" ++#else ++#define DASH_SUFFIX "" ++#endif ++ ++#if defined(ENABLE_PTP_SUPPORT) ++#define PTP_SUFFIX "-PTP" ++#else ++#define PTP_SUFFIX "" ++#endif ++ ++#if defined(ENABLE_RSS_SUPPORT) ++#define RSS_SUFFIX "-RSS" ++#else ++#define RSS_SUFFIX "" ++#endif ++ ++#define RTL8125_VERSION "9.016.01" NAPI_SUFFIX DASH_SUFFIX REALWOW_SUFFIX PTP_SUFFIX RSS_SUFFIX ++#define MODULENAME "r8125" ++#define PFX MODULENAME ": " ++ ++#define GPL_CLAIM "\ ++r8125 Copyright (C) 2025 Realtek NIC software team \n \ ++This program comes with ABSOLUTELY NO WARRANTY; for details, please see . \n \ ++This is free software, and you are welcome to redistribute it under certain conditions; see . \n" ++ ++#ifdef RTL8125_DEBUG ++#define assert(expr) \ ++ if(!(expr)) { \ ++ printk("Assertion failed! %s,%s,%s,line=%d\n", \ ++ #expr,__FILE__,__FUNCTION__,__LINE__); \ ++ } ++#define dprintk(fmt, args...) do { printk(PFX fmt, ## args); } while (0) ++#else ++#define assert(expr) do {} while (0) ++#define dprintk(fmt, args...) do {} while (0) ++#endif /* RTL8125_DEBUG */ ++ ++#define R8125_MSG_DEFAULT \ ++ (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN) ++ ++#ifdef CONFIG_R8125_NAPI ++#define rtl8125_rx_hwaccel_skb vlan_hwaccel_receive_skb ++#define rtl8125_rx_quota(count, quota) min(count, quota) ++#else ++#define rtl8125_rx_hwaccel_skb vlan_hwaccel_rx ++#define rtl8125_rx_quota(count, quota) count ++#endif ++ ++#ifdef CONFIG_R8125_NAPI ++#define r8125_spin_lock(lock, flags) (void)flags;spin_lock_bh(lock) ++#define r8125_spin_unlock(lock, flags) (void)flags;spin_unlock_bh(lock) ++#else ++#define r8125_spin_lock(lock, flags) spin_lock_irqsave(lock, flags) ++#define r8125_spin_unlock(lock, flags) spin_unlock_irqrestore(lock, flags) ++#endif ++ ++/* MAC address length */ ++#ifndef MAC_ADDR_LEN ++#define MAC_ADDR_LEN 6 ++#endif ++ ++#ifndef MAC_PROTOCOL_LEN ++#define MAC_PROTOCOL_LEN 2 ++#endif ++ ++#ifndef ETH_FCS_LEN ++#define ETH_FCS_LEN 4 ++#endif ++ ++#ifndef NETIF_F_TSO6 ++#define NETIF_F_TSO6 0 ++#endif ++ ++#define Reserved2_data 7 ++#define RX_DMA_BURST_unlimited 7 /* Maximum PCI burst, '7' is unlimited */ ++#define RX_DMA_BURST_512 5 ++#define RX_DMA_BURST_256 4 ++#define TX_DMA_BURST_unlimited 7 ++#define TX_DMA_BURST_1024 6 ++#define TX_DMA_BURST_512 5 ++#define TX_DMA_BURST_256 4 ++#define TX_DMA_BURST_128 3 ++#define TX_DMA_BURST_64 2 ++#define TX_DMA_BURST_32 1 ++#define TX_DMA_BURST_16 0 ++#define Reserved1_data 0x3F ++#define RxPacketMaxSize 0x3FE8 /* 16K - 1 - ETH_HLEN - VLAN - CRC... 
*/ ++#define Jumbo_Frame_1k ETH_DATA_LEN ++#define Jumbo_Frame_2k (2*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_3k (3*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_4k (4*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_5k (5*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_6k (6*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_7k (7*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_8k (8*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_9k (9*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define InterFrameGap 0x03 /* 3 means InterFrameGap = the shortest one */ ++#define RxEarly_off_V1 (0x07 << 11) ++#define RxEarly_off_V2 (1 << 11) ++#define Rx_Single_fetch_V2 (1 << 14) ++#define Rx_Close_Multiple (1 << 21) ++#define Rx_Fetch_Number_8 (1 << 30) ++ ++#define R8125_REGS_SIZE (256) ++#define R8125_MAC_REGS_SIZE (256) ++#define R8125_PHY_REGS_SIZE (16*2) ++#define R8125_EPHY_REGS_SIZE (31*2) ++#define R8125_ERI_REGS_SIZE (0x100) ++#define R8125_REGS_DUMP_SIZE (0x400) ++#define R8125_PCI_REGS_SIZE (0x100) ++#define R8125_NAPI_WEIGHT 64 ++ ++#define R8125_MAX_MSIX_VEC_8125A 4 ++#define R8125_MAX_MSIX_VEC_8125B 32 ++#define R8125_MAX_MSIX_VEC_8125D 32 ++#define R8125_MIN_MSIX_VEC_8125B 22 ++#define R8125_MIN_MSIX_VEC_8125BP 32 ++#define R8125_MIN_MSIX_VEC_8125CP 31 ++#define R8125_MIN_MSIX_VEC_8125D 20 ++#define R8125_MAX_MSIX_VEC 32 ++#define R8125_MAX_RX_QUEUES_VEC_V3 (16) ++ ++#define RTL8125_TX_TIMEOUT (6 * HZ) ++#define RTL8125_LINK_TIMEOUT (1 * HZ) ++#define RTL8125_ESD_TIMEOUT (2 * HZ) ++#define RTL8125_DASH_TIMEOUT (0) ++ ++#define rtl8125_rx_page_size(order) (PAGE_SIZE << order) ++ ++#define MAX_NUM_TX_DESC 1024 /* Maximum number of Tx descriptor registers */ ++#define MAX_NUM_RX_DESC 1024 /* Maximum number of Rx descriptor registers */ ++ ++#define MIN_NUM_TX_DESC 256 /* Minimum number of Tx descriptor registers */ ++#define MIN_NUM_RX_DESC 256 /* Minimum number of Rx descriptor registers */ ++ ++#define NUM_TX_DESC MAX_NUM_TX_DESC /* Number of Tx descriptor registers */ ++#define NUM_RX_DESC MAX_NUM_RX_DESC /* Number of Rx descriptor registers */ ++ ++#ifdef ENABLE_DOUBLE_VLAN ++#define RX_BUF_SIZE 0x05F6 /* 0x05F6(1526) = 1514 + 8(double vlan) + 4(crc) bytes */ ++#define RT_VALN_HLEN 8 /* 8(double vlan) bytes */ ++#else ++#define RX_BUF_SIZE 0x05F2 /* 0x05F2(1522) = 1514 + 4(single vlan) + 4(crc) bytes */ ++#define RT_VALN_HLEN 4 /* 4(single vlan) bytes */ ++#endif ++ ++#define R8125_MAX_TX_QUEUES (2) ++#define R8125_MAX_RX_QUEUES_V2 (4) ++#define R8125_MAX_RX_QUEUES_V3 (16) ++#define R8125_MAX_RX_QUEUES R8125_MAX_RX_QUEUES_V3 ++#define R8125_MAX_QUEUES R8125_MAX_RX_QUEUES ++ ++#define OCP_STD_PHY_BASE 0xa400 ++ ++//Channel Wait Count ++#define R8125_CHANNEL_WAIT_COUNT (20000) ++#define R8125_CHANNEL_WAIT_TIME (1) // 1us ++#define R8125_CHANNEL_EXIT_DELAY_TIME (20) //20us ++ ++#ifdef ENABLE_LIB_SUPPORT ++#define R8125_MULTI_RX_Q(tp) 0 ++#else ++#define R8125_MULTI_RX_Q(tp) (tp->num_rx_rings > 1) ++#endif ++ ++#define NODE_ADDRESS_SIZE 6 ++ ++#define SHORT_PACKET_PADDING_BUF_SIZE 256 ++ ++#define RTK_MAGIC_DEBUG_VALUE 0x0badbeef ++ ++/* write/read MMIO register */ ++#define RTL_W8(tp, reg, val8) writeb((val8), tp->mmio_addr + (reg)) ++#define RTL_W16(tp, reg, val16) writew((val16), tp->mmio_addr + (reg)) ++#define RTL_W32(tp, reg, val32) writel((val32), tp->mmio_addr + (reg)) ++#define RTL_R8(tp, reg) readb(tp->mmio_addr + (reg)) ++#define RTL_R16(tp, reg) 
readw(tp->mmio_addr + (reg)) ++#define RTL_R32(tp, reg) ((unsigned long) readl(tp->mmio_addr + (reg))) ++ ++#ifndef DMA_64BIT_MASK ++#define DMA_64BIT_MASK 0xffffffffffffffffULL ++#endif ++ ++#ifndef DMA_32BIT_MASK ++#define DMA_32BIT_MASK 0x00000000ffffffffULL ++#endif ++ ++#ifndef NETDEV_TX_OK ++#define NETDEV_TX_OK 0 /* driver took care of packet */ ++#endif ++ ++#ifndef NETDEV_TX_BUSY ++#define NETDEV_TX_BUSY 1 /* driver tx path was busy*/ ++#endif ++ ++#ifndef NETDEV_TX_LOCKED ++#define NETDEV_TX_LOCKED -1t /* driver tx lock was already taken */ ++#endif ++ ++#ifndef ADVERTISED_Pause ++#define ADVERTISED_Pause (1 << 13) ++#endif ++ ++#ifndef ADVERTISED_Asym_Pause ++#define ADVERTISED_Asym_Pause (1 << 14) ++#endif ++ ++#ifndef ADVERTISE_PAUSE_CAP ++#define ADVERTISE_PAUSE_CAP 0x400 ++#endif ++ ++#ifndef ADVERTISE_PAUSE_ASYM ++#define ADVERTISE_PAUSE_ASYM 0x800 ++#endif ++ ++#ifndef MII_CTRL1000 ++#define MII_CTRL1000 0x09 ++#endif ++ ++#ifndef ADVERTISE_1000FULL ++#define ADVERTISE_1000FULL 0x200 ++#endif ++ ++#ifndef ADVERTISE_1000HALF ++#define ADVERTISE_1000HALF 0x100 ++#endif ++ ++#ifndef ADVERTISED_2500baseX_Full ++#define ADVERTISED_2500baseX_Full 0x8000 ++#endif ++ ++#define RTK_ADVERTISE_2500FULL 0x80 ++#define RTK_ADVERTISE_5000FULL 0x100 ++#define RTK_ADVERTISE_10000FULL 0x1000 ++#define RTK_LPA_ADVERTISE_2500FULL 0x20 ++#define RTK_LPA_ADVERTISE_5000FULL 0x40 ++#define RTK_LPA_ADVERTISE_10000FULL 0x800 ++ ++#define RTK_EEE_ADVERTISE_2500FULL BIT(0) ++#define RTK_EEE_ADVERTISE_5000FULL BIT(1) ++#define RTK_LPA_EEE_ADVERTISE_2500FULL BIT(0) ++#define RTK_LPA_EEE_ADVERTISE_5000FULL BIT(1) ++ ++/* Tx NO CLOSE */ ++#define MAX_TX_NO_CLOSE_DESC_PTR_V2 0x10000 ++#define MAX_TX_NO_CLOSE_DESC_PTR_MASK_V2 0xFFFF ++#define MAX_TX_NO_CLOSE_DESC_PTR_V3 0x100000000 ++#define MAX_TX_NO_CLOSE_DESC_PTR_MASK_V3 0xFFFFFFFF ++#define MAX_TX_NO_CLOSE_DESC_PTR_V4 0x80000000 ++#define MAX_TX_NO_CLOSE_DESC_PTR_MASK_V4 0x7FFFFFFF ++#define TX_NO_CLOSE_SW_PTR_MASK_V2 0x1FFFF ++ ++#ifndef ETH_MIN_MTU ++#define ETH_MIN_MTU 68 ++#endif ++ ++#define D0_SPEED_UP_SPEED_DISABLE 0 ++#define D0_SPEED_UP_SPEED_1000 1 ++#define D0_SPEED_UP_SPEED_2500 2 ++ ++#define RTL8125_MAC_MCU_PAGE_SIZE 256 //256 words ++ ++#ifndef WRITE_ONCE ++#define WRITE_ONCE(var, val) (*((volatile typeof(val) *)(&(var))) = (val)) ++#endif ++#ifndef READ_ONCE ++#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var)))) ++#endif ++ ++#define R8125_LINK_STATE_OFF 0 ++#define R8125_LINK_STATE_ON 1 ++#define R8125_LINK_STATE_UNKNOWN 2 ++ ++/*****************************************************************************/ ++ ++//#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,3) ++#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2,4,27)) || \ ++ ((LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) && \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,3)))) ++/* copied from linux kernel 2.6.20 include/linux/netdev.h */ ++#define NETDEV_ALIGN 32 ++#define NETDEV_ALIGN_CONST (NETDEV_ALIGN - 1) ++ ++static inline void *netdev_priv(struct net_device *dev) ++{ ++ return (char *)dev + ((sizeof(struct net_device) ++ + NETDEV_ALIGN_CONST) ++ & ~NETDEV_ALIGN_CONST); ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,3) ++ ++/*****************************************************************************/ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++#define RTLDEV tp ++#else ++#define RTLDEV dev ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++/*****************************************************************************/ ++ ++#if LINUX_VERSION_CODE < 
KERNEL_VERSION(2,6,24) ++typedef struct net_device *napi_ptr; ++typedef int *napi_budget; ++ ++#define napi dev ++#define RTL_NAPI_CONFIG(ndev, priv, function, weig) ndev->poll=function; \ ++ ndev->weight=weig; ++#define RTL_NAPI_QUOTA(budget, ndev) min(*budget, ndev->quota) ++#define RTL_GET_PRIV(stuct_ptr, priv_struct) netdev_priv(stuct_ptr) ++#define RTL_GET_NETDEV(priv_ptr) ++#define RTL_RX_QUOTA(budget) *budget ++#define RTL_NAPI_QUOTA_UPDATE(ndev, work_done, budget) *budget -= work_done; \ ++ ndev->quota -= work_done; ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) netif_rx_complete(dev) ++#define RTL_NETIF_RX_SCHEDULE_PREP(dev, napi) netif_rx_schedule_prep(dev) ++#define __RTL_NETIF_RX_SCHEDULE(dev, napi) __netif_rx_schedule(dev) ++#define RTL_NAPI_RETURN_VALUE work_done >= work_to_do ++#define RTL_NAPI_ENABLE(dev, napi) netif_poll_enable(dev) ++#define RTL_NAPI_DISABLE(dev, napi) netif_poll_disable(dev) ++#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) ++#else ++typedef struct napi_struct *napi_ptr; ++typedef int napi_budget; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,1,0) ++#define RTL_NAPI_CONFIG(ndev, priv, function, weight) netif_napi_add_weight(ndev, &priv->napi, function, weight) ++#else ++#define RTL_NAPI_CONFIG(ndev, priv, function, weight) netif_napi_add(ndev, &priv->napi, function, weight) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(6,1,0) ++#define RTL_NAPI_QUOTA(budget, ndev) min(budget, budget) ++#define RTL_GET_PRIV(stuct_ptr, priv_struct) container_of(stuct_ptr, priv_struct, stuct_ptr) ++#define RTL_GET_NETDEV(priv_ptr) struct net_device *dev = priv_ptr->dev; ++#define RTL_RX_QUOTA(budget) budget ++#define RTL_NAPI_QUOTA_UPDATE(ndev, work_done, budget) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) netif_rx_complete(dev, napi) ++#define RTL_NETIF_RX_SCHEDULE_PREP(dev, napi) netif_rx_schedule_prep(dev, napi) ++#define __RTL_NETIF_RX_SCHEDULE(dev, napi) __netif_rx_schedule(dev, napi) ++#endif ++#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,29) ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) netif_rx_complete(napi) ++#define RTL_NETIF_RX_SCHEDULE_PREP(dev, napi) netif_rx_schedule_prep(napi) ++#define __RTL_NETIF_RX_SCHEDULE(dev, napi) __netif_rx_schedule(napi) ++#endif ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,29) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) napi_complete_done(napi, work_done) ++#else ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) napi_complete(napi) ++#endif ++#define RTL_NETIF_RX_SCHEDULE_PREP(dev, napi) napi_schedule_prep(napi) ++#define __RTL_NETIF_RX_SCHEDULE(dev, napi) __napi_schedule(napi) ++#endif ++#define RTL_NAPI_RETURN_VALUE work_done ++#define RTL_NAPI_ENABLE(dev, napi) napi_enable(napi) ++#define RTL_NAPI_DISABLE(dev, napi) napi_disable(napi) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) ++#define RTL_NAPI_DEL(priv) ++#else ++#define RTL_NAPI_DEL(priv) netif_napi_del(&priv->napi) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) ++ ++/*****************************************************************************/ ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) ++#define RTL_NAPI_CONSUME_SKB_ANY(skb, budget) napi_consume_skb(skb, budget) ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ++#define RTL_NAPI_CONSUME_SKB_ANY(skb, budget) dev_consume_skb_any(skb); ++#else ++#define 
RTL_NAPI_CONSUME_SKB_ANY(skb, budget) dev_kfree_skb_any(skb); ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) ++#else //CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ++#define RTL_NAPI_CONSUME_SKB_ANY(skb, budget) dev_consume_skb_any(skb); ++#else ++#define RTL_NAPI_CONSUME_SKB_ANY(skb, budget) dev_kfree_skb_any(skb); ++#endif ++#endif //CONFIG_R8125_NAPI ++ ++/*****************************************************************************/ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++#ifdef __CHECKER__ ++#define __iomem __attribute__((noderef, address_space(2))) ++extern void __chk_io_ptr(void __iomem *); ++#define __bitwise __attribute__((bitwise)) ++#else ++#define __iomem ++#define __chk_io_ptr(x) (void)0 ++#define __bitwise ++#endif ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++ ++/*****************************************************************************/ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) ++#ifdef __CHECKER__ ++#define __force __attribute__((force)) ++#else ++#define __force ++#endif ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) ++ ++#ifndef module_param ++#define module_param(v,t,p) MODULE_PARM(v, "i"); ++#endif ++ ++#ifndef PCI_DEVICE ++#define PCI_DEVICE(vend,dev) \ ++ .vendor = (vend), .device = (dev), \ ++ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID ++#endif ++ ++/*****************************************************************************/ ++/* 2.5.28 => 2.4.23 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,28)) ++ ++static inline void _kc_synchronize_irq(void) ++{ ++ synchronize_irq(); ++} ++#undef synchronize_irq ++#define synchronize_irq(X) _kc_synchronize_irq() ++ ++#include ++#define work_struct tq_struct ++#undef INIT_WORK ++#define INIT_WORK(a,b,c) INIT_TQUEUE(a,(void (*)(void *))b,c) ++#undef container_of ++#define container_of list_entry ++#define schedule_work schedule_task ++#define flush_scheduled_work flush_scheduled_tasks ++#endif /* 2.5.28 => 2.4.17 */ ++ ++/*****************************************************************************/ ++/* 2.6.4 => 2.6.0 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)) ++#define MODULE_VERSION(_version) MODULE_INFO(version, _version) ++#endif /* 2.6.4 => 2.6.0 */ ++/*****************************************************************************/ ++/* 2.6.0 => 2.5.28 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++#define MODULE_INFO(version, _version) ++#ifndef CONFIG_E1000_DISABLE_PACKET_SPLIT ++#define CONFIG_E1000_DISABLE_PACKET_SPLIT 1 ++#endif ++ ++#define pci_set_consistent_dma_mask(dev,mask) 1 ++ ++#undef dev_put ++#define dev_put(dev) __dev_put(dev) ++ ++#ifndef skb_fill_page_desc ++#define skb_fill_page_desc _kc_skb_fill_page_desc ++extern void _kc_skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size); ++#endif ++ ++#ifndef pci_dma_mapping_error ++#define pci_dma_mapping_error _kc_pci_dma_mapping_error ++static inline int _kc_pci_dma_mapping_error(dma_addr_t dma_addr) ++{ ++ return dma_addr == 0; ++} ++#endif ++ ++#undef ALIGN ++#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) ++ ++#endif /* 2.6.0 => 2.5.28 */ ++ ++/*****************************************************************************/ ++/* 2.4.22 => 2.4.17 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,22)) ++#define pci_name(x) ((x)->slot_name) ++#endif /* 2.4.22 => 2.4.17 */ ++ ++/*****************************************************************************/ ++/* 2.6.5 => 2.6.0 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,5)) ++#define 
pci_dma_sync_single_for_cpu pci_dma_sync_single ++#define pci_dma_sync_single_for_device pci_dma_sync_single_for_cpu ++#endif /* 2.6.5 => 2.6.0 */ ++ ++/*****************************************************************************/ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++/* ++ * initialize a work-struct's func and data pointers: ++ */ ++#define PREPARE_WORK(_work, _func, _data) \ ++ do { \ ++ (_work)->func = _func; \ ++ (_work)->data = _data; \ ++ } while (0) ++ ++#endif ++/*****************************************************************************/ ++/* 2.6.4 => 2.6.0 */ ++#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2,4,25) && \ ++ LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22)) || \ ++ (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) && \ ++ LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4))) ++#define ETHTOOL_OPS_COMPAT ++#endif /* 2.6.4 => 2.6.0 */ ++ ++/*****************************************************************************/ ++/* Installations with ethtool version without eeprom, adapter id, or statistics ++ * support */ ++ ++#ifndef ETH_GSTRING_LEN ++#define ETH_GSTRING_LEN 32 ++#endif ++ ++#ifndef ETHTOOL_GSTATS ++#define ETHTOOL_GSTATS 0x1d ++#undef ethtool_drvinfo ++#define ethtool_drvinfo k_ethtool_drvinfo ++struct k_ethtool_drvinfo { ++ u32 cmd; ++ char driver[32]; ++ char version[32]; ++ char fw_version[32]; ++ char bus_info[32]; ++ char reserved1[32]; ++ char reserved2[16]; ++ u32 n_stats; ++ u32 testinfo_len; ++ u32 eedump_len; ++ u32 regdump_len; ++}; ++ ++struct ethtool_stats { ++ u32 cmd; ++ u32 n_stats; ++ u64 data[0]; ++}; ++#endif /* ETHTOOL_GSTATS */ ++ ++#ifndef ETHTOOL_PHYS_ID ++#define ETHTOOL_PHYS_ID 0x1c ++#endif /* ETHTOOL_PHYS_ID */ ++ ++#ifndef ETHTOOL_GSTRINGS ++#define ETHTOOL_GSTRINGS 0x1b ++enum ethtool_stringset { ++ ETH_SS_TEST = 0, ++ ETH_SS_STATS, ++}; ++struct ethtool_gstrings { ++ u32 cmd; /* ETHTOOL_GSTRINGS */ ++ u32 string_set; /* string set id e.c. ETH_SS_TEST, etc*/ ++ u32 len; /* number of strings in the string set */ ++ u8 data[0]; ++}; ++#endif /* ETHTOOL_GSTRINGS */ ++ ++#ifndef ETHTOOL_TEST ++#define ETHTOOL_TEST 0x1a ++enum ethtool_test_flags { ++ ETH_TEST_FL_OFFLINE = (1 << 0), ++ ETH_TEST_FL_FAILED = (1 << 1), ++}; ++struct ethtool_test { ++ u32 cmd; ++ u32 flags; ++ u32 reserved; ++ u32 len; ++ u64 data[0]; ++}; ++#endif /* ETHTOOL_TEST */ ++ ++#ifndef ETHTOOL_GEEPROM ++#define ETHTOOL_GEEPROM 0xb ++#undef ETHTOOL_GREGS ++struct ethtool_eeprom { ++ u32 cmd; ++ u32 magic; ++ u32 offset; ++ u32 len; ++ u8 data[0]; ++}; ++ ++struct ethtool_value { ++ u32 cmd; ++ u32 data; ++}; ++#endif /* ETHTOOL_GEEPROM */ ++ ++#ifndef ETHTOOL_GLINK ++#define ETHTOOL_GLINK 0xa ++#endif /* ETHTOOL_GLINK */ ++ ++#ifndef ETHTOOL_GREGS ++#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers */ ++#define ethtool_regs _kc_ethtool_regs ++/* for passing big chunks of data */ ++struct _kc_ethtool_regs { ++ u32 cmd; ++ u32 version; /* driver-specific, indicates different chips/revs */ ++ u32 len; /* bytes */ ++ u8 data[0]; ++}; ++#endif /* ETHTOOL_GREGS */ ++ ++#ifndef ETHTOOL_GMSGLVL ++#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ ++#endif ++#ifndef ETHTOOL_SMSGLVL ++#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level, priv. 
*/ ++#endif ++#ifndef ETHTOOL_NWAY_RST ++#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation, priv */ ++#endif ++#ifndef ETHTOOL_GLINK ++#define ETHTOOL_GLINK 0x0000000a /* Get link status */ ++#endif ++#ifndef ETHTOOL_GEEPROM ++#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ ++#endif ++#ifndef ETHTOOL_SEEPROM ++#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data */ ++#endif ++#ifndef ETHTOOL_GCOALESCE ++#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ ++/* for configuring coalescing parameters of chip */ ++#define ethtool_coalesce _kc_ethtool_coalesce ++struct _kc_ethtool_coalesce { ++ u32 cmd; /* ETHTOOL_{G,S}COALESCE */ ++ ++ /* How many usecs to delay an RX interrupt after ++ * a packet arrives. If 0, only rx_max_coalesced_frames ++ * is used. ++ */ ++ u32 rx_coalesce_usecs; ++ ++ /* How many packets to delay an RX interrupt after ++ * a packet arrives. If 0, only rx_coalesce_usecs is ++ * used. It is illegal to set both usecs and max frames ++ * to zero as this would cause RX interrupts to never be ++ * generated. ++ */ ++ u32 rx_max_coalesced_frames; ++ ++ /* Same as above two parameters, except that these values ++ * apply while an IRQ is being serviced by the host. Not ++ * all cards support this feature and the values are ignored ++ * in that case. ++ */ ++ u32 rx_coalesce_usecs_irq; ++ u32 rx_max_coalesced_frames_irq; ++ ++ /* How many usecs to delay a TX interrupt after ++ * a packet is sent. If 0, only tx_max_coalesced_frames ++ * is used. ++ */ ++ u32 tx_coalesce_usecs; ++ ++ /* How many packets to delay a TX interrupt after ++ * a packet is sent. If 0, only tx_coalesce_usecs is ++ * used. It is illegal to set both usecs and max frames ++ * to zero as this would cause TX interrupts to never be ++ * generated. ++ */ ++ u32 tx_max_coalesced_frames; ++ ++ /* Same as above two parameters, except that these values ++ * apply while an IRQ is being serviced by the host. Not ++ * all cards support this feature and the values are ignored ++ * in that case. ++ */ ++ u32 tx_coalesce_usecs_irq; ++ u32 tx_max_coalesced_frames_irq; ++ ++ /* How many usecs to delay in-memory statistics ++ * block updates. Some drivers do not have an in-memory ++ * statistic block, and in such cases this value is ignored. ++ * This value must not be zero. ++ */ ++ u32 stats_block_coalesce_usecs; ++ ++ /* Adaptive RX/TX coalescing is an algorithm implemented by ++ * some drivers to improve latency under low packet rates and ++ * improve throughput under high packet rates. Some drivers ++ * only implement one of RX or TX adaptive coalescing. Anything ++ * not implemented by the driver causes these values to be ++ * silently ignored. ++ */ ++ u32 use_adaptive_rx_coalesce; ++ u32 use_adaptive_tx_coalesce; ++ ++ /* When the packet rate (measured in packets per second) ++ * is below pkt_rate_low, the {rx,tx}_*_low parameters are ++ * used. ++ */ ++ u32 pkt_rate_low; ++ u32 rx_coalesce_usecs_low; ++ u32 rx_max_coalesced_frames_low; ++ u32 tx_coalesce_usecs_low; ++ u32 tx_max_coalesced_frames_low; ++ ++ /* When the packet rate is below pkt_rate_high but above ++ * pkt_rate_low (both measured in packets per second) the ++ * normal {rx,tx}_* coalescing parameters are used. ++ */ ++ ++ /* When the packet rate is (measured in packets per second) ++ * is above pkt_rate_high, the {rx,tx}_*_high parameters are ++ * used. 
++ */ ++ u32 pkt_rate_high; ++ u32 rx_coalesce_usecs_high; ++ u32 rx_max_coalesced_frames_high; ++ u32 tx_coalesce_usecs_high; ++ u32 tx_max_coalesced_frames_high; ++ ++ /* How often to do adaptive coalescing packet rate sampling, ++ * measured in seconds. Must not be zero. ++ */ ++ u32 rate_sample_interval; ++}; ++#endif /* ETHTOOL_GCOALESCE */ ++ ++#ifndef ETHTOOL_SCOALESCE ++#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ ++#endif ++#ifndef ETHTOOL_GRINGPARAM ++#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ ++/* for configuring RX/TX ring parameters */ ++#define ethtool_ringparam _kc_ethtool_ringparam ++struct _kc_ethtool_ringparam { ++ u32 cmd; /* ETHTOOL_{G,S}RINGPARAM */ ++ ++ /* Read only attributes. These indicate the maximum number ++ * of pending RX/TX ring entries the driver will allow the ++ * user to set. ++ */ ++ u32 rx_max_pending; ++ u32 rx_mini_max_pending; ++ u32 rx_jumbo_max_pending; ++ u32 tx_max_pending; ++ ++ /* Values changeable by the user. The valid values are ++ * in the range 1 to the "*_max_pending" counterpart above. ++ */ ++ u32 rx_pending; ++ u32 rx_mini_pending; ++ u32 rx_jumbo_pending; ++ u32 tx_pending; ++}; ++#endif /* ETHTOOL_GRINGPARAM */ ++ ++#ifndef ETHTOOL_SRINGPARAM ++#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters, priv. */ ++#endif ++#ifndef ETHTOOL_GPAUSEPARAM ++#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ ++/* for configuring link flow control parameters */ ++#define ethtool_pauseparam _kc_ethtool_pauseparam ++struct _kc_ethtool_pauseparam { ++ u32 cmd; /* ETHTOOL_{G,S}PAUSEPARAM */ ++ ++ /* If the link is being auto-negotiated (via ethtool_cmd.autoneg ++ * being true) the user may set 'autonet' here non-zero to have the ++ * pause parameters be auto-negotiated too. In such a case, the ++ * {rx,tx}_pause values below determine what capabilities are ++ * advertised. ++ * ++ * If 'autoneg' is zero or the link is not being auto-negotiated, ++ * then {rx,tx}_pause force the driver to use/not-use pause ++ * flow control. ++ */ ++ u32 autoneg; ++ u32 rx_pause; ++ u32 tx_pause; ++}; ++#endif /* ETHTOOL_GPAUSEPARAM */ ++ ++#ifndef ETHTOOL_SPAUSEPARAM ++#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ ++#endif ++#ifndef ETHTOOL_GRXCSUM ++#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_SRXCSUM ++#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_GTXCSUM ++#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_STXCSUM ++#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_GSG ++#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable ++* (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_SSG ++#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable ++* (ethtool_value). */ ++#endif ++#ifndef ETHTOOL_TEST ++#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test, priv. 
*/ ++#endif ++#ifndef ETHTOOL_GSTRINGS ++#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ ++#endif ++#ifndef ETHTOOL_PHYS_ID ++#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ ++#endif ++#ifndef ETHTOOL_GSTATS ++#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ ++#endif ++#ifndef ETHTOOL_GTSO ++#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_STSO ++#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ ++#endif ++ ++#ifndef ETHTOOL_BUSINFO_LEN ++#define ETHTOOL_BUSINFO_LEN 32 ++#endif ++ ++/*****************************************************************************/ ++ ++enum RTL8125_registers { ++ MAC0 = 0x00, /* Ethernet hardware address. */ ++ MAC4 = 0x04, ++ MAR0 = 0x08, /* Multicast filter. */ ++ CounterAddrLow = 0x10, ++ CounterAddrHigh = 0x14, ++ CustomLED = 0x18, ++ TxDescStartAddrLow = 0x20, ++ TxDescStartAddrHigh = 0x24, ++ TxHDescStartAddrLow = 0x28, ++ TxHDescStartAddrHigh = 0x2c, ++ FLASH = 0x30, ++ INT_CFG0_8125 = 0x34, ++ ERSR = 0x36, ++ ChipCmd = 0x37, ++ TxPoll = 0x38, ++ IntrMask = 0x3C, ++ IntrStatus = 0x3E, ++ TxConfig = 0x40, ++ RxConfig = 0x44, ++ TCTR = 0x48, ++ Cfg9346 = 0x50, ++ Config0 = 0x51, ++ Config1 = 0x52, ++ Config2 = 0x53, ++ Config3 = 0x54, ++ Config4 = 0x55, ++ Config5 = 0x56, ++ TDFNR = 0x57, ++ TimeInt0 = 0x58, ++ TimeInt1 = 0x5C, ++ PHYAR = 0x60, ++ CSIDR = 0x64, ++ CSIAR = 0x68, ++ PHYstatus = 0x6C, ++ MACDBG = 0x6D, ++ GPIO = 0x6E, ++ PMCH = 0x6F, ++ ERIDR = 0x70, ++ ERIAR = 0x74, ++ INT_CFG1_8125 = 0x7A, ++ EPHY_RXER_NUM = 0x7C, ++ EPHYAR = 0x80, ++ LEDSEL_2_8125 = 0x84, ++ LEDSEL_1_8125 = 0x86, ++ TimeInt2 = 0x8C, ++ LEDSEL_3_8125 = 0x96, ++ OCPDR = 0xB0, ++ MACOCP = 0xB0, ++ OCPAR = 0xB4, ++ SecMAC0 = 0xB4, ++ SecMAC4 = 0xB8, ++ PHYOCP = 0xB8, ++ DBG_reg = 0xD1, ++ TwiCmdReg = 0xD2, ++ MCUCmd_reg = 0xD3, ++ RxMaxSize = 0xDA, ++ EFUSEAR = 0xDC, ++ CPlusCmd = 0xE0, ++ IntrMitigate = 0xE2, ++ RxDescAddrLow = 0xE4, ++ RxDescAddrHigh = 0xE8, ++ MTPS = 0xEC, ++ FuncEvent = 0xF0, ++ PPSW = 0xF2, ++ FuncEventMask = 0xF4, ++ TimeInt3 = 0xF4, ++ FuncPresetState = 0xF8, ++ CMAC_IBCR0 = 0xF8, ++ CMAC_IBCR2 = 0xF9, ++ CMAC_IBIMR0 = 0xFA, ++ CMAC_IBISR0 = 0xFB, ++ FuncForceEvent = 0xFC, ++ //8125 ++ IMR0_8125 = 0x38, ++ ISR0_8125 = 0x3C, ++ TPPOLL_8125 = 0x90, ++ IMR1_8125 = 0x800, ++ ISR1_8125 = 0x802, ++ IMR2_8125 = 0x804, ++ ISR2_8125 = 0x806, ++ IMR3_8125 = 0x808, ++ ISR3_8125 = 0x80A, ++ BACKUP_ADDR0_8125 = 0x19E0, ++ BACKUP_ADDR1_8125 = 0X19E4, ++ TCTR0_8125 = 0x0048, ++ TCTR1_8125 = 0x004C, ++ TCTR2_8125 = 0x0088, ++ TCTR3_8125 = 0x001C, ++ TIMER_INT0_8125 = 0x0058, ++ TIMER_INT1_8125 = 0x005C, ++ TIMER_INT2_8125 = 0x008C, ++ TIMER_INT3_8125 = 0x00F4, ++ INT_MITI_V2_0_RX = 0x0A00, ++ INT_MITI_V2_0_TX = 0x0A02, ++ INT_MITI_V2_1_RX = 0x0A08, ++ INT_MITI_V2_1_TX = 0x0A0A, ++ IMR_V2_CLEAR_REG_8125 = 0x0D00, ++ ISR_V2_8125 = 0x0D04, ++ IMR_V2_SET_REG_8125 = 0x0D0C, ++ TDU_STA_8125 = 0x0D08, ++ RDU_STA_8125 = 0x0D0A, ++ IMR_V4_L2_CLEAR_REG_8125 = 0x0D10, ++ IMR_V4_L2_SET_REG_8125 = 0x0D18, ++ ISR_V4_L2_8125 = 0x0D14, ++ SW_TAIL_PTR0_8125BP = 0x0D30, ++ SW_TAIL_PTR1_8125BP = 0x0D38, ++ HW_CLO_PTR0_8125BP = 0x0D34, ++ HW_CLO_PTR1_8125BP = 0x0D3C, ++ DOUBLE_VLAN_CONFIG = 0x1000, ++ TX_NEW_CTRL = 0x203E, ++ TNPDS_Q1_LOW_8125 = 0x2100, ++ PLA_TXQ0_IDLE_CREDIT = 0x2500, ++ PLA_TXQ1_IDLE_CREDIT = 0x2504, ++ SW_TAIL_PTR0_8125 = 0x2800, ++ HW_CLO_PTR0_8125 = 0x2802, ++ SW_TAIL_PTR0_8126 = 0x2800, ++ HW_CLO_PTR0_8126 = 0x2800, ++ RDSAR_Q1_LOW_8125 = 0x4000, ++ 
RSS_CTRL_8125 = 0x4500, ++ Q_NUM_CTRL_8125 = 0x4800, ++ RSS_KEY_8125 = 0x4600, ++ RSS_INDIRECTION_TBL_8125_V2 = 0x4700, ++ EEE_TXIDLE_TIMER_8125 = 0x6048, ++ /* mac ptp */ ++ PTP_CTRL_8125 = 0x6800, ++ PTP_STATUS_8125 = 0x6802, ++ PTP_ISR_8125 = 0x6804, ++ PTP_IMR_8125 = 0x6805, ++ PTP_TIME_CORRECT_CMD_8125 = 0x6806, ++ PTP_SOFT_CONFIG_Time_NS_8125 = 0x6808, ++ PTP_SOFT_CONFIG_Time_S_8125 = 0x680C, ++ PTP_SOFT_CONFIG_Time_Sign = 0x6812, ++ PTP_LOCAL_Time_SUB_NS_8125 = 0x6814, ++ PTP_LOCAL_Time_NS_8125 = 0x6818, ++ PTP_LOCAL_Time_S_8125 = 0x681C, ++ PTP_Time_SHIFTER_S_8125 = 0x6856, ++ PPS_RISE_TIME_NS_8125 = 0x68A0, ++ PPS_RISE_TIME_S_8125 = 0x68A4, ++ PTP_EGRESS_TIME_BASE_NS_8125 = 0XCF20, ++ PTP_EGRESS_TIME_BASE_S_8125 = 0XCF24, ++ /* phy ptp */ ++ PTP_CTL = 0xE400, ++ PTP_INER = 0xE402, ++ PTP_INSR = 0xE404, ++ PTP_SYNCE_CTL = 0xE406, ++ PTP_GEN_CFG = 0xE408, ++ PTP_CLK_CFG_8126 = 0xE410, ++ PTP_CFG_NS_LO_8126 = 0xE412, ++ PTP_CFG_NS_HI_8126 = 0xE414, ++ PTP_CFG_S_LO_8126 = 0xE416, ++ PTP_CFG_S_MI_8126 = 0xE418, ++ PTP_CFG_S_HI_8126 = 0xE41A, ++ PTP_TAI_CFG = 0xE420, ++ PTP_TAI_TS_S_LO = 0xE42A, ++ PTP_TAI_TS_S_HI = 0xE42C, ++ PTP_TRX_TS_STA = 0xE430, ++ PTP_TRX_TS_NS_LO = 0xE446, ++ PTP_TRX_TS_NS_HI = 0xE448, ++ PTP_TRX_TS_S_LO = 0xE44A, ++ PTP_TRX_TS_S_MI = 0xE44C, ++ PTP_TRX_TS_S_HI = 0xE44E, ++ ++ ++ //TCAM ++ TCAM_NOTVALID_ADDR = 0xA000, ++ TCAM_VALID_ADDR = 0xA800, ++ TCAM_MAC_ADDR = 448, ++ TCAM_VLAN_TAG = 496, ++ //TCAM V2 ++ TCAM_NOTVALID_ADDR_V2 = 0xA000, ++ TCAM_VALID_ADDR_V2 = 0xB000, ++ TCAM_MAC_ADDR_V2 = 0x00, ++ TCAM_VLAN_TAG_V2 = 0x03, ++ //ipc2 ++ IB2SOC_SET = 0x0010, ++ IB2SOC_DATA = 0x0014, ++ IB2SOC_CMD = 0x0018, ++ IB2SOC_IMR = 0x001C, ++ ++ RISC_IMR_8125BP = 0x0D20, ++ RISC_ISR_8125BP = 0x0D22, ++}; ++ ++enum RTL8125_register_content { ++ /* InterruptStatusBits */ ++ SYSErr = 0x8000, ++ PCSTimeout = 0x4000, ++ SWInt = 0x0100, ++ TxDescUnavail = 0x0080, ++ RxFIFOOver = 0x0040, ++ LinkChg = 0x0020, ++ RxDescUnavail = 0x0010, ++ TxErr = 0x0008, ++ TxOK = 0x0004, ++ RxErr = 0x0002, ++ RxOK = 0x0001, ++ RxDU1 = 0x0002, ++ RxOK1 = 0x0001, ++ ++ /* RxStatusDesc */ ++ RxRWT = (1 << 22), ++ RxRES = (1 << 21), ++ RxRUNT = (1 << 20), ++ RxCRC = (1 << 19), ++ ++ RxRWT_V3 = (1 << 18), ++ RxRES_V3 = (1 << 20), ++ RxRUNT_V3 = (1 << 19), ++ RxCRC_V3 = (1 << 17), ++ ++ RxRES_V4 = (1 << 22), ++ RxRUNT_V4 = (1 << 21), ++ RxCRC_V4 = (1 << 20), ++ ++ /* ChipCmdBits */ ++ StopReq = 0x80, ++ CmdReset = 0x10, ++ CmdRxEnb = 0x08, ++ CmdTxEnb = 0x04, ++ RxBufEmpty = 0x01, ++ ++ /* Cfg9346Bits */ ++ Cfg9346_EEM_MASK = 0xC0, ++ Cfg9346_Lock = 0x00, ++ Cfg9346_Unlock = 0xC0, ++ Cfg9346_EEDO = (1 << 0), ++ Cfg9346_EEDI = (1 << 1), ++ Cfg9346_EESK = (1 << 2), ++ Cfg9346_EECS = (1 << 3), ++ Cfg9346_EEM0 = (1 << 6), ++ Cfg9346_EEM1 = (1 << 7), ++ ++ /* rx_mode_bits */ ++ AcceptErr = 0x20, ++ AcceptRunt = 0x10, ++ AcceptBroadcast = 0x08, ++ AcceptMulticast = 0x04, ++ AcceptMyPhys = 0x02, ++ AcceptAllPhys = 0x01, ++ AcceppVlanPhys = 0x8000, ++ ++ /* Transmit Priority Polling*/ ++ HPQ = 0x80, ++ NPQ = 0x40, ++ FSWInt = 0x01, ++ ++ /* RxConfigBits */ ++ Reserved2_shift = 13, ++ RxCfgDMAShift = 8, ++ EnableRxDescV3 = (1 << 24), ++ EnableRxDescV4_1 = (1 << 24), ++ EnableOuterVlan = (1 << 23), ++ EnableInnerVlan = (1 << 22), ++ RxCfg_128_int_en = (1 << 15), ++ RxCfg_fet_multi_en = (1 << 14), ++ RxCfg_half_refetch = (1 << 13), ++ RxCfg_pause_slot_en = (1 << 11), ++ RxCfg_9356SEL = (1 << 6), ++ EnableRxDescV4_0 = (1 << 1), //not in rcr ++ ++ /* TxConfigBits */ ++ TxInterFrameGapShift = 24, ++ TxDMAShift = 
8, /* DMA burst value (0-7) is shift this many bits */ ++ TxMACLoopBack = (1 << 17), /* MAC loopback */ ++ ++ /* Config1 register */ ++ LEDS1 = (1 << 7), ++ LEDS0 = (1 << 6), ++ Speed_down = (1 << 4), ++ MEMMAP = (1 << 3), ++ IOMAP = (1 << 2), ++ VPD = (1 << 1), ++ PMEnable = (1 << 0), /* Power Management Enable */ ++ ++ /* Config2 register */ ++ PMSTS_En = (1 << 5), ++ ++ /* Config3 register */ ++ Isolate_en = (1 << 12), /* Isolate enable */ ++ MagicPacket = (1 << 5), /* Wake up when receives a Magic Packet */ ++ LinkUp = (1 << 4), /* This bit is reserved in RTL8125B.*/ ++ /* Wake up when the cable connection is re-established */ ++ ECRCEN = (1 << 3), /* This bit is reserved in RTL8125B*/ ++ Jumbo_En0 = (1 << 2), /* This bit is reserved in RTL8125B*/ ++ RDY_TO_L23 = (1 << 1), /* This bit is reserved in RTL8125B*/ ++ Beacon_en = (1 << 0), /* This bit is reserved in RTL8125B*/ ++ ++ /* Config4 register */ ++ Jumbo_En1 = (1 << 1), /* This bit is reserved in RTL8125B*/ ++ ++ /* Config5 register */ ++ BWF = (1 << 6), /* Accept Broadcast wakeup frame */ ++ MWF = (1 << 5), /* Accept Multicast wakeup frame */ ++ UWF = (1 << 4), /* Accept Unicast wakeup frame */ ++ LanWake = (1 << 1), /* LanWake enable/disable */ ++ PMEStatus = (1 << 0), /* PME status can be reset by PCI RST# */ ++ ++ /* CPlusCmd */ ++ EnableBist = (1 << 15), ++ Macdbgo_oe = (1 << 14), ++ Normal_mode = (1 << 13), ++ Force_halfdup = (1 << 12), ++ Force_rxflow_en = (1 << 11), ++ Force_txflow_en = (1 << 10), ++ Cxpl_dbg_sel = (1 << 9),//This bit is reserved in RTL8125B ++ ASF = (1 << 8),//This bit is reserved in RTL8125C ++ PktCntrDisable = (1 << 7), ++ RxVlan = (1 << 6), ++ RxChkSum = (1 << 5), ++ Macdbgo_sel = 0x001C, ++ INTT_0 = 0x0000, ++ INTT_1 = 0x0001, ++ INTT_2 = 0x0002, ++ INTT_3 = 0x0003, ++ ++ /* rtl8125_PHYstatus */ ++ PowerSaveStatus = 0x80, ++ _1000bpsL = 0x80000, ++ _5000bpsF = 0x1000, ++ _2500bpsF = 0x400, ++ _2500bpsL = 0x200, ++ TxFlowCtrl = 0x40, ++ RxFlowCtrl = 0x20, ++ _1000bpsF = 0x10, ++ _100bps = 0x08, ++ _10bps = 0x04, ++ LinkStatus = 0x02, ++ FullDup = 0x01, ++ ++ /* DBG_reg */ ++ Fix_Nak_1 = (1 << 4), ++ Fix_Nak_2 = (1 << 3), ++ DBGPIN_E2 = (1 << 0), ++ ++ /* ResetCounterCommand */ ++ CounterReset = 0x1, ++ /* DumpCounterCommand */ ++ CounterDump = 0x8, ++ ++ /* PHY access */ ++ PHYAR_Flag = 0x80000000, ++ PHYAR_Write = 0x80000000, ++ PHYAR_Read = 0x00000000, ++ PHYAR_Reg_Mask = 0x1f, ++ PHYAR_Reg_shift = 16, ++ PHYAR_Data_Mask = 0xffff, ++ ++ /* EPHY access */ ++ EPHYAR_Flag = 0x80000000, ++ EPHYAR_Write = 0x80000000, ++ EPHYAR_Read = 0x00000000, ++ EPHYAR_Reg_Mask = 0x3f, ++ EPHYAR_Reg_Mask_v2 = 0x7f, ++ EPHYAR_Reg_shift = 16, ++ EPHYAR_Data_Mask = 0xffff, ++ ++ /* CSI access */ ++ CSIAR_Flag = 0x80000000, ++ CSIAR_Write = 0x80000000, ++ CSIAR_Read = 0x00000000, ++ CSIAR_ByteEn = 0x0f, ++ CSIAR_ByteEn_shift = 12, ++ CSIAR_Addr_Mask = 0x0fff, ++ ++ /* ERI access */ ++ ERIAR_Flag = 0x80000000, ++ ERIAR_Write = 0x80000000, ++ ERIAR_Read = 0x00000000, ++ ERIAR_Addr_Align = 4, /* ERI access register address must be 4 byte alignment */ ++ ERIAR_ExGMAC = 0, ++ ERIAR_MSIX = 1, ++ ERIAR_ASF = 2, ++ ERIAR_OOB = 2, ++ ERIAR_Type_shift = 16, ++ ERIAR_ByteEn = 0x0f, ++ ERIAR_ByteEn_shift = 12, ++ ++ /* OCP GPHY access */ ++ OCPDR_Write = 0x80000000, ++ OCPDR_Read = 0x00000000, ++ OCPDR_Reg_Mask = 0xFF, ++ OCPDR_Data_Mask = 0xFFFF, ++ OCPDR_GPHY_Reg_shift = 16, ++ OCPAR_Flag = 0x80000000, ++ OCPAR_GPHY_Write = 0x8000F060, ++ OCPAR_GPHY_Read = 0x0000F060, ++ OCPR_Write = 0x80000000, ++ OCPR_Read = 0x00000000, ++ 
OCPR_Addr_Reg_shift = 16, ++ OCPR_Flag = 0x80000000, ++ OCP_STD_PHY_BASE_PAGE = 0x0A40, ++ ++ /* MCU Command */ ++ Now_is_oob = (1 << 7), ++ Txfifo_empty = (1 << 5), ++ Rxfifo_empty = (1 << 4), ++ ++ /* E-FUSE access */ ++ EFUSE_WRITE = 0x80000000, ++ EFUSE_WRITE_OK = 0x00000000, ++ EFUSE_READ = 0x00000000, ++ EFUSE_READ_OK = 0x80000000, ++ EFUSE_WRITE_V3 = 0x40000000, ++ EFUSE_WRITE_OK_V3 = 0x00000000, ++ EFUSE_READ_V3 = 0x80000000, ++ EFUSE_READ_OK_V3 = 0x00000000, ++ EFUSE_Reg_Mask = 0x03FF, ++ EFUSE_Reg_Shift = 8, ++ EFUSE_Check_Cnt = 300, ++ EFUSE_READ_FAIL = 0xFF, ++ EFUSE_Data_Mask = 0x000000FF, ++ ++ /* GPIO */ ++ GPIO_en = (1 << 0), ++ ++ /* PTP */ ++ PTP_ISR_TOK = (1 << 1), ++ PTP_ISR_TER = (1 << 2), ++ PTP_EXEC_CMD = (1 << 7), ++ PTP_ADJUST_TIME_NS_NEGATIVE = (1 << 30), ++ PTP_ADJUST_TIME_S_NEGATIVE = (1ULL << 48), ++ PTP_SOFT_CONFIG_TIME_NS_NEGATIVE = (1 << 30), ++ PTP_SOFT_CONFIG_TIME_S_NEGATIVE = (1ULL << 48), ++ ++ /* New Interrupt Bits */ ++ INT_CFG0_ENABLE_8125 = (1 << 0), ++ INT_CFG0_TIMEOUT0_BYPASS_8125 = (1 << 1), ++ INT_CFG0_MITIGATION_BYPASS_8125 = (1 << 2), ++ INT_CFG0_RDU_BYPASS_8126 = (1 << 4), ++ INT_CFG0_MSIX_ENTRY_NUM_MODE = (1 << 5), ++ INT_CFG0_AUTO_CLEAR_IMR = (1 << 5), ++ INT_CFG0_AVOID_MISS_INTR = (1 << 6), ++ ISRIMR_V2_ROK_Q0 = (1 << 0), ++ ISRIMR_TOK_Q0 = (1 << 16), ++ ISRIMR_TOK_Q1 = (1 << 18), ++ ISRIMR_V2_LINKCHG = (1 << 21), ++ ++ ISRIMR_V4_ROK_Q0 = (1 << 0), ++ ISRIMR_V4_LINKCHG = (1 << 29), ++ ISRIMR_V4_LAYER2_INTR_STS = (1 << 31), ++ ISRIMR_V4_L2_IPC2 = (1 << 17), ++ ++ ISRIMR_V5_ROK_Q0 = (1 << 0), ++ ISRIMR_V5_TOK_Q0 = (1 << 16), ++ ISRIMR_V5_TOK_Q1 = (1 << 17), ++ ISRIMR_V5_LINKCHG = (1 << 18), ++ ++ ISRIMR_V7_ROK_Q0 = (1 << 0), ++ ISRIMR_V7_TOK_Q0 = (1 << 27), ++ ISRIMR_V7_TOK_Q1 = (1 << 28), ++ ISRIMR_V7_LINKCHG = (1 << 29), ++ ++ /* IPC2 */ ++ RISC_IPC2_INTR = (1 << 1), ++ ++ /* Magic Number */ ++ RTL8125_MAGIC_NUMBER = 0x0badbadbadbadbadull, ++}; ++ ++enum _DescStatusBit { ++ DescOwn = (1 << 31), /* Descriptor is owned by NIC */ ++ RingEnd = (1 << 30), /* End of descriptor ring */ ++ FirstFrag = (1 << 29), /* First segment of a packet */ ++ LastFrag = (1 << 28), /* Final segment of a packet */ ++ ++ DescOwn_V3 = (DescOwn), /* Descriptor is owned by NIC */ ++ RingEnd_V3 = (RingEnd), /* End of descriptor ring */ ++ FirstFrag_V3 = (1 << 25), /* First segment of a packet */ ++ LastFrag_V3 = (1 << 24), /* Final segment of a packet */ ++ ++ DescOwn_V4 = (DescOwn), /* Descriptor is owned by NIC */ ++ RingEnd_V4 = (RingEnd), /* End of descriptor ring */ ++ FirstFrag_V4 = (FirstFrag), /* First segment of a packet */ ++ LastFrag_V4 = (LastFrag), /* Final segment of a packet */ ++ ++ /* Tx private */ ++ /*------ offset 0 of tx descriptor ------*/ ++ LargeSend = (1 << 27), /* TCP Large Send Offload (TSO) */ ++ GiantSendv4 = (1 << 26), /* TCP Giant Send Offload V4 (GSOv4) */ ++ GiantSendv6 = (1 << 25), /* TCP Giant Send Offload V6 (GSOv6) */ ++ LargeSend_DP = (1 << 16), /* TCP Large Send Offload (TSO) */ ++ MSSShift = 16, /* MSS value position */ ++ MSSMask = 0x7FFU, /* MSS value 11 bits */ ++ TxIPCS = (1 << 18), /* Calculate IP checksum */ ++ TxUDPCS = (1 << 17), /* Calculate UDP/IP checksum */ ++ TxTCPCS = (1 << 16), /* Calculate TCP/IP checksum */ ++ TxVlanTag = (1 << 17), /* Add VLAN tag */ ++ ++ /*@@@@@@ offset 4 of tx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ TxUDPCS_C = (1 << 31), /* Calculate UDP/IP checksum */ ++ TxTCPCS_C = (1 << 30), /* Calculate TCP/IP checksum */ ++ TxIPCS_C = (1 << 29), /* Calculate IP checksum */ ++ TxIPV6F_C = 
(1 << 28), /* Indicate it is an IPv6 packet */ ++ /*@@@@@@ offset 4 of tx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ ++ /* Rx private */ ++ /*------ offset 0 of rx descriptor ------*/ ++ PID1 = (1 << 18), /* Protocol ID bit 1/2 */ ++ PID0 = (1 << 17), /* Protocol ID bit 2/2 */ ++ ++#define RxProtoUDP (PID1) ++#define RxProtoTCP (PID0) ++#define RxProtoIP (PID1 | PID0) ++#define RxProtoMask RxProtoIP ++ ++ RxIPF = (1 << 16), /* IP checksum failed */ ++ RxUDPF = (1 << 15), /* UDP/IP checksum failed */ ++ RxTCPF = (1 << 14), /* TCP/IP checksum failed */ ++ RxVlanTag = (1 << 16), /* VLAN tag available */ ++ ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxUDPT = (1 << 18), ++ RxTCPT = (1 << 17), ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxV6F = (1 << 31), ++ RxV4F = (1 << 30), ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ ++ PID1_v3 = (1 << 29), /* Protocol ID bit 1/2 */ ++ PID0_v3 = (1 << 28), /* Protocol ID bit 2/2 */ ++ ++#define RxProtoUDP_v3 (PID1_v3) ++#define RxProtoTCP_v3 (PID0_v3) ++#define RxProtoIP_v3 (PID1_v3 | PID0_v3) ++#define RxProtoMask_v3 RxProtoIP_v3 ++ ++ RxIPF_v3 = (1 << 26), /* IP checksum failed */ ++ RxUDPF_v3 = (1 << 25), /* UDP/IP checksum failed */ ++ RxTCPF_v3 = (1 << 24), /* TCP/IP checksum failed */ ++ RxSCTPF_v3 = (1 << 23), /* SCTP checksum failed */ ++ RxVlanTag_v3 = (RxVlanTag), /* VLAN tag available */ ++ ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxUDPT_v3 = (1 << 29), ++ RxTCPT_v3 = (1 << 28), ++ RxSCTP_v3 = (1 << 27), ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxV6F_v3 = (RxV6F), ++ RxV4F_v3 = (RxV4F), ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ RxIPF_v4 = (1 << 17), /* IP checksum failed */ ++ RxUDPF_v4 = (1 << 16), /* UDP/IP checksum failed */ ++ RxTCPF_v4 = (1 << 15), /* TCP/IP checksum failed */ ++ RxSCTPF_v4 = (1 << 19), /* SCTP checksum failed */ ++ RxVlanTag_v4 = (RxVlanTag), /* VLAN tag available */ ++ ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxUDPT_v4 = (1 << 19), ++ RxTCPT_v4 = (1 << 18), ++ RxSCTP_v4 = (1 << 19), ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxV6F_v4 = (RxV6F), ++ RxV4F_v4 = (RxV4F), ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++}; ++ ++enum features { ++// RTL_FEATURE_WOL = (1 << 0), ++ RTL_FEATURE_MSI = (1 << 1), ++ RTL_FEATURE_MSIX = (1 << 2), ++}; ++ ++enum wol_capability { ++ WOL_DISABLED = 0, ++ WOL_ENABLED = 1 ++}; ++ ++enum bits { ++ BIT_0 = (1 << 0), ++ BIT_1 = (1 << 1), ++ BIT_2 = (1 << 2), ++ BIT_3 = (1 << 3), ++ BIT_4 = (1 << 4), ++ BIT_5 = (1 << 5), ++ BIT_6 = (1 << 6), ++ BIT_7 = (1 << 7), ++ BIT_8 = (1 << 8), ++ BIT_9 = (1 << 9), ++ BIT_10 = (1 << 10), ++ BIT_11 = (1 << 11), ++ BIT_12 = (1 << 12), ++ BIT_13 = (1 << 13), ++ BIT_14 = (1 << 14), ++ BIT_15 = (1 << 15), ++ BIT_16 = (1 << 16), ++ BIT_17 = (1 << 17), ++ BIT_18 = (1 << 18), ++ BIT_19 = (1 << 19), ++ BIT_20 = (1 << 20), ++ BIT_21 = (1 << 21), ++ BIT_22 = (1 << 22), ++ BIT_23 = (1 << 23), ++ BIT_24 = (1 << 24), ++ BIT_25 = (1 << 25), ++ BIT_26 = (1 << 26), ++ BIT_27 = (1 << 27), ++ BIT_28 
= (1 << 28), ++ BIT_29 = (1 << 29), ++ BIT_30 = (1 << 30), ++ BIT_31 = (1 << 31) ++}; ++ ++/* Phy Fuse Dout */ ++#define R8125_PHY_FUSE_DOUT_NUM (32) ++#define R8125_MAX_PHY_FUSE_DOUT_NUM R8125_PHY_FUSE_DOUT_NUM ++ ++#define RTL8125_CP_NUM 4 ++#define RTL8125_MAX_SUPPORT_CP_LEN 110 ++ ++enum rtl8125_cp_status { ++ rtl8125_cp_normal = 0, ++ rtl8125_cp_short, ++ rtl8125_cp_open, ++ rtl8125_cp_mismatch, ++ rtl8125_cp_unknown ++}; ++ ++enum efuse { ++ EFUSE_NOT_SUPPORT = 0, ++ EFUSE_SUPPORT_V1, ++ EFUSE_SUPPORT_V2, ++ EFUSE_SUPPORT_V3, ++ EFUSE_SUPPORT_V4, ++}; ++#define RsvdMask 0x3fffc000 ++#define RsvdMaskV3 0x3fff8000 ++#define RsvdMaskV4 RsvdMaskV3 ++ ++struct TxDesc { ++ u32 opts1; ++ u32 opts2; ++ u64 addr; ++ u32 reserved0; ++ u32 reserved1; ++ u32 reserved2; ++ u32 reserved3; ++}; ++ ++struct RxDesc { ++ u32 opts1; ++ u32 opts2; ++ u64 addr; ++}; ++ ++struct RxDescV3 { ++ union { ++ struct { ++ u32 rsv1; ++ u32 rsv2; ++ } RxDescDDWord1; ++ }; ++ ++ union { ++ struct { ++ u32 RSSResult; ++ u16 HeaderBufferLen; ++ u16 HeaderInfo; ++ } RxDescNormalDDWord2; ++ ++ struct { ++ u32 rsv5; ++ u32 rsv6; ++ } RxDescDDWord2; ++ }; ++ ++ union { ++ u64 addr; ++ ++ struct { ++ u32 TimeStampLow; ++ u32 TimeStampHigh; ++ } RxDescTimeStamp; ++ ++ struct { ++ u32 rsv8; ++ u32 rsv9; ++ } RxDescDDWord3; ++ }; ++ ++ union { ++ struct { ++ u32 opts2; ++ u32 opts1; ++ } RxDescNormalDDWord4; ++ ++ struct { ++ u16 TimeStampHHigh; ++ u16 rsv11; ++ u32 opts1; ++ } RxDescPTPDDWord4; ++ }; ++}; ++ ++struct RxDescV4 { ++ union { ++ u64 addr; ++ ++ struct { ++ u32 RSSInfo; ++ u32 RSSResult; ++ } RxDescNormalDDWord1; ++ }; ++ ++ struct { ++ u32 opts2; ++ u32 opts1; ++ } RxDescNormalDDWord2; ++}; ++ ++enum rxdesc_type { ++ RXDESC_TYPE_NORMAL=0, ++ RXDESC_TYPE_NEXT, ++ RXDESC_TYPE_PTP, ++ RXDESC_TYPE_MAX ++}; ++ ++//Rx Desc Type ++enum rx_desc_ring_type { ++ RX_DESC_RING_TYPE_UNKNOWN=0, ++ RX_DESC_RING_TYPE_1, ++ RX_DESC_RING_TYPE_2, ++ RX_DESC_RING_TYPE_3, ++ RX_DESC_RING_TYPE_4, ++ RX_DESC_RING_TYPE_MAX ++}; ++ ++enum rx_desc_len { ++ RX_DESC_LEN_TYPE_1 = (sizeof(struct RxDesc)), ++ RX_DESC_LEN_TYPE_3 = (sizeof(struct RxDescV3)), ++ RX_DESC_LEN_TYPE_4 = (sizeof(struct RxDescV4)) ++}; ++ ++struct ring_info { ++ struct sk_buff *skb; ++ u32 len; ++ unsigned int bytecount; ++ unsigned short gso_segs; ++ u8 __pad[sizeof(void *) - sizeof(u32)]; ++}; ++ ++struct pci_resource { ++ u8 cmd; ++ u8 cls; ++ u16 io_base_h; ++ u16 io_base_l; ++ u16 mem_base_h; ++ u16 mem_base_l; ++ u8 ilr; ++ u16 resv_0x1c_h; ++ u16 resv_0x1c_l; ++ u16 resv_0x20_h; ++ u16 resv_0x20_l; ++ u16 resv_0x24_h; ++ u16 resv_0x24_l; ++ u16 resv_0x2c_h; ++ u16 resv_0x2c_l; ++ u32 pci_sn_l; ++ u32 pci_sn_h; ++}; ++ ++enum r8125_dash_req_flag { ++ R8125_RCV_REQ_SYS_OK = 0, ++ R8125_RCV_REQ_DASH_OK, ++ R8125_SEND_REQ_HOST_OK, ++ R8125_CMAC_RESET, ++ R8125_CMAC_DISALE_RX_FLAG_MAX, ++ R8125_DASH_REQ_FLAG_MAX ++}; ++ ++enum r8125_flag { ++ R8125_FLAG_DOWN = 0, ++ R8125_FLAG_TASK_RESET_PENDING, ++ R8125_FLAG_TASK_ESD_CHECK_PENDING, ++ R8125_FLAG_TASK_LINKCHG_CHECK_PENDING, ++ R8125_FLAG_TASK_LINK_CHECK_PENDING, ++ R8125_FLAG_TASK_DASH_CHECK_PENDING, ++ R8125_FLAG_MAX ++}; ++ ++enum r8125_sysfs_flag { ++ R8125_SYSFS_RTL_ADV = 0, ++ R8125_SYSFS_FLAG_MAX ++}; ++ ++struct rtl8125_tx_ring { ++ void* priv; ++ struct net_device *netdev; ++ u32 index; ++ u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. 
*/ ++ u32 dirty_tx; ++ u32 num_tx_desc; /* Number of Tx descriptor registers */ ++ struct TxDesc *TxDescArray; /* 256-aligned Tx descriptor ring */ ++ dma_addr_t TxPhyAddr; ++ u32 TxDescAllocSize; ++ struct ring_info tx_skb[MAX_NUM_TX_DESC]; /* Tx data buffers */ ++ ++ u32 NextHwDesCloPtr; ++ u32 BeginHwDesCloPtr; ++ ++ u16 hw_clo_ptr_reg; ++ u16 sw_tail_ptr_reg; ++ ++ u16 tdsar_reg; /* Transmit Descriptor Start Address */ ++}; ++ ++struct rtl8125_rx_buffer { ++ struct page *page; ++ u32 page_offset; ++ dma_addr_t dma; ++ void* data; ++ struct sk_buff *skb; ++}; ++ ++struct rtl8125_rx_ring { ++ void* priv; ++ struct net_device *netdev; ++ u32 index; ++ u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */ ++ u32 dirty_rx; ++ u32 num_rx_desc; /* Number of Rx descriptor registers */ ++ struct RxDesc *RxDescArray; /* 256-aligned Rx descriptor ring */ ++ u32 RxDescAllocSize; ++ u64 RxDescPhyAddr[MAX_NUM_RX_DESC]; /* Rx desc physical address*/ ++ dma_addr_t RxPhyAddr; ++#ifdef ENABLE_PAGE_REUSE ++ struct rtl8125_rx_buffer rx_buffer[MAX_NUM_RX_DESC]; ++ u16 rx_offset; ++#else ++ struct sk_buff *Rx_skbuff[MAX_NUM_RX_DESC]; /* Rx data buffers */ ++#endif //ENABLE_PAGE_REUSE ++ ++ u16 rdsar_reg; /* Receive Descriptor Start Address */ ++}; ++ ++struct r8125_napi { ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) ++ struct napi_struct napi; ++#endif ++#endif ++ void* priv; ++ int index; ++}; ++ ++struct r8125_irq { ++ irq_handler_t handler; ++ unsigned int vector; ++ u8 requested; ++ char name[IFNAMSIZ + 10]; ++}; ++ ++#pragma pack(1) ++struct rtl8125_regs { ++ //00 ++ u8 mac_id[6]; ++ u16 reg_06; ++ u8 mar[8]; ++ //10 ++ u64 dtccr; ++ u16 ledsel0; ++ u16 legreg; ++ u32 tctr3; ++ //20 ++ u32 txq0_dsc_st_addr_0; ++ u32 txq0_dsc_st_addr_2; ++ u64 reg_28; ++ //30 ++ u16 rit; ++ u16 ritc; ++ u16 reg_34; ++ u8 reg_36; ++ u8 command; ++ u32 imr0; ++ u32 isr0; ++ //40 ++ u32 tcr; ++ u32 rcr; ++ u32 tctr0; ++ u32 tctr1; ++ //50 ++ u8 cr93c46; ++ u8 config0; ++ u8 config1; ++ u8 config2; ++ u8 config3; ++ u8 config4; ++ u8 config5; ++ u8 tdfnr; ++ u32 timer_int0; ++ u32 timer_int1; ++ //60 ++ u32 gphy_mdcmdio; ++ u32 csidr; ++ u32 csiar; ++ u16 phy_status; ++ u8 config6; ++ u8 pmch; ++ //70 ++ u32 eridr; ++ u32 eriar; ++ u16 config7; ++ u16 reg_7a; ++ u32 ephy_rxerr_cnt; ++ //80 ++ u32 ephy_mdcmdio; ++ u16 ledsel2; ++ u16 ledsel1; ++ u32 tctr2; ++ u32 timer_int2; ++ //90 ++ u8 tppoll0; ++ u8 reg_91; ++ u16 reg_92; ++ u16 led_feature; ++ u16 ledsel3; ++ u16 eee_led_config; ++ u16 reg_9a; ++ u32 reg_9c; ++ //a0 ++ u32 reg_a0; ++ u32 reg_a4; ++ u32 reg_a8; ++ u32 reg_ac; ++ //b0 ++ u32 patch_dbg; ++ u32 reg_b4; ++ u32 gphy_ocp; ++ u32 reg_bc; ++ //c0 ++ u32 reg_c0; ++ u32 reg_c4; ++ u32 reg_c8; ++ u16 otp_cmd; ++ u16 otp_pg_config; ++ //d0 ++ u16 phy_pwr; ++ u8 twsi_ctrl; ++ u8 oob_ctrl; ++ u16 mac_dbgo; ++ u16 mac_dbg; ++ u16 reg_d8; ++ u16 rms; ++ u32 efuse_data; ++ //e0 ++ u16 cplus_cmd; ++ u16 reg_e2; ++ u32 rxq0_dsc_st_addr_0; ++ u32 rxq0_dsc_st_addr_2; ++ u16 reg_ec; ++ u16 tx10midle_cnt; ++ //f0 ++ u16 misc0; ++ u16 misc1; ++ u32 timer_int3; ++ u32 cmac_ib; ++ u16 reg_fc; ++ u16 sw_rst; ++}; ++#pragma pack() ++ ++struct rtl8125_regs_save { ++ union { ++ u8 mac_io[R8125_MAC_REGS_SIZE]; ++ ++ struct rtl8125_regs mac_reg; ++ }; ++ u16 pcie_phy[R8125_EPHY_REGS_SIZE/2]; ++ u16 eth_phy[R8125_PHY_REGS_SIZE/2]; ++ u32 eri_reg[R8125_ERI_REGS_SIZE/4]; ++ u32 pci_reg[R8125_PCI_REGS_SIZE/4]; ++ u16 sw_tail_ptr_reg[R8125_MAX_TX_QUEUES]; ++ u16 
hw_clo_ptr_reg[R8125_MAX_TX_QUEUES]; ++ ++ //ktime_t begin_ktime; ++ //ktime_t end_ktime; ++ //u64 duration_ns; ++ ++ u16 sw0_tail_ptr; ++ u16 next_hwq0_clo_ptr; ++ u16 sw1_tail_ptr; ++ u16 next_hwq1_clo_ptr; ++ ++ u16 int_miti_rxq0; ++ u16 int_miti_txq0; ++ u16 int_miti_rxq1; ++ u16 int_miti_txq1; ++ u8 int_config; ++ u32 imr_new; ++ u32 isr_new; ++ ++ u8 tdu_status; ++ u16 rdu_status; ++ ++ u16 tc_mode; ++ ++ u32 txq1_dsc_st_addr_0; ++ u32 txq1_dsc_st_addr_2; ++ ++ u32 pla_tx_q0_idle_credit; ++ u32 pla_tx_q1_idle_credit; ++ ++ u32 rxq1_dsc_st_addr_0; ++ u32 rxq1_dsc_st_addr_2; ++ ++ u32 rss_ctrl; ++ u8 rss_key[RTL8125_RSS_KEY_SIZE]; ++ u8 rss_i_table[RTL8125_MAX_INDIRECTION_TABLE_ENTRIES]; ++ u16 rss_queue_num_sel_r; ++}; ++ ++struct rtl8125_counters { ++ /* legacy */ ++ u64 tx_packets; ++ u64 rx_packets; ++ u64 tx_errors; ++ u32 rx_errors; ++ u16 rx_missed; ++ u16 align_errors; ++ u32 tx_one_collision; ++ u32 tx_multi_collision; ++ u64 rx_unicast; ++ u64 rx_broadcast; ++ u32 rx_multicast; ++ u16 tx_aborted; ++ u16 tx_underrun; ++ ++ /* extended */ ++ u64 tx_octets; ++ u64 rx_octets; ++ u64 rx_multicast64; ++ u64 tx_unicast64; ++ u64 tx_broadcast64; ++ u64 tx_multicast64; ++ u32 tx_pause_on; ++ u32 tx_pause_off; ++ u32 tx_pause_all; ++ u32 tx_deferred; ++ u32 tx_late_collision; ++ u32 tx_all_collision; ++ u32 tx_aborted32; ++ u32 align_errors32; ++ u32 rx_frame_too_long; ++ u32 rx_runt; ++ u32 rx_pause_on; ++ u32 rx_pause_off; ++ u32 rx_pause_all; ++ u32 rx_unknown_opcode; ++ u32 rx_mac_error; ++ u32 tx_underrun32; ++ u32 rx_mac_missed; ++ u32 rx_tcam_dropped; ++ u32 tdu; ++ u32 rdu; ++}; ++ ++/* Flow Control Settings */ ++enum rtl8125_fc_mode { ++ rtl8125_fc_none = 0, ++ rtl8125_fc_rx_pause, ++ rtl8125_fc_tx_pause, ++ rtl8125_fc_full, ++ rtl8125_fc_default ++}; ++ ++enum rtl8125_state_t { ++ __RTL8125_TESTING = 0, ++ __RTL8125_RESETTING, ++ __RTL8125_DOWN, ++ __RTL8125_PTP_TX_IN_PROGRESS, ++}; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ++struct ethtool_eee { ++ __u32 cmd; ++ __u32 supported; ++ __u32 advertised; ++ __u32 lp_advertised; ++ __u32 eee_active; ++ __u32 eee_enabled; ++ __u32 tx_lpi_enabled; ++ __u32 tx_lpi_timer; ++ __u32 reserved[2]; ++}; ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) */ ++ ++struct rtl8125_private { ++ void __iomem *mmio_addr; /* memory map physical address */ ++ struct pci_dev *pci_dev; /* Index of PCI device */ ++ struct net_device *dev; ++ struct r8125_napi r8125napi[R8125_MAX_MSIX_VEC]; ++ struct r8125_irq irq_tbl[R8125_MAX_MSIX_VEC]; ++ unsigned int irq_nvecs; ++ unsigned int max_irq_nvecs; ++ unsigned int min_irq_nvecs; ++ unsigned int hw_supp_irq_nvecs; ++ //struct msix_entry msix_entries[R8125_MAX_MSIX_VEC]; ++ struct net_device_stats stats; /* statistics of net device */ ++ unsigned long state; ++ u8 flags; ++ ++ u32 msg_enable; ++ u32 tx_tcp_csum_cmd; ++ u32 tx_udp_csum_cmd; ++ u32 tx_ip_csum_cmd; ++ u32 tx_ipv6_csum_cmd; ++ int max_jumbo_frame_size; ++ int chipset; ++ u32 mcfg; ++ //u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */ ++ //u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. 
*/ ++ //u32 dirty_rx; ++ //u32 dirty_tx; ++ //struct TxDesc *TxDescArray; /* 256-aligned Tx descriptor ring */ ++ //struct RxDesc *RxDescArray; /* 256-aligned Rx descriptor ring */ ++ //dma_addr_t TxPhyAddr; ++ //dma_addr_t RxPhyAddr; ++ //struct sk_buff *Rx_skbuff[MAX_NUM_RX_DESC]; /* Rx data buffers */ ++ //struct ring_info tx_skb[MAX_NUM_TX_DESC]; /* Tx data buffers */ ++ unsigned rx_buf_sz; ++#ifdef ENABLE_PAGE_REUSE ++ unsigned rx_buf_page_order; ++ unsigned rx_buf_page_size; ++ u32 page_reuse_fail_cnt; ++#endif //ENABLE_PAGE_REUSE ++ u16 HwSuppNumTxQueues; ++ u16 HwSuppNumRxQueues; ++ unsigned int num_tx_rings; ++ unsigned int num_rx_rings; ++ struct rtl8125_tx_ring tx_ring[R8125_MAX_TX_QUEUES]; ++ struct rtl8125_rx_ring rx_ring[R8125_MAX_RX_QUEUES]; ++#ifdef ENABLE_LIB_SUPPORT ++ struct blocking_notifier_head lib_nh; ++ struct rtl8125_ring lib_tx_ring[R8125_MAX_TX_QUEUES]; ++ struct rtl8125_ring lib_rx_ring[R8125_MAX_RX_QUEUES]; ++#endif ++ //struct timer_list esd_timer; ++ //struct timer_list link_timer; ++ struct pci_resource pci_cfg_space; ++ unsigned int esd_flag; ++ unsigned int pci_cfg_is_read; ++ unsigned int rtl8125_rx_config; ++ u16 rms; ++ u16 cp_cmd; ++ u32 intr_mask; ++ u32 intr_l2_mask; ++ u32 timer_intr_mask; ++ u16 isr_reg[R8125_MAX_MSIX_VEC]; ++ u16 imr_reg[R8125_MAX_MSIX_VEC]; ++ int phy_auto_nego_reg; ++ int phy_1000_ctrl_reg; ++ int phy_2500_ctrl_reg; ++ u8 org_mac_addr[NODE_ADDRESS_SIZE]; ++ struct rtl8125_counters *tally_vaddr; ++ dma_addr_t tally_paddr; ++ ++#ifdef CONFIG_R8125_VLAN ++ struct vlan_group *vlgrp; ++#endif ++ u8 wol_enabled; ++ u32 wol_opts; ++ u8 efuse_ver; ++ u8 eeprom_type; ++ u8 autoneg; ++ u8 duplex; ++ u32 speed; ++ u64 advertising; ++ enum rtl8125_fc_mode fcpause; ++ u32 HwSuppMaxPhyLinkSpeed; ++ u16 eeprom_len; ++ u16 cur_page; ++ u32 bios_setting; ++ ++ int (*set_speed)(struct net_device *, u8 autoneg, u32 speed, u8 duplex, u64 adv); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ void (*get_settings)(struct net_device *, struct ethtool_cmd *); ++#else ++ void (*get_settings)(struct net_device *, struct ethtool_link_ksettings *); ++#endif ++ void (*phy_reset_enable)(struct net_device *); ++ unsigned int (*phy_reset_pending)(struct net_device *); ++ unsigned int (*link_ok)(struct net_device *); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++ struct work_struct reset_task; ++ struct work_struct esd_task; ++ struct work_struct linkchg_task; ++ struct work_struct link_task; ++ struct work_struct dash_task; ++#else ++ struct delayed_work reset_task; ++ struct delayed_work esd_task; ++ struct delayed_work linkchg_task; ++ struct delayed_work link_task; ++ struct delayed_work dash_task; ++#endif ++ DECLARE_BITMAP(task_flags, R8125_FLAG_MAX); ++ unsigned features; ++ ++ u8 org_pci_offset_99; ++ u8 org_pci_offset_180; ++ u8 issue_offset_99_event; ++ ++ u8 org_pci_offset_80; ++ u8 org_pci_offset_81; ++ u8 use_timer_interrupt; ++ ++ u32 keep_intr_cnt; ++ ++ u8 HwIcVerUnknown; ++ u8 NotWrRamCodeToMicroP; ++ u8 NotWrMcuPatchCode; ++ u8 HwHasWrRamCodeToMicroP; ++ ++ u16 sw_ram_code_ver; ++ u16 hw_ram_code_ver; ++ ++ u8 rtk_enable_diag; ++ ++ u8 ShortPacketSwChecksum; ++ ++ u8 UseSwPaddingShortPkt; ++ ++ u8 RequireAdcBiasPatch; ++ u16 AdcBiasPatchIoffset; ++ ++ u8 RequireAdjustUpsTxLinkPulseTiming; ++ u16 SwrCnt1msIni; ++ ++ u8 HwSuppNowIsOobVer; ++ ++ u8 RequiredSecLanDonglePatch; ++ ++ u8 RequiredPfmPatch; ++ ++ u8 RequirePhyMdiSwapPatch; ++ ++ u8 RequireLSOPatch; ++ ++ u32 HwFiberModeVer; ++ u32 HwFiberStat; ++ u8 HwSwitchMdiToFiber; ++ ++ 
u16 BackupLedSel[4]; ++ ++ u8 HwSuppMagicPktVer; ++ ++ u8 HwSuppLinkChgWakeUpVer; ++ ++ u8 HwSuppCheckPhyDisableModeVer; ++ ++ u8 random_mac; ++ ++ u16 phy_reg_aner; ++ u16 phy_reg_anlpar; ++ u16 phy_reg_gbsr; ++ u16 phy_reg_status_2500; ++ ++ u32 HwPcieSNOffset; ++ ++ u8 HwSuppEsdVer; ++ u8 TestPhyOcpReg; ++ u16 BackupPhyFuseDout[R8125_MAX_PHY_FUSE_DOUT_NUM]; ++ ++ u32 MaxTxDescPtrMask; ++ u8 HwSuppTxNoCloseVer; ++ u8 EnableTxNoClose; ++ ++ u8 HwSuppIsrVer; ++ u8 HwCurrIsrVer; ++ ++ u8 HwSuppIntMitiVer; ++ ++ u8 HwSuppExtendTallyCounterVer; ++ ++ u8 check_keep_link_speed; ++ u8 resume_not_chg_speed; ++ ++ u8 HwSuppD0SpeedUpVer; ++ u8 D0SpeedUpSpeed; ++ ++ u8 ring_lib_enabled; ++ ++ const char *fw_name; ++ struct rtl8125_fw *rtl_fw; ++ u32 ocp_base; ++ ++ //Dash+++++++++++++++++ ++ u8 HwSuppDashVer; ++ u8 DASH; ++ u8 HwPkgDet; ++ u8 HwSuppOcpChannelVer; ++ u32 DashFirmwareVersion; ++ u32 SizeOfSendToFwBuffer; ++ u32 SizeOfRecvFromFwBuffer; ++ u8 AllowAccessDashOcp; ++ DECLARE_BITMAP(dash_req_flags, R8125_DASH_REQ_FLAG_MAX); ++ ++#ifdef ENABLE_DASH_SUPPORT ++ u16 AfterRecvFromFwBufLen; ++ u8 AfterRecvFromFwBuf[RECV_FROM_FW_BUF_SIZE]; ++ u32 RecvFromFwBufErrCnt; ++ u16 AfterSendToFwBufLen; ++ u8 AfterSendToFwBuf[SEND_TO_FW_BUF_SIZE]; ++ u16 SendToFwBufferLen; ++ ++ u8 OobReq; ++ u8 OobAck; ++ u32 OobReqComplete; ++ u32 OobAckComplete; ++ ++ u8 SendingToFw; ++ ++ u32 RecvFromDashFwCnt; ++ ++ u8 DashReqRegValue; ++ ++ //Dash----------------- ++#endif //ENABLE_DASH_SUPPORT ++ ++ //Realwow++++++++++++++ ++ u8 HwSuppKCPOffloadVer; ++ ++ u8 EnableDhcpTimeoutWake; ++ u8 EnableTeredoOffload; ++ u8 EnableKCPOffload; ++#ifdef ENABLE_REALWOW_SUPPORT ++ u32 DhcpTimeout; ++ MP_KCP_INFO MpKCPInfo; ++ //Realwow-------------- ++#endif //ENABLE_REALWOW_SUPPORT ++ ++ struct ethtool_keee eee; ++ ++#ifdef ENABLE_R8125_PROCFS ++ //Procfs support ++ struct proc_dir_entry *proc_dir; ++ struct proc_dir_entry *proc_dir_debug; ++ struct proc_dir_entry *proc_dir_test; ++#endif ++#ifdef ENABLE_R8125_SYSFS ++ //sysfs support ++ DECLARE_BITMAP(sysfs_flag, R8125_SYSFS_FLAG_MAX); ++ u32 testmode; ++#endif ++ u8 HwSuppRxDescType; ++ u8 InitRxDescType; ++ u16 RxDescLength; //V1 16 Byte V2 32 Bytes ++ ++ spinlock_t phy_lock; ++ ++ u8 HwSuppPtpVer; ++ u8 EnablePtp; ++ u8 ptp_master_mode; ++#ifdef ENABLE_PTP_SUPPORT ++ u32 tx_hwtstamp_timeouts; ++ u32 tx_hwtstamp_skipped; ++ struct work_struct ptp_tx_work; ++ struct sk_buff *ptp_tx_skb; ++ struct hwtstamp_config hwtstamp_config; ++ unsigned long ptp_tx_start; ++ struct ptp_clock_info ptp_clock_info; ++ struct ptp_clock *ptp_clock; ++ u8 syncE_en; ++ u8 pps_enable; ++ struct hrtimer pps_timer; ++#endif ++ ++ u8 HwSuppRssVer; ++ u8 EnableRss; ++ u16 HwSuppIndirTblEntries; ++#ifdef ENABLE_RSS_SUPPORT ++ u32 rss_flags; ++ /* Receive Side Scaling settings */ ++ u8 rss_key[RTL8125_RSS_KEY_SIZE]; ++ u8 rss_indir_tbl[RTL8125_MAX_INDIRECTION_TABLE_ENTRIES]; ++ u32 rss_options; ++#endif ++ ++ u8 HwSuppMacMcuVer; ++ u16 MacMcuPageSize; ++ u64 hw_mcu_patch_code_ver; ++ u64 bin_mcu_patch_code_ver; ++ ++ u8 HwSuppTcamVer; ++ ++ u16 TcamNotValidReg; ++ u16 TcamValidReg; ++ u16 TcamMaAddrcOffset; ++ u16 TcamVlanTagOffset; ++}; ++ ++#ifdef ENABLE_LIB_SUPPORT ++static inline unsigned int ++rtl8125_num_lib_tx_rings(struct rtl8125_private *tp) ++{ ++ int count, i; ++ ++ for (count = 0, i = tp->num_tx_rings; i < tp->HwSuppNumTxQueues; i++) ++ if(tp->lib_tx_ring[i].enabled) ++ count++; ++ ++ return count; ++} ++ ++static inline unsigned int ++rtl8125_num_lib_rx_rings(struct rtl8125_private *tp) ++{ 
++ int count, i; ++ ++ for (count = 0, i = 0; i < tp->HwSuppNumRxQueues; i++) ++ if(tp->lib_rx_ring[i].enabled) ++ count++; ++ ++ return count; ++} ++ ++#else ++static inline unsigned int ++rtl8125_num_lib_tx_rings(struct rtl8125_private *tp) ++{ ++ return 0; ++} ++ ++static inline unsigned int ++rtl8125_num_lib_rx_rings(struct rtl8125_private *tp) ++{ ++ return 0; ++} ++#endif ++ ++static inline unsigned int ++rtl8125_tot_tx_rings(struct rtl8125_private *tp) ++{ ++ return tp->num_tx_rings + rtl8125_num_lib_tx_rings(tp); ++} ++ ++static inline unsigned int ++rtl8125_tot_rx_rings(struct rtl8125_private *tp) ++{ ++ unsigned int num_lib_rx_rings; ++ ++ num_lib_rx_rings = rtl8125_num_lib_rx_rings(tp); ++ if (num_lib_rx_rings > 0) ++ return num_lib_rx_rings; ++ else ++ return tp->num_rx_rings; ++} ++ ++static inline struct netdev_queue *txring_txq(const struct rtl8125_tx_ring *ring) ++{ ++ return netdev_get_tx_queue(ring->netdev, ring->index); ++} ++ ++enum eetype { ++ EEPROM_TYPE_NONE=0, ++ EEPROM_TYPE_93C46, ++ EEPROM_TYPE_93C56, ++ EEPROM_TWSI ++}; ++ ++enum mcfg { ++ CFG_METHOD_2=2, ++ CFG_METHOD_3, ++ CFG_METHOD_4, ++ CFG_METHOD_5, ++ CFG_METHOD_6, ++ CFG_METHOD_7, ++ CFG_METHOD_8, ++ CFG_METHOD_9, ++ CFG_METHOD_10, ++ CFG_METHOD_11, ++ CFG_METHOD_12, ++ CFG_METHOD_13, ++ CFG_METHOD_DEFAULT, ++ CFG_METHOD_MAX ++}; ++ ++#define LSO_32K 32000 ++#define LSO_64K 64000 ++ ++#define NIC_MIN_PHYS_BUF_COUNT (2) ++#define NIC_MAX_PHYS_BUF_COUNT_LSO_64K (24) ++#define NIC_MAX_PHYS_BUF_COUNT_LSO2 (16*4) ++ ++#define GTTCPHO_SHIFT 18 ++#define GTTCPHO_MAX 0x70U ++#define GTPKTSIZE_MAX 0x3ffffU ++#define TCPHO_SHIFT 18 ++#define TCPHO_MAX 0x3ffU ++#define LSOPKTSIZE_MAX 0xffffU ++#define MSS_MAX 0x07ffu /* MSS value */ ++ ++#define OOB_CMD_RESET 0x00 ++#define OOB_CMD_DRIVER_START 0x05 ++#define OOB_CMD_DRIVER_STOP 0x06 ++#define OOB_CMD_SET_IPMAC 0x41 ++ ++#define WAKEUP_MAGIC_PACKET_NOT_SUPPORT (0) ++#define WAKEUP_MAGIC_PACKET_V1 (1) ++#define WAKEUP_MAGIC_PACKET_V2 (2) ++#define WAKEUP_MAGIC_PACKET_V3 (3) ++ ++//Ram Code Version ++#define NIC_RAMCODE_VERSION_CFG_METHOD_2 (0x0b11) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_3 (0x0b33) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_4 (0x0b17) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_5 (0x0b99) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_8 (0x0013) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_9 (0x0001) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_10 (0x0027) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_11 (0x0031) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_12 (0x0010) ++ ++//hwoptimize ++#define HW_PATCH_SOC_LAN (BIT_0) ++#define HW_PATCH_SAMSUNG_LAN_DONGLE (BIT_2) ++ ++static const u16 other_q_intr_mask = (RxOK1 | RxDU1); ++ ++#define HW_PHY_STATUS_INI 1 ++#define HW_PHY_STATUS_EXT_INI 2 ++#define HW_PHY_STATUS_LAN_ON 3 ++ ++void rtl8125_mdio_write(struct rtl8125_private *tp, u16 RegAddr, u16 value); ++void rtl8125_mdio_prot_write(struct rtl8125_private *tp, u32 RegAddr, u32 value); ++void rtl8125_mdio_prot_direct_write_phy_ocp(struct rtl8125_private *tp, u32 RegAddr, u32 value); ++u32 rtl8125_mdio_read(struct rtl8125_private *tp, u16 RegAddr); ++u32 rtl8125_mdio_prot_read(struct rtl8125_private *tp, u32 RegAddr); ++u32 rtl8125_mdio_prot_direct_read_phy_ocp(struct rtl8125_private *tp, u32 RegAddr); ++void rtl8125_ephy_write(struct rtl8125_private *tp, int RegAddr, int value); ++void rtl8125_mac_ocp_write(struct rtl8125_private *tp, u16 reg_addr, u16 value); ++u16 rtl8125_mac_ocp_read(struct rtl8125_private *tp, u16 reg_addr); ++void rtl8125_clear_eth_phy_bit(struct 
rtl8125_private *tp, u8 addr, u16 mask); ++void rtl8125_set_eth_phy_bit(struct rtl8125_private *tp, u8 addr, u16 mask); ++void rtl8125_ocp_write(struct rtl8125_private *tp, u16 addr, u8 len, u32 data); ++void rtl8125_init_ring_indexes(struct rtl8125_private *tp); ++void rtl8125_oob_mutex_lock(struct rtl8125_private *tp); ++u32 rtl8125_ocp_read(struct rtl8125_private *tp, u16 addr, u8 len); ++u32 rtl8125_ocp_read_with_oob_base_address(struct rtl8125_private *tp, u16 addr, u8 len, u32 base_address); ++u32 rtl8125_ocp_write_with_oob_base_address(struct rtl8125_private *tp, u16 addr, u8 len, u32 value, u32 base_address); ++u32 rtl8125_eri_read(struct rtl8125_private *tp, int addr, int len, int type); ++u32 rtl8125_eri_read_with_oob_base_address(struct rtl8125_private *tp, int addr, int len, int type, u32 base_address); ++int rtl8125_eri_write(struct rtl8125_private *tp, int addr, int len, u32 value, int type); ++int rtl8125_eri_write_with_oob_base_address(struct rtl8125_private *tp, int addr, int len, u32 value, int type, u32 base_address); ++u16 rtl8125_ephy_read(struct rtl8125_private *tp, int RegAddr); ++void rtl8125_wait_txrx_fifo_empty(struct net_device *dev); ++void rtl8125_enable_now_is_oob(struct rtl8125_private *tp); ++void rtl8125_disable_now_is_oob(struct rtl8125_private *tp); ++void rtl8125_oob_mutex_unlock(struct rtl8125_private *tp); ++void rtl8125_dash2_disable_tx(struct rtl8125_private *tp); ++void rtl8125_dash2_enable_tx(struct rtl8125_private *tp); ++void rtl8125_dash2_disable_rx(struct rtl8125_private *tp); ++void rtl8125_dash2_enable_rx(struct rtl8125_private *tp); ++void rtl8125_hw_disable_mac_mcu_bps(struct net_device *dev); ++void rtl8125_mark_to_asic(struct rtl8125_private *tp, struct RxDesc *desc, u32 rx_buf_sz); ++void rtl8125_mark_as_last_descriptor(struct rtl8125_private *tp, struct RxDesc *desc); ++ ++static inline void ++rtl8125_make_unusable_by_asic(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ ((struct RxDescV3 *)desc)->addr = RTL8125_MAGIC_NUMBER; ++ ((struct RxDescV3 *)desc)->RxDescNormalDDWord4.opts1 &= ~cpu_to_le32(DescOwn | RsvdMaskV3); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ ((struct RxDescV4 *)desc)->addr = RTL8125_MAGIC_NUMBER; ++ ((struct RxDescV4 *)desc)->RxDescNormalDDWord2.opts1 &= ~cpu_to_le32(DescOwn | RsvdMaskV4); ++ break; ++ default: ++ desc->addr = RTL8125_MAGIC_NUMBER; ++ desc->opts1 &= ~cpu_to_le32(DescOwn | RsvdMask); ++ break; ++ } ++} ++ ++static inline struct RxDesc* ++rtl8125_get_rxdesc(struct rtl8125_private *tp, struct RxDesc *RxDescBase, u32 const cur_rx) ++{ ++ return (struct RxDesc*)((u8*)RxDescBase + (cur_rx * tp->RxDescLength)); ++} ++ ++static inline void ++rtl8125_disable_hw_interrupt_v2(struct rtl8125_private *tp, ++ u32 message_id) ++{ ++ RTL_W32(tp, IMR_V2_CLEAR_REG_8125, BIT(message_id)); ++} ++ ++static inline void ++rtl8125_enable_hw_interrupt_v2(struct rtl8125_private *tp, u32 message_id) ++{ ++ RTL_W32(tp, IMR_V2_SET_REG_8125, BIT(message_id)); ++} ++ ++int rtl8125_open(struct net_device *dev); ++int rtl8125_close(struct net_device *dev); ++void rtl8125_hw_config(struct net_device *dev); ++void rtl8125_hw_set_timer_int(struct rtl8125_private *tp, u32 message_id, u8 timer_intmiti_val); ++void rtl8125_set_rx_q_num(struct rtl8125_private *tp, unsigned int num_rx_queues); ++void rtl8125_set_tx_q_num(struct rtl8125_private *tp, unsigned int num_tx_queues); ++void rtl8125_enable_mcu(struct rtl8125_private *tp, bool enable); ++void 
rtl8125_hw_start(struct net_device *dev); ++void rtl8125_hw_reset(struct net_device *dev); ++void rtl8125_tx_clear(struct rtl8125_private *tp); ++void rtl8125_rx_clear(struct rtl8125_private *tp); ++int rtl8125_init_ring(struct net_device *dev); ++void rtl8125_hw_set_rx_packet_filter(struct net_device *dev); ++void rtl8125_enable_hw_linkchg_interrupt(struct rtl8125_private *tp); ++int rtl8125_dump_tally_counter(struct rtl8125_private *tp, dma_addr_t paddr); ++void rtl8125_enable_napi(struct rtl8125_private *tp); ++void _rtl8125_wait_for_quiescence(struct net_device *dev); ++ ++void rtl8125_clear_mac_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask); ++void rtl8125_set_mac_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask); ++ ++void rtl8125_mdio_direct_write_phy_ocp(struct rtl8125_private *tp, u16 RegAddr,u16 value); ++u32 rtl8125_mdio_direct_read_phy_ocp(struct rtl8125_private *tp, u16 RegAddr); ++void rtl8125_clear_and_set_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 clearmask, u16 setmask); ++void rtl8125_clear_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask); ++void rtl8125_set_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask); ++ ++#ifndef ENABLE_LIB_SUPPORT ++static inline void rtl8125_lib_reset_prepare(struct rtl8125_private *tp) { } ++static inline void rtl8125_lib_reset_complete(struct rtl8125_private *tp) { } ++#endif ++ ++#define HW_SUPPORT_CHECK_PHY_DISABLE_MODE(_M) ((_M)->HwSuppCheckPhyDisableModeVer > 0) ++#define HW_HAS_WRITE_PHY_MCU_RAM_CODE(_M) (((_M)->HwHasWrRamCodeToMicroP == TRUE) ? 1 : 0) ++#define HW_SUPPORT_D0_SPEED_UP(_M) ((_M)->HwSuppD0SpeedUpVer > 0) ++#define HW_SUPPORT_MAC_MCU(_M) ((_M)->HwSuppMacMcuVer > 0) ++#define HW_SUPPORT_TCAM(_M) ((_M)->HwSuppTcamVer > 0) ++ ++#define HW_SUPP_PHY_LINK_SPEED_GIGA(_M) ((_M)->HwSuppMaxPhyLinkSpeed >= 1000) ++#define HW_SUPP_PHY_LINK_SPEED_2500M(_M) ((_M)->HwSuppMaxPhyLinkSpeed >= 2500) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34) ++#define netdev_mc_count(dev) ((dev)->mc_count) ++#define netdev_mc_empty(dev) (netdev_mc_count(dev) == 0) ++#define netdev_for_each_mc_addr(mclist, dev) \ ++ for (mclist = dev->mc_list; mclist; mclist = mclist->next) ++#endif ++ ++#endif /* __R8125_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_dash.c b/drivers/net/ethernet/realtek/r8125_dash.c +new file mode 100755 +index 000000000000..a71c19aea412 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_dash.c +@@ -0,0 +1,573 @@ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 
2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "r8125.h" ++#include "r8125_dash.h" ++#include "rtl_eeprom.h" ++ ++static void r8125_dash_set_ipc2_reg_bit(struct rtl8125_private *tp, unsigned long reg, u32 mask) ++{ ++ RTL_DASH_IPC2_W32(tp, reg, RTL_DASH_IPC2_R32(tp, reg) | mask); ++} ++ ++/* ++static void r8125_dash_clear_ipc2_reg_bit(struct rtl8125_private *tp, unsigned long reg, u32 mask) ++{ ++ RTL_DASH_IPC2_W32(tp, reg, RTL_DASH_IPC2_R32(tp, reg) & ~mask); ++} ++*/ ++ ++static void r8125_write_ipc2_tx_ack(struct rtl8125_private *tp) ++{ ++ if (!tp->DASH) ++ return; ++ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ r8125_dash_set_ipc2_reg_bit(tp, IPC2_TX_SET_REG, IPC2_TX_ACK_BIT); ++} ++ ++static void r8125_write_ipc2_tx_polling(struct rtl8125_private *tp) ++{ ++ if (!tp->DASH) ++ return; ++ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ r8125_dash_set_ipc2_reg_bit(tp, IPC2_TX_SET_REG, IPC2_TX_SEND_BIT); ++} ++ ++static unsigned long ++r8125_get_ipc2_rx_buffer(struct rtl8125_private *tp) ++{ ++ if (HW_DASH_SUPPORT_IPC2(tp)) ++ return IPC2_RX_BUFFER; ++ else ++ return 0; ++} ++ ++static u8 rtl8125_copy_from_ipc2(struct rtl8125_private *tp, u8 *dest, u32 len) ++{ ++ unsigned long const data_reg = r8125_get_ipc2_rx_buffer(tp); ++ u32 offset = 0; ++ u32 *pDword; ++ u8 *pByte; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ if (!dest) ++ goto exit; ++ ++ if (len == 0) ++ goto exit; ++ ++ pDword = (u32*)dest; ++ while (len > 3 && offset < (IPC2_BUFFER_LENGTH - 4)) { ++ *pDword++ = RTL_DASH_IPC2_R32(tp, data_reg + offset); ++ ++ len -= 4; ++ offset += 4; ++ } ++ ++ pByte = (u8*)pDword; ++ while (len > 0 && offset < (IPC2_BUFFER_LENGTH - 1)) { ++ *pByte++ = RTL_DASH_IPC2_R8(tp, data_reg + offset); ++ ++ len -= 1; ++ offset += 1; ++ } ++ ++exit: ++ return (len == 0) ? 
TRUE : FALSE; ++} ++ ++static void RecvFromDashFwComplete(struct rtl8125_private *tp) ++{ ++ if (!tp->DASH) ++ return; ++ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ if (tp->DashReqRegValue == DASH_OOB_HDR_TYPE_REQ) { //rok ++ RX_DASH_BUFFER_TYPE_2 rxDashBufferType2 = {0}; ++ u32 dataLen; ++ ++ if (!tp->OobReq) ++ goto exit; ++ ++ /* copy header for check data length */ ++ if (!rtl8125_copy_from_ipc2(tp, ++ (u8*)&rxDashBufferType2, ++ sizeof(rxDashBufferType2))) ++ goto exit; ++ ++ dataLen = (u16)rxDashBufferType2.oobhdr.len; ++ ++ tp->AfterRecvFromFwBufLen = dataLen + sizeof(OSOOBHdr); ++ if (tp->AfterRecvFromFwBufLen > tp->SizeOfRecvFromFwBuffer) { ++ tp->AfterRecvFromFwBufLen = tp->SizeOfRecvFromFwBuffer; ++ tp->RecvFromFwBufErrCnt++; ++ } ++ ++ /* copy data */ ++ rtl8125_copy_from_ipc2(tp, ++ tp->AfterRecvFromFwBuf, ++ tp->AfterRecvFromFwBufLen); ++ ++ r8125_write_ipc2_tx_ack(tp); ++ ++ tp->OobReqComplete = TRUE; ++ ++ tp->RecvFromDashFwCnt++; ++ } else if (tp->DashReqRegValue == DASH_OOB_HDR_TYPE_ACK) { //rx ack ++ if (!tp->OobAck) ++ goto exit; ++ ++ tp->OobAckComplete = TRUE; ++ ++ tp->RecvFromDashFwCnt++; ++ } ++ ++exit: ++ return; ++} ++ ++static unsigned long r8125_get_ipc2_tx_buffer(struct rtl8125_private *tp) ++{ ++ if (HW_DASH_SUPPORT_IPC2(tp)) ++ return IPC2_TX_BUFFER; ++ else ++ return 0; ++} ++ ++static u32 rtl8125_copy_to_ipc2(struct rtl8125_private *tp, u8 *src, u32 len) ++{ ++ unsigned long const data_reg = r8125_get_ipc2_tx_buffer(tp); ++ u32 offset = 0; ++ u32 *pDword; ++ u8 *pByte; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ if (!src) ++ goto exit; ++ ++ if (len == 0) ++ goto exit; ++ ++ pDword = (u32*)src; ++ while (len > 3 && offset < (IPC2_BUFFER_LENGTH - 4)) { ++ RTL_DASH_IPC2_W32(tp, data_reg + offset, *pDword++); ++ ++ len -= 4; ++ offset += 4; ++ } ++ ++ pByte = (u8*)pDword; ++ while (len > 0 && offset < (IPC2_BUFFER_LENGTH - 1)) { ++ RTL_DASH_IPC2_W8(tp, data_reg + offset, *pByte++); ++ ++ len -= 1; ++ offset += 1; ++ } ++ ++exit: ++ return offset; ++} ++ ++static int SendToDashFw(struct rtl8125_private *tp, u8 *src, u16 len) ++{ ++ POSOOBHdr pOobHdr; ++ int rc = -1; ++ ++ if (!tp->DASH) ++ goto exit; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ if (TRUE == tp->SendingToFw) ++ goto exit; ++ ++ if (!src) ++ goto exit; ++ ++ if (len > tp->SizeOfSendToFwBuffer) ++ goto exit; ++ ++ if (len < sizeof(OSOOBHdr)) ++ goto exit; ++ ++ pOobHdr = (POSOOBHdr)src; ++ if (pOobHdr->hostReqV == DASH_OOB_HDR_TYPE_REQ) { ++ r8125_write_ipc2_tx_ack(tp); ++ rc = 0; ++ goto exit; ++ } ++ ++ tp->SendingToFw = TRUE; ++ ++ rtl8125_copy_to_ipc2(tp, src, len); ++ ++ r8125_write_ipc2_tx_polling(tp); ++ ++ tp->SendingToFw = FALSE; ++ ++ rc = 0; ++ ++exit: ++ if (!rc) ++ tp->AfterSendToFwBufLen = len; ++ else ++ tp->AfterSendToFwBufLen = 0; ++ ++ return rc; ++} ++ ++static u32 rtl8125_get_ipc2_isr(struct rtl8125_private *tp) ++{ ++ u32 isr = 0; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ isr = RTL_DASH_IPC2_R32(tp, IPC2_RX_STATUS_REG); ++ ++ if (isr == ULONG_MAX) ++ isr = 0; ++ ++exit: ++ return isr; ++} ++ ++static void rtl8125_set_ipc2_isr(struct rtl8125_private *tp, u32 val) ++{ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ RTL_DASH_IPC2_W32(tp, IPC2_RX_CLEAR_REG, val); ++} ++ ++void rtl8125_clear_ipc2_isr(struct rtl8125_private *tp) ++{ ++ rtl8125_set_ipc2_isr(tp, rtl8125_get_ipc2_isr(tp)); ++} ++ ++void rtl8125_set_ipc2_soc_imr_bit(struct rtl8125_private *tp, u16 mask) ++{ ++ if (FALSE == 
HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ RTL_W16(tp, RISC_IMR_8125BP, RTL_R16(tp, RISC_IMR_8125BP) | mask); ++} ++ ++void rtl8125_clear_ipc2_soc_imr_bit(struct rtl8125_private *tp, u16 mask) ++{ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ RTL_W16(tp, RISC_IMR_8125BP, RTL_R16(tp, RISC_IMR_8125BP) & ~mask); ++} ++ ++bool rtl8125_check_dash_interrupt(struct rtl8125_private *tp) ++{ ++ bool rc = false; ++ u32 isr; ++ ++ if(!tp->DASH) ++ goto exit; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ isr = rtl8125_get_ipc2_isr(tp); ++ ++ if (isr & (IPC2_RX_ROK_BIT | IPC2_RX_ACK_BIT)) { ++ set_bit(R8125_RCV_REQ_DASH_OK, tp->dash_req_flags); ++ if (isr & IPC2_RX_ROK_BIT) ++ tp->DashReqRegValue = DASH_OOB_HDR_TYPE_REQ; ++ else ++ tp->DashReqRegValue = DASH_OOB_HDR_TYPE_ACK; ++ } ++ ++ rtl8125_set_ipc2_isr(tp, isr); ++ ++exit: ++ return rc; ++} ++ ++void rtl8125_handle_dash_interrupt(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if(!tp->DASH) ++ return; ++ ++ if (test_and_clear_bit(R8125_RCV_REQ_DASH_OK, tp->dash_req_flags)) ++ RecvFromDashFwComplete(tp); ++} ++ ++static int DashIoctlGetRcvFromFwData(struct net_device *dev, struct rtl_dash_ioctl_struct *prtl_dash_usrdata) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 ulInfoLen; ++ void *InformationBuffer; ++ u32 InformationBufferLength; ++ void *pInfo; ++ u8 *pByte; ++ u16 *pWord; ++ u8 *tmpBuf; ++ int ret = -EFAULT; ++ ++ if (!tp->DASH) ++ goto exit; ++ ++ if (!tp->rtk_enable_diag) ++ goto exit; ++ ++ if (tp->AfterRecvFromFwBufLen == 0) ++ goto exit; ++ ++ InformationBufferLength = prtl_dash_usrdata->len; ++ InformationBuffer = prtl_dash_usrdata->data_buffer; ++ ++ ulInfoLen = tp->AfterRecvFromFwBufLen + 2 + 2; ++ if (InformationBufferLength < ulInfoLen) { ++ ret = -EFAULT; ++ goto exit; ++ } ++ ++ if (!(tmpBuf = kmalloc(ulInfoLen, GFP_ATOMIC))) { ++ ret = -ENOMEM; ++ goto exit; ++ } ++ ++ pInfo = (void*) tp->AfterRecvFromFwBuf; ++ pWord = (u16*) tmpBuf; ++ *pWord++ = tp->AfterRecvFromFwBufLen; ++ pByte = (u8*)pWord; ++ memcpy(pByte, pInfo, tp->AfterRecvFromFwBufLen); ++ pWord = (u16*)(pByte + tp->AfterRecvFromFwBufLen); ++ *pWord= tp->DashReqRegValue; ++ tp->AfterRecvFromFwBufLen = 0; ++ if (copy_to_user(InformationBuffer, tmpBuf, ulInfoLen)) { ++ kfree(tmpBuf); ++ ret = -EFAULT; ++ goto exit; ++ } ++ kfree(tmpBuf); ++ ret = 0; ++ ++exit: ++ return ret; ++} ++ ++static int DashIoctlCheckSendBufferToFwComplete(struct net_device *dev, struct rtl_dash_ioctl_struct *prtl_dash_usrdata) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 ulInfoLen; ++ void *InformationBuffer; ++ u32 InformationBufferLength; ++ u16 *pWord; ++ u8 *tmpBuf; ++ int ret = -EFAULT; ++ ++ if (!tp->DASH) ++ goto exit; ++ ++ if (!tp->rtk_enable_diag) ++ goto exit; ++ ++ InformationBufferLength = prtl_dash_usrdata->len; ++ InformationBuffer = prtl_dash_usrdata->data_buffer; ++ ++ if (tp->SendingToFw == FALSE) ++ ulInfoLen = tp->AfterSendToFwBufLen + sizeof(u16); ++ else ++ ulInfoLen = sizeof(u16); ++ ++ if (InformationBufferLength < ulInfoLen) { ++ ret = -EFAULT; ++ goto exit; ++ } ++ ++ if (!(tmpBuf = kmalloc(ulInfoLen, GFP_ATOMIC))) { ++ ret = -ENOMEM; ++ goto exit; ++ } ++ ++ pWord = (u16*) tmpBuf; ++ if (tp->SendingToFw == FALSE) { ++ *pWord++ = tp->AfterSendToFwBufLen; ++ memcpy(pWord, tp->AfterSendToFwBuf, tp->AfterSendToFwBufLen); ++ tp->AfterSendToFwBufLen = 0; ++ } else { ++ *pWord = 0xffff; ++ } ++ ++ if (copy_to_user(InformationBuffer, tmpBuf, ulInfoLen)) ++ ret = -EFAULT; ++ 
else ++ ret = 0; ++ ++ kfree(tmpBuf); ++ ++exit: ++ return ret; ++} ++ ++static int DashIoctlCheckSendBufferToFw(struct net_device *dev, struct rtl_dash_ioctl_struct *prtl_dash_usrdata) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 ulInfoLen; ++ void *InformationBuffer; ++ u32 InformationBufferLength; ++ u16 *pWord; ++ u16 SetDataSize; ++ int ret = -EFAULT; ++ ++ if (!tp->DASH) ++ goto exit; ++ ++ if (!tp->rtk_enable_diag) ++ goto exit; ++ ++ InformationBufferLength = prtl_dash_usrdata->len; ++ if (!(InformationBuffer = kmalloc(InformationBufferLength, GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto exit; ++ } ++ ++ if (copy_from_user(InformationBuffer, prtl_dash_usrdata->data_buffer, ++ InformationBufferLength)) { ++ ret = -EFAULT; ++ goto free_mem; ++ } ++ ++ ulInfoLen = sizeof(u16) + sizeof(u16); ++ ++ if (InformationBufferLength < ulInfoLen) ++ goto free_mem; ++ ++ pWord = (u16*) InformationBuffer; ++ SetDataSize = *pWord++; ++ ++ if (InformationBufferLength < (SetDataSize + sizeof(u16) + sizeof(u16))) { ++ ret = -EFAULT; ++ goto free_mem; ++ } ++ ++ ret = SendToDashFw(tp, (u8*)pWord, SetDataSize); ++ ++free_mem: ++ kfree(InformationBuffer); ++ ++exit: ++ return ret; ++} ++ ++int rtl8125_dash_ioctl(struct net_device *dev, struct ifreq *ifr) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void *user_data = ifr->ifr_data; ++ struct rtl_dash_ioctl_struct rtl_dash_usrdata; ++ ++ int ret=0; ++ ++ if (FALSE == HW_DASH_SUPPORT_DASH(tp)) ++ return -EOPNOTSUPP; ++ ++ if (!tp->DASH) ++ return -EINVAL; ++ ++ if (copy_from_user(&rtl_dash_usrdata, user_data, ++ sizeof(struct rtl_dash_ioctl_struct))) ++ return -EFAULT; ++ ++ switch (rtl_dash_usrdata.cmd) { ++ case RTL_DASH_SEND_BUFFER_DATA_TO_DASH_FW: ++ ret = DashIoctlCheckSendBufferToFw(dev, &rtl_dash_usrdata); ++ break; ++ case RTL_DASH_CHECK_SEND_BUFFER_TO_DASH_FW_COMPLETE: ++ ret = DashIoctlCheckSendBufferToFwComplete(dev, ++ &rtl_dash_usrdata); ++ break; ++ case RTL_DASH_GET_RCV_FROM_FW_BUFFER_DATA: ++ ret = DashIoctlGetRcvFromFwData(dev, &rtl_dash_usrdata); ++ break; ++ case RTL_DASH_OOB_REQ: ++ tp->OobReq = TRUE; ++ tp->OobReqComplete = FALSE; ++ break; ++ case RTL_DASH_OOB_ACK: ++ tp->OobAck = TRUE; ++ tp->OobAckComplete = FALSE; ++ break; ++ case RTL_DASH_DETACH_OOB_REQ: ++ tp->OobReq = FALSE; ++ tp->OobReqComplete = FALSE; ++ break; ++ case RTL_DASH_DETACH_OOB_ACK: ++ tp->OobAck = FALSE; ++ tp->OobAckComplete = FALSE; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return ret; ++} +diff --git a/drivers/net/ethernet/realtek/r8125_dash.h b/drivers/net/ethernet/realtek/r8125_dash.h +new file mode 100755 +index 000000000000..1a4b7dae624c +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_dash.h +@@ -0,0 +1,196 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_R8125_DASH_H ++#define _LINUX_R8125_DASH_H ++ ++#include ++ ++#define SIOCDEVPRIVATE_RTLDASH SIOCDEVPRIVATE+2 ++ ++enum rtl_dash_cmd { ++ RTL_DASH_ARP_NS_OFFLOAD = 0, ++ RTL_DASH_SET_OOB_IPMAC, ++ RTL_DASH_NOTIFY_OOB, ++ ++ RTL_DASH_SEND_BUFFER_DATA_TO_DASH_FW, ++ RTL_DASH_CHECK_SEND_BUFFER_TO_DASH_FW_COMPLETE, ++ RTL_DASH_GET_RCV_FROM_FW_BUFFER_DATA, ++ RTL_DASH_OOB_REQ, ++ RTL_DASH_OOB_ACK, ++ RTL_DASH_DETACH_OOB_REQ, ++ RTL_DASH_DETACH_OOB_ACK, ++ ++ RTL_FW_SET_IPV4 = 0x10, ++ RTL_FW_GET_IPV4, ++ RTL_FW_SET_IPV6, ++ RTL_FW_GET_IPV6, ++ RTL_FW_SET_EXT_SNMP, ++ RTL_FW_GET_EXT_SNMP, ++ RTL_FW_SET_WAKEUP_PATTERN, ++ RTL_FW_GET_WAKEUP_PATTERN, ++ RTL_FW_DEL_WAKEUP_PATTERN, ++ ++ RTLT_DASH_COMMAND_INVALID, ++}; ++ ++struct rtl_dash_ip_mac { ++ struct sockaddr ifru_addr; ++ struct sockaddr ifru_netmask; ++ struct sockaddr ifru_hwaddr; ++}; ++ ++struct rtl_dash_ioctl_struct { ++ __u32 cmd; ++ __u32 offset; ++ __u32 len; ++ union { ++ __u32 data; ++ void *data_buffer; ++ }; ++}; ++ ++typedef struct _OSOOBHdr { ++ __le32 len; ++ u8 type; ++ u8 flag; ++ u8 hostReqV; ++ u8 res; ++} ++OSOOBHdr, *POSOOBHdr; ++ ++typedef struct _RX_DASH_BUFFER_TYPE_2 { ++ OSOOBHdr oobhdr; ++ u8 RxDataBuffer[0]; ++} ++RX_DASH_BUFFER_TYPE_2, *PRX_DASH_BUFFER_TYPE_2; ++ ++#define ALIGN_8 (0x7) ++#define ALIGN_16 (0xf) ++#define ALIGN_32 (0x1f) ++#define ALIGN_64 (0x3f) ++#define ALIGN_256 (0xff) ++#define ALIGN_4096 (0xfff) ++ ++#define OCP_REG_FIRMWARE_MAJOR_VERSION (0x120) ++ ++#define HW_DASH_SUPPORT_DASH(_M) ((_M)->HwSuppDashVer > 0) ++#define HW_DASH_SUPPORT_TYPE_1(_M) ((_M)->HwSuppDashVer == 1) ++#define HW_DASH_SUPPORT_TYPE_2(_M) ((_M)->HwSuppDashVer == 2) ++#define HW_DASH_SUPPORT_TYPE_3(_M) ((_M)->HwSuppDashVer == 3) ++#define HW_DASH_SUPPORT_TYPE_4(_M) ((_M)->HwSuppDashVer == 4) ++#define HW_DASH_SUPPORT_CMAC(_M) (HW_DASH_SUPPORT_TYPE_2(_M) || HW_DASH_SUPPORT_TYPE_3(_M)) ++#define HW_DASH_SUPPORT_IPC2(_M) (HW_DASH_SUPPORT_TYPE_4(_M)) ++#define HW_DASH_SUPPORT_GET_FIRMWARE_VERSION(_M) (HW_DASH_SUPPORT_TYPE_2(_M) || \ ++ HW_DASH_SUPPORT_TYPE_3(_M) || \ ++ HW_DASH_SUPPORT_TYPE_4(_M)) ++ ++#define RECV_FROM_FW_BUF_SIZE (1520) ++#define SEND_TO_FW_BUF_SIZE (1520) ++ ++#define TXS_CC3_0 (BIT_0|BIT_1|BIT_2|BIT_3) ++#define TXS_EXC BIT_4 ++#define TXS_LNKF BIT_5 ++#define TXS_OWC BIT_6 ++#define TXS_TES BIT_7 ++#define TXS_UNF BIT_9 ++#define TXS_LGSEN BIT_11 ++#define TXS_LS BIT_12 ++#define TXS_FS BIT_13 ++#define TXS_EOR BIT_14 ++#define TXS_OWN BIT_15 ++ ++#define TPPool_HRDY 0x20 ++ ++#define RXS_OWN BIT_15 ++#define RXS_EOR BIT_14 ++#define RXS_FS BIT_13 ++#define RXS_LS BIT_12 ++ ++#define ISRIMR_DASH_INTR_EN BIT_12 ++ ++#define NO_BASE_ADDRESS 0x00000000 ++ ++/* IB2SOC registers */ ++#define IPC2_SWISR_DRIVER_READY 0x05 ++#define IPC2_SWISR_DRIVER_EXIT 0x06 ++#define IPC2_SWISR_CLIENTTOOL_SYNC_HOSTNAME 0x20 ++#define 
IPC2_SWISR_DIS_DASH 0x55 ++#define IPC2_SWISR_EN_DASH 0x56 ++ ++#define IPC2_IB2SOC_SET 0x10 ++#define IPC2_IB2SOC_DATA 0x14 ++#define IPC2_IB2SOC_CMD 0x18 ++#define IPC2_IB2SOC_IMR 0x1C ++ ++/* IPC2 registers */ ++#define IPC2_PCIE_BASE 0xC100 ++#define IPC2_TX_SET_REG IPC2_PCIE_BASE ++#define IPC2_TX_STATUS_REG (IPC2_PCIE_BASE+0x04) ++#define IPC2_RX_STATUS_REG (IPC2_PCIE_BASE+0x08) ++#define IPC2_RX_CLEAR_REG (IPC2_PCIE_BASE+0x0C) ++#define IPC2_DATA_BASE 0x32000 ++#define IPC2_BUFFER_LENGTH 0x1000 ++#define IPC2_DATA_MASTER IPC2_DATA_BASE //dash tx buffer base ++#define IPC2_DATA_SLAVE (IPC2_DATA_BASE+IPC2_BUFFER_LENGTH) //dash rx buffer base ++#define IPC2_TX_BUFFER IPC2_DATA_MASTER ++#define IPC2_RX_BUFFER IPC2_DATA_SLAVE ++ ++#define IPC2_TX_SEND_BIT BIT_0 ++#define IPC2_TX_ACK_BIT BIT_8 ++#define IPC2_RX_ROK_BIT BIT_0 ++#define IPC2_RX_ACK_BIT BIT_8 ++ ++/* IPC2 write/read MMIO register */ ++#define RTL_DASH_IPC2_W8(tp, reg, val8) RTL_W8(tp, reg, val8) ++#define RTL_DASH_IPC2_W16(tp, reg, val16) RTL_W16(tp, reg, val16) ++#define RTL_DASH_IPC2_W32(tp, reg, val32) RTL_W32(tp, reg, val32) ++#define RTL_DASH_IPC2_R8(tp, reg) RTL_R8(tp, reg) ++#define RTL_DASH_IPC2_R16(tp, reg) RTL_R16(tp, reg) ++#define RTL_DASH_IPC2_R32(tp, reg) RTL_R32(tp, reg) ++ ++/* DASH OOB Header Type */ ++#define DASH_OOB_HDR_TYPE_REQ 0x91 ++#define DASH_OOB_HDR_TYPE_ACK 0x92 ++ ++struct rtl8125_private; ++ ++int rtl8125_dash_ioctl(struct net_device *dev, struct ifreq *ifr); ++bool rtl8125_check_dash_interrupt(struct rtl8125_private *tp); ++void rtl8125_handle_dash_interrupt(struct net_device *dev); ++void rtl8125_clear_ipc2_isr(struct rtl8125_private *tp); ++void rtl8125_set_ipc2_soc_imr_bit(struct rtl8125_private *tp, u16 mask); ++void rtl8125_clear_ipc2_soc_imr_bit(struct rtl8125_private *tp, u16 mask); ++ ++#endif /* _LINUX_R8125_DASH_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_fiber.c b/drivers/net/ethernet/realtek/r8125_fiber.c +new file mode 100755 +index 000000000000..76527719bf9c +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_fiber.c +@@ -0,0 +1,464 @@ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++ ++#include "r8125.h" ++#include "r8125_fiber.h" ++ ++static void ++rtl8125_fiber_set_mdc_gpio_c45(struct rtl8125_private *tp, bool pu) ++{ ++ if (pu) ++ rtl8125_set_mac_ocp_bit(tp, 0xDC52, BIT_7); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xDC52, BIT_7); ++ ++ //RtPciCommittp); ++} ++ ++static void ++rtl8125_fiber_set_mdc(struct rtl8125_private *tp, bool pu) ++{ ++ rtl8125_fiber_set_mdc_gpio_c45(tp, pu); ++} ++ ++static void ++rtl8125_fiber_set_mdcDownUp(struct rtl8125_private *tp) ++{ ++ udelay(1); ++ rtl8125_fiber_set_mdc(tp, 0); ++ udelay(1); ++ rtl8125_fiber_set_mdc(tp, 1); ++} ++ ++static void ++rtl8125_fiber_set_mdio_bit_gpio_c45(struct rtl8125_private *tp, bool pu) ++{ ++ if (pu) ++ rtl8125_set_mac_ocp_bit(tp, 0xDC52, BIT_2); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xDC52, BIT_2); ++ ++ //RtPciCommittp); ++ ++ rtl8125_fiber_set_mdcDownUp(tp); ++} ++ ++static void ++rtl8125_fiber_set_mdio_bit(struct rtl8125_private *tp, bool pu) ++{ ++ rtl8125_fiber_set_mdio_bit_gpio_c45(tp, pu); ++} ++ ++static u16 ++rtl8125_fiber_get_mdio_bit_gpio_c45(struct rtl8125_private *tp) ++{ ++ rtl8125_fiber_set_mdcDownUp(tp); ++ ++ return !!(rtl8125_mac_ocp_read(tp, 0xDC58) & BIT(2)); ++} ++ ++static u16 ++rtl8125_fiber_get_mdio_bit(struct rtl8125_private *tp) ++{ ++ return rtl8125_fiber_get_mdio_bit_gpio_c45(tp); ++} ++ ++static void ++rtl8125_fiber_shift_bit_in(struct rtl8125_private *tp, u32 val, int count) ++{ ++ int i; ++ ++ for (i = (count - 1); i >= 0; i--) ++ rtl8125_fiber_set_mdio_bit(tp, !!(val & BIT(i))); ++} ++ ++static u16 ++rtl8125_fiber_shift_bit_out(struct rtl8125_private *tp) ++{ ++ u16 data = 0; ++ int i; ++ ++ for (i = 15; i >= 0; i--) ++ data += (rtl8125_fiber_get_mdio_bit(tp) << i); ++ ++ return data; ++} ++ ++static void ++rtl8125_fiber_dir_gpio_c45(struct rtl8125_private *tp, bool output_mode) ++{ ++ if (output_mode) ++ rtl8125_set_mac_ocp_bit(tp, 0xDC4C, BIT_2); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xDC4C, BIT_2); ++} ++ ++static void ++rtl8125_fiber_dir(struct rtl8125_private *tp, bool output_mode) ++{ ++ rtl8125_fiber_dir_gpio_c45(tp, output_mode); ++} ++ ++//fiber ++#define R8125_FIBER_C22 (0) ++#define R8125_FIBER_C45 (1) ++ ++// sfp opcodes ++#define R8125_FIBER_ST (1) ++#define R8125_FIBER_OP_W (1) ++#define R8125_FIBER_OP_R (2) ++#define R8125_FIBER_TA (2) ++ ++// sfp C45 opcodes ++#define R8125_FIBER_MDIO_C45 (BIT(15)) ++#define R8125_FIBER_C45_ST (R8125_FIBER_MDIO_C45 | 0) ++#define R8125_FIBER_C45_OP_ADDR (R8125_FIBER_MDIO_C45 | 0) ++#define R8125_FIBER_C45_OP_W (R8125_FIBER_MDIO_C45 | 1) ++#define R8125_FIBER_C45_OP_R (R8125_FIBER_MDIO_C45 | 3) ++ ++static void ++rtl8125_fiber_cmd(struct rtl8125_private *tp, u32 cmd, u8 phy_addr, ++ u32 reg) ++{ ++ /* change to output mode */ ++ rtl8125_fiber_dir(tp, 1); ++ ++ /* preamble 32bit of 1 */ ++ rtl8125_fiber_shift_bit_in(tp, UINT_MAX, 32); ++ ++ /* start bit */ ++ if (cmd & R8125_FIBER_MDIO_C45) ++ rtl8125_fiber_shift_bit_in(tp, R8125_FIBER_C45_ST, 2); ++ else ++ rtl8125_fiber_shift_bit_in(tp, R8125_FIBER_ST, 2); ++ ++ /* opcode */ ++ rtl8125_fiber_shift_bit_in(tp, cmd, 2); ++ ++ /* phy address */ ++ rtl8125_fiber_shift_bit_in(tp, phy_addr, 5); ++ ++ /* phy reg */ ++ rtl8125_fiber_shift_bit_in(tp, reg, 5); ++} ++ ++static u8 ++rtl8125_fiber_cmdAddr(struct rtl8125_private *tp, u8 phy_addr, u32 reg) ++{ ++ u8 dev_addr = (reg >> 16) & 0x1F; ++ u16 addr = (u16)reg; ++ ++ rtl8125_fiber_cmd(tp, 
R8125_FIBER_C45_OP_ADDR, phy_addr, dev_addr); ++ ++ /* turn-around(TA) */ ++ rtl8125_fiber_shift_bit_in(tp, R8125_FIBER_TA, 2); ++ ++ rtl8125_fiber_shift_bit_in(tp, addr, 16); ++ ++ rtl8125_fiber_dir(tp, 0); ++ ++ rtl8125_fiber_get_mdio_bit(tp); ++ ++ return dev_addr; ++} ++ ++static void ++rtl8125_fiber_reset_gpio_c45(struct rtl8125_private *tp) ++{ ++ rtl8125_set_mac_ocp_bit(tp, 0xDC4C, (BIT_7 | BIT_2)); ++ ++ /* init sfp interface */ ++ rtl8125_clear_mac_ocp_bit(tp, 0xDC52, BIT_7); ++ rtl8125_set_mac_ocp_bit(tp, 0xDC52, BIT_2); ++} ++ ++static void ++rtl8125_fiber_write_common(struct rtl8125_private *tp, u16 val) ++{ ++ /* turn-around(TA) */ ++ rtl8125_fiber_shift_bit_in(tp, R8125_FIBER_TA, 2); ++ ++ /* write phy data */ ++ rtl8125_fiber_shift_bit_in(tp, val, 16); ++ ++ /* change to input mode */ ++ rtl8125_fiber_dir(tp, 0); ++ ++ rtl8125_fiber_get_mdio_bit(tp); ++} ++ ++static void ++rtl8125_fiber_mdio_write_gpio_c45( ++ struct rtl8125_private *tp, ++ u32 reg, ++ u16 val, ++ u8 phy_addr) ++{ ++ /* opcode write */ ++ reg = rtl8125_fiber_cmdAddr(tp, phy_addr, reg); ++ rtl8125_fiber_cmd(tp, R8125_FIBER_C45_OP_W, phy_addr, reg); ++ ++ rtl8125_fiber_write_common(tp, val); ++} ++ ++static u16 ++rtl8125_fiber_read_common(struct rtl8125_private *tp) ++{ ++ u16 data = 0; ++ ++ /* change to input mode */ ++ rtl8125_fiber_dir(tp, 0); ++ ++ /* TA 0 */ ++ rtl8125_fiber_get_mdio_bit(tp); ++ ++ /* read phy data */ ++ data = rtl8125_fiber_shift_bit_out(tp); ++ ++ rtl8125_fiber_get_mdio_bit(tp); ++ ++ return data; ++} ++ ++static u16 ++rtl8125_fiber_mdio_read_gpio_c45( ++ struct rtl8125_private *tp, ++ u32 reg, ++ u8 phy_addr) ++{ ++ reg = rtl8125_fiber_cmdAddr(tp, phy_addr, reg); ++ rtl8125_fiber_cmd(tp, R8125_FIBER_C45_OP_R, phy_addr, reg); ++ ++ return rtl8125_fiber_read_common(tp); ++} ++ ++void ++rtl8125_fiber_mdio_write( ++ struct rtl8125_private *tp, ++ u32 reg, ++ u16 val) ++{ ++ switch(tp->HwFiberStat) { ++ case FIBER_STAT_CONNECT_GPO_C45: ++ return rtl8125_fiber_mdio_write_gpio_c45(tp, reg, val, 0); ++ default: ++ return; ++ } ++} ++ ++u16 ++rtl8125_fiber_mdio_read( ++ struct rtl8125_private *tp, ++ u32 reg) ++{ ++ switch(tp->HwFiberStat) { ++ case FIBER_STAT_CONNECT_GPO_C45: ++ return rtl8125_fiber_mdio_read_gpio_c45(tp, reg, 0); ++ default: ++ return 0xffff; ++ } ++} ++ ++static void ++rtl8125_fiber_clear_and_set_phy_bit(struct rtl8125_private *tp, u32 addr, u16 clearmask, u16 setmask) ++{ ++ u16 PhyRegValue; ++ ++ PhyRegValue = rtl8125_fiber_mdio_read(tp, addr); ++ PhyRegValue &= ~clearmask; ++ PhyRegValue |= setmask; ++ rtl8125_fiber_mdio_write(tp, addr, PhyRegValue); ++} ++ ++static void ++rtl8125_fiber_clear_phy_bit(struct rtl8125_private *tp, u32 addr, u16 mask) ++{ ++ rtl8125_fiber_clear_and_set_phy_bit(tp, addr, mask, 0); ++} ++ ++static void ++rtl8125_fiber_set_phy_bit(struct rtl8125_private *tp, u32 addr, u16 mask) ++{ ++ rtl8125_fiber_clear_and_set_phy_bit(tp, addr, 0, mask); ++} ++ ++#define R8125_MAKE_C45_ADDR(_mmd, _addr) (_mmd << 16 | _addr) ++ ++static void ++rtl8125_fiber_phy_reset_8221d(struct rtl8125_private *tp) ++{ ++ u16 PhyRegValue; ++ u32 Timeout; ++ ++ rtl8125_fiber_set_phy_bit(tp, R8125_MAKE_C45_ADDR(0x01, 0x00), BIT_15); ++ ++ Timeout = 0; ++ do { ++ udelay(1000); ++ ++ PhyRegValue = rtl8125_fiber_mdio_read(tp, R8125_MAKE_C45_ADDR(0x01, 0x00)); ++ ++ Timeout++; ++ } while ((PhyRegValue & BIT_15) && (Timeout < 20)); ++} ++ ++static void ++rtl8125_fiber_phy_reset(struct rtl8125_private *tp) ++{ ++ switch (tp->HwFiberModeVer) { ++ case FIBER_MODE_RTL8125D_RTL8221D: ++ 
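++ /* RTL8221D companion fiber PHY: rtl8125_fiber_phy_reset_8221d() sets the
++  * reset bit (bit 15 of C45 device 1, register 0) over the bit-banged GPIO
++  * MDIO bus and polls it in 1 ms steps, giving up after about 20 ms. */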
rtl8125_fiber_phy_reset_8221d(tp); ++ break; ++ } ++} ++ ++static void ++rtl8125_fiber_set_rtl8221d_phy_mode(struct rtl8125_private *tp, u16 mode) ++{ ++ mode &= 0x3f; ++ ++ rtl8125_fiber_clear_phy_bit(tp, R8125_MAKE_C45_ADDR(30, 0x75F3), BIT_0); ++ rtl8125_fiber_clear_and_set_phy_bit(tp, ++ R8125_MAKE_C45_ADDR(30, 0x697A), ++ 0x003F, ++ mode); ++} ++ ++static void ++rtl8125_fiber_set_phy_mode(struct rtl8125_private *tp, u16 mode) ++{ ++ switch (tp->HwFiberModeVer) { ++ case FIBER_MODE_RTL8125D_RTL8221D: ++ rtl8125_fiber_set_rtl8221d_phy_mode(tp, mode); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++rtl8125_hw_rtl8221d_phy_config(struct rtl8125_private *tp) ++{ ++ rtl8125_fiber_reset_gpio_c45(tp); ++ ++ rtl8125_fiber_set_phy_mode(tp, (tp->speed == SPEED_2500) ? 0x02 : 0x04); ++ ++ ++ rtl8125_fiber_clear_phy_bit(tp, R8125_MAKE_C45_ADDR(0x07, 0x3C), (BIT_2 | BIT_1)); ++ rtl8125_fiber_clear_phy_bit(tp, R8125_MAKE_C45_ADDR(0x07, 0x3E), (BIT_1 | BIT_0)); ++ ++ ++ rtl8125_fiber_phy_reset(tp); ++} ++ ++void ++rtl8125_hw_fiber_phy_config(struct rtl8125_private *tp) ++{ ++ switch (tp->HwFiberModeVer) { ++ case FIBER_MODE_RTL8125D_RTL8221D: ++ rtl8125_hw_rtl8221d_phy_config(tp); ++ break; ++ default: ++ break; ++ } ++} ++ ++#define RTL8221D_PHY_ID_1 0x001C ++#define RTL8221D_PHY_ID_2 0xC849 ++static u32 ++rtl8125_fiber_get_connect_status_8221d(struct rtl8125_private *tp) ++{ ++ int i; ++ int const checkcnt = 4; ++ ++ rtl8125_fiber_reset_gpio_c45(tp); ++ ++ for (i = 0; i < checkcnt; i++) { ++ if (RTL8221D_PHY_ID_1 != rtl8125_fiber_mdio_read_gpio_c45(tp, R8125_MAKE_C45_ADDR(0x01, 0x02), 0) || ++ RTL8221D_PHY_ID_2 != rtl8125_fiber_mdio_read_gpio_c45(tp, R8125_MAKE_C45_ADDR(0x01, 0x03), 0)) ++ return FIBER_STAT_DISCONNECT; ++ } ++ ++ return FIBER_STAT_CONNECT_GPO_C45; ++} ++ ++static u32 ++rtl8125_fiber_get_connect_status(struct rtl8125_private *tp) ++{ ++ switch (tp->HwFiberModeVer) { ++ case FIBER_MODE_RTL8125D_RTL8221D: ++ return rtl8125_fiber_get_connect_status_8221d(tp); ++ default: ++ return FIBER_STAT_NOT_CHECKED; ++ } ++} ++ ++void ++rtl8125_check_fiber_mode_support(struct rtl8125_private *tp) ++{ ++ switch(tp->mcfg) { ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: { ++ u8 tmp = (u8)rtl8125_mac_ocp_read(tp, 0xD006); ++ if (tmp == 0x03) ++ tp->HwFiberModeVer = FIBER_MODE_RTL8125D_RTL8221D; ++ } ++ break; ++ } ++ ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ tp->HwFiberStat = rtl8125_fiber_get_connect_status(tp); ++} ++ ++unsigned int ++rtl8125_fiber_link_ok(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 status; ++ ++ switch (tp->HwFiberStat) { ++ case FIBER_STAT_CONNECT_GPO_C45: ++ status = rtl8125_fiber_mdio_read(tp, R8125_MAKE_C45_ADDR(30, 0x758D)); ++ if (status != USHRT_MAX && status & BIT_1) ++ return 1; ++ else ++ return 0; ++ break; ++ default: ++ return 0; ++ } ++} +diff --git a/drivers/net/ethernet/realtek/r8125_fiber.h b/drivers/net/ethernet/realtek/r8125_fiber.h +new file mode 100755 +index 000000000000..3a328574f291 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_fiber.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. 
++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_R8125_FIBER_H ++#define _LINUX_R8125_FIBER_H ++ ++enum { ++ FIBER_MODE_NIC_ONLY = 0, ++ FIBER_MODE_RTL8125D_RTL8221D, ++ FIBER_MODE_MAX ++}; ++ ++enum { ++ FIBER_STAT_NOT_CHECKED = 0, ++ FIBER_STAT_DISCONNECT, ++ FIBER_STAT_CONNECT_GPO_C45, ++ FIBER_STAT_MAX ++}; ++ ++#define HW_FIBER_MODE_ENABLED(_M) ((_M)->HwFiberModeVer > 0) ++#define HW_FIBER_STATUS_CONNECTED(_M) (((_M)->HwFiberStat == FIBER_STAT_CONNECT_GPO_C45)) ++#define HW_FIBER_STATUS_DISCONNECTED(_M) ((_M)->HwFiberStat == FIBER_STAT_DISCONNECT) ++ ++struct rtl8125_private; ++ ++void rtl8125_hw_fiber_phy_config(struct rtl8125_private *tp); ++void rtl8125_check_fiber_mode_support(struct rtl8125_private *tp); ++void rtl8125_fiber_mdio_write( struct rtl8125_private *tp, u32 reg, u16 val); ++u16 rtl8125_fiber_mdio_read(struct rtl8125_private *tp, u32 reg); ++unsigned int rtl8125_fiber_link_ok(struct net_device *dev); ++ ++#endif /* _LINUX_R8125_FIBER_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_firmware.c b/drivers/net/ethernet/realtek/r8125_firmware.c +new file mode 100755 +index 000000000000..313c7d91b1c3 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_firmware.c +@@ -0,0 +1,264 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 
2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++ ++#include "r8125_firmware.h" ++ ++enum rtl_fw_opcode { ++ PHY_READ = 0x0, ++ PHY_DATA_OR = 0x1, ++ PHY_DATA_AND = 0x2, ++ PHY_BJMPN = 0x3, ++ PHY_MDIO_CHG = 0x4, ++ PHY_CLEAR_READCOUNT = 0x7, ++ PHY_WRITE = 0x8, ++ PHY_READCOUNT_EQ_SKIP = 0x9, ++ PHY_COMP_EQ_SKIPN = 0xa, ++ PHY_COMP_NEQ_SKIPN = 0xb, ++ PHY_WRITE_PREVIOUS = 0xc, ++ PHY_SKIPN = 0xd, ++ PHY_DELAY_MS = 0xe, ++}; ++ ++struct fw_info { ++ u32 magic; ++ char version[RTL8125_VER_SIZE]; ++ __le32 fw_start; ++ __le32 fw_len; ++ u8 chksum; ++} __packed; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0) ++#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) ++#endif ++#define FW_OPCODE_SIZE sizeof_field(struct rtl8125_fw_phy_action, code[0]) ++ ++static bool rtl8125_fw_format_ok(struct rtl8125_fw *rtl_fw) ++{ ++ const struct firmware *fw = rtl_fw->fw; ++ struct fw_info *fw_info = (struct fw_info *)fw->data; ++ struct rtl8125_fw_phy_action *pa = &rtl_fw->phy_action; ++ ++ if (fw->size < FW_OPCODE_SIZE) ++ return false; ++ ++ if (!fw_info->magic) { ++ size_t i, size, start; ++ u8 checksum = 0; ++ ++ if (fw->size < sizeof(*fw_info)) ++ return false; ++ ++ for (i = 0; i < fw->size; i++) ++ checksum += fw->data[i]; ++ if (checksum != 0) ++ return false; ++ ++ start = le32_to_cpu(fw_info->fw_start); ++ if (start > fw->size) ++ return false; ++ ++ size = le32_to_cpu(fw_info->fw_len); ++ if (size > (fw->size - start) / FW_OPCODE_SIZE) ++ return false; ++ ++ strscpy(rtl_fw->version, fw_info->version, RTL8125_VER_SIZE); ++ ++ pa->code = (__le32 *)(fw->data + start); ++ pa->size = size; ++ } else { ++ if (fw->size % FW_OPCODE_SIZE) ++ return false; ++ ++ strscpy(rtl_fw->version, rtl_fw->fw_name, RTL8125_VER_SIZE); ++ ++ pa->code = (__le32 *)fw->data; ++ pa->size = fw->size / FW_OPCODE_SIZE; ++ } ++ ++ return true; ++} ++ ++static bool rtl8125_fw_data_ok(struct rtl8125_fw *rtl_fw) ++{ ++ struct rtl8125_fw_phy_action *pa = &rtl_fw->phy_action; ++ size_t index; ++ ++ for (index = 0; index < pa->size; index++) { ++ u32 action = le32_to_cpu(pa->code[index]); ++ u32 val = action & 0x0000ffff; ++ u32 regno = (action & 0x0fff0000) >> 16; ++ ++ switch (action >> 28) { ++ case PHY_READ: ++ case PHY_DATA_OR: ++ case PHY_DATA_AND: ++ case PHY_CLEAR_READCOUNT: ++ case PHY_WRITE: ++ case PHY_WRITE_PREVIOUS: ++ case PHY_DELAY_MS: ++ break; ++ ++ case PHY_MDIO_CHG: ++ if (val > 1) ++ goto out; ++ break; ++ ++ case PHY_BJMPN: ++ if (regno > index) ++ goto out; ++ break; ++ case PHY_READCOUNT_EQ_SKIP: ++ if (index + 2 >= pa->size) ++ goto out; ++ break; ++ case PHY_COMP_EQ_SKIPN: ++ case PHY_COMP_NEQ_SKIPN: ++ case PHY_SKIPN: ++ if (index + 1 + regno >= pa->size) ++ goto out; ++ break; ++ ++ default: ++ dev_err(rtl_fw->dev, "Invalid action 0x%08x\n", action); ++ return false; ++ } ++ } ++ ++ return true; ++out: ++ dev_err(rtl_fw->dev, "Out of range of firmware\n"); ++ return false; ++} ++ ++void rtl8125_fw_write_firmware(struct rtl8125_private *tp, struct rtl8125_fw *rtl_fw) ++{ ++ struct rtl8125_fw_phy_action *pa = &rtl_fw->phy_action; ++ rtl8125_fw_write_t 
fw_write = rtl_fw->phy_write; ++ rtl8125_fw_read_t fw_read = rtl_fw->phy_read; ++ int predata = 0, count = 0; ++ size_t index; ++ ++ for (index = 0; index < pa->size; index++) { ++ u32 action = le32_to_cpu(pa->code[index]); ++ u32 data = action & 0x0000ffff; ++ u32 regno = (action & 0x0fff0000) >> 16; ++ enum rtl_fw_opcode opcode = action >> 28; ++ ++ if (!action) ++ break; ++ ++ switch (opcode) { ++ case PHY_READ: ++ predata = fw_read(tp, regno); ++ count++; ++ break; ++ case PHY_DATA_OR: ++ predata |= data; ++ break; ++ case PHY_DATA_AND: ++ predata &= data; ++ break; ++ case PHY_BJMPN: ++ index -= (regno + 1); ++ break; ++ case PHY_MDIO_CHG: ++ if (data) { ++ fw_write = rtl_fw->mac_mcu_write; ++ fw_read = rtl_fw->mac_mcu_read; ++ } else { ++ fw_write = rtl_fw->phy_write; ++ fw_read = rtl_fw->phy_read; ++ } ++ ++ break; ++ case PHY_CLEAR_READCOUNT: ++ count = 0; ++ break; ++ case PHY_WRITE: ++ fw_write(tp, regno, data); ++ break; ++ case PHY_READCOUNT_EQ_SKIP: ++ if (count == data) ++ index++; ++ break; ++ case PHY_COMP_EQ_SKIPN: ++ if (predata == data) ++ index += regno; ++ break; ++ case PHY_COMP_NEQ_SKIPN: ++ if (predata != data) ++ index += regno; ++ break; ++ case PHY_WRITE_PREVIOUS: ++ fw_write(tp, regno, predata); ++ break; ++ case PHY_SKIPN: ++ index += regno; ++ break; ++ case PHY_DELAY_MS: ++ mdelay(1 * data); ++ break; ++ } ++ } ++} ++ ++void rtl8125_fw_release_firmware(struct rtl8125_fw *rtl_fw) ++{ ++ release_firmware(rtl_fw->fw); ++} ++ ++int rtl8125_fw_request_firmware(struct rtl8125_fw *rtl_fw) ++{ ++ int rc; ++ ++ rc = request_firmware(&rtl_fw->fw, rtl_fw->fw_name, rtl_fw->dev); ++ if (rc < 0) ++ goto out; ++ ++ if (!rtl8125_fw_format_ok(rtl_fw) || !rtl8125_fw_data_ok(rtl_fw)) { ++ release_firmware(rtl_fw->fw); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ return 0; ++out: ++ dev_err(rtl_fw->dev, "Unable to load firmware %s (%d)\n", ++ rtl_fw->fw_name, rc); ++ return rc; ++} +diff --git a/drivers/net/ethernet/realtek/r8125_firmware.h b/drivers/net/ethernet/realtek/r8125_firmware.h +new file mode 100755 +index 000000000000..540c1d22f281 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_firmware.h +@@ -0,0 +1,68 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 
2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_rtl8125_FIRMWARE_H ++#define _LINUX_rtl8125_FIRMWARE_H ++ ++#include ++#include ++ ++struct rtl8125_private; ++typedef void (*rtl8125_fw_write_t)(struct rtl8125_private *tp, u16 reg, u16 val); ++typedef u32 (*rtl8125_fw_read_t)(struct rtl8125_private *tp, u16 reg); ++ ++#define RTL8125_VER_SIZE 32 ++ ++struct rtl8125_fw { ++ rtl8125_fw_write_t phy_write; ++ rtl8125_fw_read_t phy_read; ++ rtl8125_fw_write_t mac_mcu_write; ++ rtl8125_fw_read_t mac_mcu_read; ++ const struct firmware *fw; ++ const char *fw_name; ++ struct device *dev; ++ ++ char version[RTL8125_VER_SIZE]; ++ ++ struct rtl8125_fw_phy_action { ++ __le32 *code; ++ size_t size; ++ } phy_action; ++}; ++ ++int rtl8125_fw_request_firmware(struct rtl8125_fw *rtl_fw); ++void rtl8125_fw_release_firmware(struct rtl8125_fw *rtl_fw); ++void rtl8125_fw_write_firmware(struct rtl8125_private *tp, struct rtl8125_fw *rtl_fw); ++ ++#endif /* _LINUX_rtl8125_FIRMWARE_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_n.c b/drivers/net/ethernet/realtek/r8125_n.c +new file mode 100755 +index 000000000000..3d775975bfc4 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_n.c +@@ -0,0 +1,21312 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++/* ++ * This driver is modified from r8169.c in Linux kernel 2.6.18 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) ++#include ++#include ++#endif ++#include ++#include ++#include ++#include ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0) ++#include ++#endif ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37) ++#include ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++#define dev_printk(A,B,fmt,args...) printk(A fmt,##args) ++#else ++#include ++#include ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++#include ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,4,10) ++#include ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,4,10) */ ++ ++#include ++#include ++ ++#include "r8125.h" ++#include "rtl_eeprom.h" ++#include "rtltool.h" ++#include "r8125_firmware.h" ++ ++#ifdef ENABLE_R8125_PROCFS ++#include ++#include ++#endif ++ ++#define FIRMWARE_8125A_3 "rtl_nic/rtl8125a-3.fw" ++#define FIRMWARE_8125B_2 "rtl_nic/rtl8125b-2.fw" ++#define FIRMWARE_8125BP_1 "rtl_nic/rtl8125bp-1.fw" ++#define FIRMWARE_8125BP_2 "rtl_nic/rtl8125bp-2.fw" ++#define FIRMWARE_8125D_1 "rtl_nic/rtl8125d-1.fw" ++#define FIRMWARE_8125D_2 "rtl_nic/rtl8125d-2.fw" ++#define FIRMWARE_8125CP_1 "rtl_nic/rtl8125cp-1.fw" ++ ++static const struct { ++ const char *name; ++ const char *fw_name; ++} rtl_chip_fw_infos[] = { ++ /* PCI-E devices. */ ++ [CFG_METHOD_2] = {"RTL8125A" }, ++ [CFG_METHOD_3] = {"RTL8125A", FIRMWARE_8125A_3}, ++ [CFG_METHOD_4] = {"RTL8125B", }, ++ [CFG_METHOD_5] = {"RTL8125B", FIRMWARE_8125B_2}, ++ [CFG_METHOD_6] = {"RTL8168KB", FIRMWARE_8125A_3}, ++ [CFG_METHOD_7] = {"RTL8168KB", FIRMWARE_8125B_2}, ++ [CFG_METHOD_8] = {"RTL8125BP", FIRMWARE_8125BP_1}, ++ [CFG_METHOD_9] = {"RTL8125BP", FIRMWARE_8125BP_2}, ++ [CFG_METHOD_10] = {"RTL8125D", FIRMWARE_8125D_1}, ++ [CFG_METHOD_11] = {"RTL8125D", FIRMWARE_8125D_2}, ++ [CFG_METHOD_12] = {"RTL8125CP", FIRMWARE_8125CP_1}, ++ [CFG_METHOD_13] = {"RTL8168KD", FIRMWARE_8125D_2}, ++ [CFG_METHOD_DEFAULT] = {"Unknown", }, ++}; ++ ++#define _R(NAME,MAC,RCR,MASK,JumFrameSz) \ ++ { .name = NAME, .mcfg = MAC, .RCR_Cfg = RCR, .RxConfigMask = MASK, .jumbo_frame_sz = JumFrameSz } ++ ++static const struct { ++ const char *name; ++ u8 mcfg; ++ u32 RCR_Cfg; ++ u32 RxConfigMask; /* Clears the bits supported by this chip */ ++ u32 jumbo_frame_sz; ++} rtl_chip_info[] = { ++ _R("RTL8125A", ++ CFG_METHOD_2, ++ Rx_Fetch_Number_8 | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125A", ++ CFG_METHOD_3, ++ Rx_Fetch_Number_8 | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125B", ++ CFG_METHOD_4, ++ Rx_Fetch_Number_8 | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125B", ++ CFG_METHOD_5, ++ Rx_Fetch_Number_8 | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8168KB", ++ CFG_METHOD_6, ++ Rx_Fetch_Number_8 | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ 
_R("RTL8168KB", ++ CFG_METHOD_7, ++ Rx_Fetch_Number_8 | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125BP", ++ CFG_METHOD_8, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125BP", ++ CFG_METHOD_9, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125D", ++ CFG_METHOD_10, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125D", ++ CFG_METHOD_11, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125CP", ++ CFG_METHOD_12, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8168KD", ++ CFG_METHOD_13, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("Unknown", ++ CFG_METHOD_DEFAULT, ++ (RX_DMA_BURST_512 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_1k) ++}; ++#undef _R ++ ++ ++#ifndef PCI_VENDOR_ID_DLINK ++#define PCI_VENDOR_ID_DLINK 0x1186 ++#endif ++ ++static struct pci_device_id rtl8125_pci_tbl[] = { ++ { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8125), }, ++ { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8162), }, ++ { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x3000), }, ++ {0,}, ++}; ++ ++MODULE_DEVICE_TABLE(pci, rtl8125_pci_tbl); ++ ++static int use_dac = 1; ++static int timer_count = 0x2600; ++static int timer_count_v2 = (0x2600 / 0x100); ++ ++static struct { ++ u32 msg_enable; ++} debug = { -1 }; ++ ++static unsigned int speed_mode = SPEED_2500; ++static unsigned int duplex_mode = DUPLEX_FULL; ++static unsigned int autoneg_mode = AUTONEG_ENABLE; ++#ifdef CONFIG_ASPM ++static int aspm = 1; ++#else ++static int aspm = 0; ++#endif ++#ifdef ENABLE_S5WOL ++static int s5wol = 1; ++#else ++static int s5wol = 0; ++#endif ++#ifdef ENABLE_S5_KEEP_CURR_MAC ++static int s5_keep_curr_mac = 1; ++#else ++static int s5_keep_curr_mac = 0; ++#endif ++#ifdef ENABLE_EEE ++static int eee_enable = 1; ++#else ++static int eee_enable = 0; ++#endif ++#ifdef CONFIG_SOC_LAN ++static ulong hwoptimize = HW_PATCH_SOC_LAN; ++#else ++static ulong hwoptimize = 0; ++#endif ++#ifdef ENABLE_S0_MAGIC_PACKET ++static int s0_magic_packet = 1; ++#else ++static int s0_magic_packet = 0; ++#endif ++#ifdef ENABLE_TX_NO_CLOSE ++static int tx_no_close_enable = 1; ++#else ++static int tx_no_close_enable = 0; ++#endif ++#ifdef ENABLE_PTP_MASTER_MODE ++static int enable_ptp_master_mode = 1; ++#else ++static int enable_ptp_master_mode = 0; ++#endif ++#ifdef DISABLE_WOL_SUPPORT ++static int disable_wol_support = 1; ++#else ++static int disable_wol_support = 0; ++#endif ++#ifdef ENABLE_DOUBLE_VLAN ++static int enable_double_vlan = 1; ++#else ++static int enable_double_vlan = 0; ++#endif ++#ifdef ENABLE_GIGA_LITE ++static int eee_giga_lite = 1; ++#else ++static int eee_giga_lite = 0; ++#endif ++ ++MODULE_AUTHOR("Realtek and the Linux r8125 crew "); 
++MODULE_DESCRIPTION("Realtek r8125 Ethernet controller driver"); ++ ++module_param(speed_mode, uint, 0); ++MODULE_PARM_DESC(speed_mode, "force phy operation. Deprecated by ethtool (8)."); ++ ++module_param(duplex_mode, uint, 0); ++MODULE_PARM_DESC(duplex_mode, "force phy operation. Deprecated by ethtool (8)."); ++ ++module_param(autoneg_mode, uint, 0); ++MODULE_PARM_DESC(autoneg_mode, "force phy operation. Deprecated by ethtool (8)."); ++ ++module_param(aspm, int, 0); ++MODULE_PARM_DESC(aspm, "Enable ASPM."); ++ ++module_param(s5wol, int, 0); ++MODULE_PARM_DESC(s5wol, "Enable Shutdown Wake On Lan."); ++ ++module_param(s5_keep_curr_mac, int, 0); ++MODULE_PARM_DESC(s5_keep_curr_mac, "Enable Shutdown Keep Current MAC Address."); ++ ++module_param(use_dac, int, 0); ++MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot."); ++ ++module_param(timer_count, int, 0); ++MODULE_PARM_DESC(timer_count, "Timer Interrupt Interval."); ++ ++module_param(eee_enable, int, 0); ++MODULE_PARM_DESC(eee_enable, "Enable Energy Efficient Ethernet."); ++ ++module_param(hwoptimize, ulong, 0); ++MODULE_PARM_DESC(hwoptimize, "Enable HW optimization function."); ++ ++module_param(s0_magic_packet, int, 0); ++MODULE_PARM_DESC(s0_magic_packet, "Enable S0 Magic Packet."); ++ ++module_param(tx_no_close_enable, int, 0); ++MODULE_PARM_DESC(tx_no_close_enable, "Enable TX No Close."); ++ ++module_param(enable_ptp_master_mode, int, 0); ++MODULE_PARM_DESC(enable_ptp_master_mode, "Enable PTP Master Mode."); ++ ++module_param(disable_wol_support, int, 0); ++MODULE_PARM_DESC(disable_wol_support, "Disable PM support."); ++ ++module_param(enable_double_vlan, int, 0); ++MODULE_PARM_DESC(enable_double_vlan, "Enable Double VLAN."); ++ ++module_param(eee_giga_lite, int, 0); ++MODULE_PARM_DESC(eee_giga_lite, "Enable Giga Lite."); ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++module_param_named(debug, debug.msg_enable, int, 0); ++MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., 16=all)"); ++#endif//LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ ++MODULE_LICENSE("GPL"); ++#ifdef ENABLE_USE_FIRMWARE_FILE ++MODULE_FIRMWARE(FIRMWARE_8125A_3); ++MODULE_FIRMWARE(FIRMWARE_8125B_2); ++MODULE_FIRMWARE(FIRMWARE_8125BP_1); ++MODULE_FIRMWARE(FIRMWARE_8125BP_2); ++MODULE_FIRMWARE(FIRMWARE_8125D_1); ++MODULE_FIRMWARE(FIRMWARE_8125D_2); ++MODULE_FIRMWARE(FIRMWARE_8125CP_1); ++#endif ++ ++MODULE_VERSION(RTL8125_VERSION); ++ ++/* ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++static void rtl8125_esd_timer(unsigned long __opaque); ++#else ++static void rtl8125_esd_timer(struct timer_list *t); ++#endif ++*/ ++/* ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++static void rtl8125_link_timer(unsigned long __opaque); ++#else ++static void rtl8125_link_timer(struct timer_list *t); ++#endif ++*/ ++ ++static netdev_tx_t rtl8125_start_xmit(struct sk_buff *skb, struct net_device *dev); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++static irqreturn_t rtl8125_interrupt(int irq, void *dev_instance, struct pt_regs *regs); ++#else ++static irqreturn_t rtl8125_interrupt(int irq, void *dev_instance); ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++static irqreturn_t rtl8125_interrupt_msix(int irq, void *dev_instance, struct pt_regs *regs); ++#else ++static irqreturn_t rtl8125_interrupt_msix(int irq, void *dev_instance); ++#endif ++static void rtl8125_set_rx_mode(struct net_device *dev); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) ++static void rtl8125_tx_timeout(struct net_device *dev, unsigned int 
txqueue); ++#else ++static void rtl8125_tx_timeout(struct net_device *dev); ++#endif ++static int rtl8125_rx_interrupt(struct net_device *, struct rtl8125_private *, struct rtl8125_rx_ring *, napi_budget); ++static int rtl8125_tx_interrupt(struct rtl8125_tx_ring *ring, int budget); ++static int rtl8125_tx_interrupt_with_vector(struct rtl8125_private *tp, const int message_id, int budget); ++static void rtl8125_wait_for_quiescence(struct net_device *dev); ++static int rtl8125_change_mtu(struct net_device *dev, int new_mtu); ++static void rtl8125_down(struct net_device *dev); ++ ++static int rtl8125_set_mac_address(struct net_device *dev, void *p); ++static void rtl8125_rar_set(struct rtl8125_private *tp, const u8 *addr); ++static void rtl8125_desc_addr_fill(struct rtl8125_private *); ++static void rtl8125_tx_desc_init(struct rtl8125_private *tp); ++static void rtl8125_rx_desc_init(struct rtl8125_private *tp); ++ ++static u16 rtl8125_get_hw_phy_mcu_code_ver(struct rtl8125_private *tp); ++static void rtl8125_phy_power_up(struct net_device *dev); ++static void rtl8125_phy_power_down(struct net_device *dev); ++static int rtl8125_set_speed(struct net_device *dev, u8 autoneg, u32 speed, u8 duplex, u64 adv); ++static bool rtl8125_set_phy_mcu_patch_request(struct rtl8125_private *tp); ++static bool rtl8125_clear_phy_mcu_patch_request(struct rtl8125_private *tp); ++ ++#ifdef CONFIG_R8125_NAPI ++static int rtl8125_poll(napi_ptr napi, napi_budget budget); ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_reset_task(void *_data); ++static void rtl8125_esd_task(void *_data); ++static void rtl8125_linkchg_task(void *_data); ++static void rtl8125_link_task(void *_data); ++static void rtl8125_dash_task(void *_data); ++#else ++static void rtl8125_reset_task(struct work_struct *work); ++static void rtl8125_esd_task(struct work_struct *work); ++static void rtl8125_linkchg_task(struct work_struct *work); ++static void rtl8125_link_task(struct work_struct *work); ++static void rtl8125_dash_task(struct work_struct *work); ++#endif ++static void rtl8125_schedule_reset_work(struct rtl8125_private *tp); ++static void rtl8125_schedule_esd_work(struct rtl8125_private *tp); ++static void rtl8125_schedule_linkchg_work(struct rtl8125_private *tp); ++static void rtl8125_schedule_link_work(struct rtl8125_private *tp); ++void rtl8125_schedule_dash_work(struct rtl8125_private *tp); ++static void rtl8125_init_all_schedule_work(struct rtl8125_private *tp); ++static void rtl8125_cancel_all_schedule_work(struct rtl8125_private *tp); ++ ++static inline struct device *tp_to_dev(struct rtl8125_private *tp) ++{ ++ return &tp->pci_dev->dev; ++} ++ ++#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) && \ ++ LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,00))) ++void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, ++ u32 legacy_u32) ++{ ++ bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); ++ dst[0] = legacy_u32; ++} ++ ++bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ++ const unsigned long *src) ++{ ++ bool retval = true; ++ ++ /* TODO: following test will soon always be true */ ++ if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { ++ __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); ++ ++ bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); ++ bitmap_fill(ext, 32); ++ bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); ++ if (bitmap_intersects(ext, src, ++ __ETHTOOL_LINK_MODE_MASK_NBITS)) { ++ /* src mask goes beyond bit 31 */ ++ retval = false; ++ } ++ } ++ *legacy_u32 = src[0]; ++ return 
retval; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++ ++#ifndef LPA_1000FULL ++#define LPA_1000FULL 0x0800 ++#endif ++ ++#ifndef LPA_1000HALF ++#define LPA_1000HALF 0x0400 ++#endif ++ ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) ++static inline void eth_hw_addr_random(struct net_device *dev) ++{ ++ random_ether_addr(dev->dev_addr); ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++#undef ethtool_ops ++#define ethtool_ops _kc_ethtool_ops ++ ++struct _kc_ethtool_ops { ++ int (*get_settings)(struct net_device *, struct ethtool_cmd *); ++ int (*set_settings)(struct net_device *, struct ethtool_cmd *); ++ void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); ++ int (*get_regs_len)(struct net_device *); ++ void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); ++ void (*get_wol)(struct net_device *, struct ethtool_wolinfo *); ++ int (*set_wol)(struct net_device *, struct ethtool_wolinfo *); ++ u32 (*get_msglevel)(struct net_device *); ++ void (*set_msglevel)(struct net_device *, u32); ++ int (*nway_reset)(struct net_device *); ++ u32 (*get_link)(struct net_device *); ++ int (*get_eeprom_len)(struct net_device *); ++ int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); ++ int (*set_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); ++ int (*get_coalesce)(struct net_device *, struct ethtool_coalesce *); ++ int (*set_coalesce)(struct net_device *, struct ethtool_coalesce *); ++ void (*get_ringparam)(struct net_device *, struct ethtool_ringparam *); ++ int (*set_ringparam)(struct net_device *, struct ethtool_ringparam *); ++ void (*get_pauseparam)(struct net_device *, ++ struct ethtool_pauseparam*); ++ int (*set_pauseparam)(struct net_device *, ++ struct ethtool_pauseparam*); ++ u32 (*get_rx_csum)(struct net_device *); ++ int (*set_rx_csum)(struct net_device *, u32); ++ u32 (*get_tx_csum)(struct net_device *); ++ int (*set_tx_csum)(struct net_device *, u32); ++ u32 (*get_sg)(struct net_device *); ++ int (*set_sg)(struct net_device *, u32); ++ u32 (*get_tso)(struct net_device *); ++ int (*set_tso)(struct net_device *, u32); ++ int (*self_test_count)(struct net_device *); ++ void (*self_test)(struct net_device *, struct ethtool_test *, u64 *); ++ void (*get_strings)(struct net_device *, u32 stringset, u8 *); ++ int (*phys_id)(struct net_device *, u32); ++ int (*get_stats_count)(struct net_device *); ++ void (*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, ++ u64 *); ++} *ethtool_ops = NULL; ++ ++#undef SET_ETHTOOL_OPS ++#define SET_ETHTOOL_OPS(netdev, ops) (ethtool_ops = (ops)) ++ ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0) ++#ifndef SET_ETHTOOL_OPS ++#define SET_ETHTOOL_OPS(netdev,ops) \ ++ ((netdev)->ethtool_ops = (ops)) ++#endif //SET_ETHTOOL_OPS ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0) ++ ++//#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,5) ++#ifndef netif_msg_init ++#define netif_msg_init _kc_netif_msg_init ++/* copied from linux kernel 2.6.20 include/linux/netdevice.h */ ++static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) ++{ ++ /* use default */ ++ if (debug_value < 0 || debug_value >= (sizeof(u32) * 8)) ++ return default_msg_enable_bits; ++ if (debug_value == 0) /* no output */ ++ return 0; ++ /* set low N bits */ ++ return (1 << debug_value) - 1; ++} ++ ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,5) ++ ++#if 
LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) ++static inline void eth_copy_and_sum (struct sk_buff *dest, ++ const unsigned char *src, ++ int len, int base) ++{ ++ skb_copy_to_linear_data(dest, src, len); ++} ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) ++/* copied from linux kernel 2.6.20 /include/linux/time.h */ ++/* Parameters used to convert the timespec values: */ ++#define MSEC_PER_SEC 1000L ++ ++/* copied from linux kernel 2.6.20 /include/linux/jiffies.h */ ++/* ++ * Change timeval to jiffies, trying to avoid the ++ * most obvious overflows.. ++ * ++ * And some not so obvious. ++ * ++ * Note that we don't want to return MAX_LONG, because ++ * for various timeout reasons we often end up having ++ * to wait "jiffies+1" in order to guarantee that we wait ++ * at _least_ "jiffies" - so "jiffies+1" had better still ++ * be positive. ++ */ ++#define MAX_JIFFY_OFFSET ((~0UL >> 1)-1) ++ ++/* ++ * Convert jiffies to milliseconds and back. ++ * ++ * Avoid unnecessary multiplications/divisions in the ++ * two most common HZ cases: ++ */ ++static inline unsigned int _kc_jiffies_to_msecs(const unsigned long j) ++{ ++#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) ++ return (MSEC_PER_SEC / HZ) * j; ++#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) ++ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); ++#else ++ return (j * MSEC_PER_SEC) / HZ; ++#endif ++} ++ ++static inline unsigned long _kc_msecs_to_jiffies(const unsigned int m) ++{ ++ if (m > _kc_jiffies_to_msecs(MAX_JIFFY_OFFSET)) ++ return MAX_JIFFY_OFFSET; ++#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) ++ return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); ++#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) ++ return m * (HZ / MSEC_PER_SEC); ++#else ++ return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; ++#endif ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) ++ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) ++ ++/* copied from linux kernel 2.6.12.6 /include/linux/pm.h */ ++typedef int __bitwise pci_power_t; ++ ++/* copied from linux kernel 2.6.12.6 /include/linux/pci.h */ ++typedef u32 __bitwise pm_message_t; ++ ++#define PCI_D0 ((pci_power_t __force) 0) ++#define PCI_D1 ((pci_power_t __force) 1) ++#define PCI_D2 ((pci_power_t __force) 2) ++#define PCI_D3hot ((pci_power_t __force) 3) ++#define PCI_D3cold ((pci_power_t __force) 4) ++#define PCI_POWER_ERROR ((pci_power_t __force) -1) ++ ++/* copied from linux kernel 2.6.12.6 /drivers/pci/pci.c */ ++/** ++ * pci_choose_state - Choose the power state of a PCI device ++ * @dev: PCI device to be suspended ++ * @state: target sleep state for the whole system. This is the value ++ * that is passed to suspend() function. ++ * ++ * Returns PCI power state suitable for given device and given system ++ * message. 
++ */ ++ ++pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) ++{ ++ if (!pci_find_capability(dev, PCI_CAP_ID_PM)) ++ return PCI_D0; ++ ++ switch (state) { ++ case 0: ++ return PCI_D0; ++ case 3: ++ return PCI_D3hot; ++ default: ++ printk("They asked me for state %d\n", state); ++// BUG(); ++ } ++ return PCI_D0; ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++/** ++ * msleep_interruptible - sleep waiting for waitqueue interruptions ++ * @msecs: Time in milliseconds to sleep for ++ */ ++#define msleep_interruptible _kc_msleep_interruptible ++unsigned long _kc_msleep_interruptible(unsigned int msecs) ++{ ++ unsigned long timeout = _kc_msecs_to_jiffies(msecs); ++ ++ while (timeout && !signal_pending(current)) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ timeout = schedule_timeout(timeout); ++ } ++ return _kc_jiffies_to_msecs(timeout); ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) ++/* copied from linux kernel 2.6.20 include/linux/sched.h */ ++#ifndef __sched ++#define __sched __attribute__((__section__(".sched.text"))) ++#endif ++ ++/* copied from linux kernel 2.6.20 kernel/timer.c */ ++signed long __sched schedule_timeout_uninterruptible(signed long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_timeout(timeout); ++} ++ ++/* copied from linux kernel 2.6.20 include/linux/mii.h */ ++#undef if_mii ++#define if_mii _kc_if_mii ++static inline struct mii_ioctl_data *if_mii(struct ifreq *rq) ++{ ++ return (struct mii_ioctl_data *) &rq->ifr_ifru; ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) ++ ++static u16 _rtl8125_read_thermal_sensor(struct rtl8125_private *tp) ++{ ++ u16 ts_digout; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ ts_digout = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD84); ++ ts_digout &= 0x3ff; ++ break; ++ default: ++ ts_digout = 0xffff; ++ break; ++ } ++ ++ return ts_digout; ++} ++ ++static int rtl8125_read_thermal_sensor(struct rtl8125_private *tp) ++{ ++ int tmp; ++ ++ tmp = _rtl8125_read_thermal_sensor(tp); ++ if (tmp > 512) ++ return (0 - ((512 - (tmp - 512)) / 2)); ++ else ++ return (tmp / 2); ++} ++ ++int rtl8125_dump_tally_counter(struct rtl8125_private *tp, dma_addr_t paddr) ++{ ++ u32 cmd; ++ u32 WaitCnt; ++ int retval = -1; ++ ++ RTL_W32(tp, CounterAddrHigh, (u64)paddr >> 32); ++ cmd = (u64)paddr & DMA_BIT_MASK(32); ++ RTL_W32(tp, CounterAddrLow, cmd); ++ RTL_W32(tp, CounterAddrLow, cmd | CounterDump); ++ ++ WaitCnt = 0; ++ while (RTL_R32(tp, CounterAddrLow) & CounterDump) { ++ udelay(10); ++ ++ WaitCnt++; ++ if (WaitCnt > 20) ++ break; ++ } ++ ++ if (WaitCnt <= 20) ++ retval = 0; ++ ++ return retval; ++} ++ ++static u32 ++rtl8125_get_hw_clo_ptr(struct rtl8125_tx_ring *ring) ++{ ++ struct rtl8125_private *tp = ring->priv; ++ ++ if (!tp) ++ return 0; ++ ++ switch (tp->HwSuppTxNoCloseVer) { ++ case 3: ++ return RTL_R16(tp, ring->hw_clo_ptr_reg); ++ case 4: ++ case 5: ++ case 6: ++ return RTL_R32(tp, ring->hw_clo_ptr_reg); ++ default: ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ WARN_ON(1); ++#endif ++ return 0; ++ } ++} ++ ++static u32 ++rtl8125_get_sw_tail_ptr(struct rtl8125_tx_ring *ring) ++{ ++ struct rtl8125_private *tp = ring->priv; ++ ++ if (!tp) ++ return 0; ++ ++ switch 
(tp->HwSuppTxNoCloseVer) { ++ case 3: ++ return RTL_R16(tp, ring->sw_tail_ptr_reg); ++ case 4: ++ case 5: ++ case 6: ++ return RTL_R32(tp, ring->sw_tail_ptr_reg); ++ default: ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ WARN_ON(1); ++#endif ++ return 0; ++ } ++} ++ ++static u32 ++rtl8125_get_phy_status(struct rtl8125_private *tp) ++{ ++ return RTL_R32(tp, PHYstatus); ++} ++ ++static bool ++rtl8125_sysfs_testmode_on(struct rtl8125_private *tp) ++{ ++#ifdef ENABLE_R8125_SYSFS ++ return !!tp->testmode; ++#else ++ return 1; ++#endif ++} ++ ++static u32 rtl8125_convert_link_speed(u32 status) ++{ ++ u32 speed = SPEED_UNKNOWN; ++ ++ if (status & LinkStatus) { ++ if (status & _2500bpsF) ++ speed = SPEED_2500; ++ else if (status & (_1000bpsF | _2500bpsL | _1000bpsL)) ++ speed = SPEED_1000; ++ else if (status & _100bps) ++ speed = SPEED_100; ++ else if (status & _10bps) ++ speed = SPEED_10; ++ } ++ ++ return speed; ++} ++ ++static void rtl8125_mdi_swap(struct rtl8125_private *tp) ++{ ++ int i; ++ u16 reg, val, mdi_reverse; ++ u16 tps_p0, tps_p1, tps_p2, tps_p3, tps_p3_p0; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ reg = 0x8284; ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ reg = 0x81aa; ++ break; ++ default: ++ return; ++ }; ++ ++ tps_p3_p0 = rtl8125_mac_ocp_read(tp, 0xD440) & 0xF000; ++ tps_p3 = !!(tps_p3_p0 & BIT_15); ++ tps_p2 = !!(tps_p3_p0 & BIT_14); ++ tps_p1 = !!(tps_p3_p0 & BIT_13); ++ tps_p0 = !!(tps_p3_p0 & BIT_12); ++ mdi_reverse = rtl8125_mac_ocp_read(tp, 0xD442); ++ ++ if ((mdi_reverse & BIT_5) && tps_p3_p0 == 0xA000) ++ return; ++ ++ if (!(mdi_reverse & BIT_5)) ++ val = tps_p0 << 8 | ++ tps_p1 << 9 | ++ tps_p2 << 10 | ++ tps_p3 << 11; ++ else ++ val = tps_p3 << 8 | ++ tps_p2 << 9 | ++ tps_p1 << 10 | ++ tps_p0 << 11; ++ ++ for (i=8; i<12; i++) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, reg); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ BIT(i), ++ val & BIT(i)); ++ } ++} ++ ++static int _rtl8125_vcd_test(struct rtl8125_private *tp) ++{ ++ u16 val; ++ u32 wait_cnt; ++ int ret = -1; ++ ++ rtl8125_mdi_swap(tp); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA422, BIT(0)); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA422, 0x00F0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA422, BIT(0)); ++ ++ wait_cnt = 0; ++ do { ++ mdelay(1); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA422); ++ wait_cnt++; ++ } while (!(val & BIT_15) && (wait_cnt < 5000)); ++ ++ if (wait_cnt == 5000) ++ goto exit; ++ ++ ret = 0; ++ ++exit: ++ return ret; ++} ++ ++static int rtl8125_vcd_test(struct rtl8125_private *tp, bool poe_mode) ++{ ++ int ret; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ /* update rtct threshold for poe mode */ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, poe_mode ? 0x0A44 : 0x0000); ++ ++ /* enable rtct poe mode */ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, poe_mode ? 
0x0100 : 0x0000); ++ ++ ret = _rtl8125_vcd_test(tp); ++ ++ /* disable rtct poe mode */ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ ++ /* restore rtct threshold */ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ break; ++ default: ++ ret = _rtl8125_vcd_test(tp); ++ break; ++ } ++ ++ return ret; ++} ++ ++static void rtl8125_get_cp_len(struct rtl8125_private *tp, ++ int cp_len[RTL8125_CP_NUM]) ++{ ++ int i; ++ u32 status; ++ int tmp_cp_len; ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus) { ++ if (status & _10bps) { ++ tmp_cp_len = -1; ++ } else if (status & (_100bps | _1000bpsF)) { ++ rtl8125_mdio_write(tp, 0x1f, 0x0a88); ++ tmp_cp_len = rtl8125_mdio_read(tp, 0x10); ++ } else if (status & _2500bpsF) { ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_mdio_write(tp, 0x1f, 0x0ac5); ++ tmp_cp_len = rtl8125_mdio_read(tp, 0x14); ++ tmp_cp_len >>= 4; ++ break; ++ default: ++ rtl8125_mdio_write(tp, 0x1f, 0x0acb); ++ tmp_cp_len = rtl8125_mdio_read(tp, 0x15); ++ tmp_cp_len >>= 2; ++ break; ++ } ++ } else ++ tmp_cp_len = 0; ++ } else ++ tmp_cp_len = 0; ++ ++ if (tmp_cp_len > 0) ++ tmp_cp_len &= 0xff; ++ for (i=0; i<RTL8125_CP_NUM; i++) ++ cp_len[i] = tmp_cp_len; ++ ++ for (i=0; i<RTL8125_CP_NUM; i++) ++ if (cp_len[i] > RTL8125_MAX_SUPPORT_CP_LEN) ++ cp_len[i] = RTL8125_MAX_SUPPORT_CP_LEN; ++ ++ return; ++} ++ ++static int __rtl8125_get_cp_status(u16 val) ++{ ++ switch (val) { ++ case 0x0060: ++ return rtl8125_cp_normal; ++ case 0x0048: ++ return rtl8125_cp_open; ++ case 0x0050: ++ return rtl8125_cp_short; ++ case 0x0042: ++ case 0x0044: ++ return rtl8125_cp_mismatch; ++ default: ++ return rtl8125_cp_normal; ++ } ++} ++ ++static int _rtl8125_get_cp_status(struct rtl8125_private *tp, u8 pair_num) ++{ ++ u16 val; ++ int cp_status = rtl8125_cp_unknown; ++ ++ if (pair_num > 3) ++ goto exit; ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8027 + 4 * pair_num); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA438); ++ ++ cp_status = __rtl8125_get_cp_status(val); ++ ++exit: ++ return cp_status; ++} ++ ++static const char * rtl8125_get_cp_status_string(int cp_status) ++{ ++ switch(cp_status) { ++ case rtl8125_cp_normal: ++ return "normal "; ++ case rtl8125_cp_short: ++ return "short "; ++ case rtl8125_cp_open: ++ return "open "; ++ case rtl8125_cp_mismatch: ++ return "mismatch"; ++ default: ++ return "unknown "; ++ } ++} ++ ++static u16 rtl8125_get_cp_pp(struct rtl8125_private *tp, u8 pair_num) ++{ ++ u16 pp = 0; ++ ++ if (pair_num > 3) ++ goto exit; ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8029 + 4 * pair_num); ++ pp = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA438); ++ ++ pp &= 0x3fff; ++ pp /= 80; ++ ++exit: ++ return pp; ++} ++ ++static void rtl8125_get_cp_status(struct rtl8125_private *tp, ++ int cp_status[RTL8125_CP_NUM], ++ bool poe_mode) ++{ ++ u32 status; ++ int i; ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus && !(status & (_10bps | _100bps))) { ++ for (i=0; i<RTL8125_CP_NUM; i++) ++ cp_status[i] = rtl8125_cp_normal; ++ } else { ++ rtl8125_vcd_test(tp, poe_mode); ++ ++ for (i=0; i<RTL8125_CP_NUM; i++) ++ cp_status[i] = _rtl8125_get_cp_status(tp, i); ++ } ++} ++ ++#ifdef ENABLE_R8125_PROCFS ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++static int proc_get_driver_variable(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ seq_puts(m, "\nDump Driver Variable\n"); ++ ++ rtnl_lock(); ++ ++ seq_puts(m, "Variable\tValue\n----------\t-----\n"); ++ seq_printf(m, "MODULENAME\t%s\n", MODULENAME); ++ seq_printf(m, "driver version\t%s\n", RTL8125_VERSION); ++ seq_printf(m, "mcfg\t%d\n", tp->mcfg); ++ seq_printf(m, "chipset\t%d\n", 
tp->chipset); ++ seq_printf(m, "chipset_name\t%s\n", rtl_chip_info[tp->chipset].name); ++ seq_printf(m, "mtu\t%d\n", dev->mtu); ++ seq_printf(m, "NUM_RX_DESC\t0x%x\n", tp->rx_ring[0].num_rx_desc); ++ seq_printf(m, "cur_rx0\t0x%x\n", tp->rx_ring[0].cur_rx); ++ seq_printf(m, "dirty_rx0\t0x%x\n", tp->rx_ring[0].dirty_rx); ++ seq_printf(m, "cur_rx1\t0x%x\n", tp->rx_ring[1].cur_rx); ++ seq_printf(m, "dirty_rx1\t0x%x\n", tp->rx_ring[1].dirty_rx); ++ seq_printf(m, "cur_rx2\t0x%x\n", tp->rx_ring[2].cur_rx); ++ seq_printf(m, "dirty_rx2\t0x%x\n", tp->rx_ring[2].dirty_rx); ++ seq_printf(m, "cur_rx3\t0x%x\n", tp->rx_ring[3].cur_rx); ++ seq_printf(m, "dirty_rx3\t0x%x\n", tp->rx_ring[3].dirty_rx); ++ seq_printf(m, "NUM_TX_DESC\t0x%x\n", tp->tx_ring[0].num_tx_desc); ++ seq_printf(m, "cur_tx0\t0x%x\n", tp->tx_ring[0].cur_tx); ++ seq_printf(m, "dirty_tx0\t0x%x\n", tp->tx_ring[0].dirty_tx); ++ seq_printf(m, "cur_tx1\t0x%x\n", tp->tx_ring[1].cur_tx); ++ seq_printf(m, "dirty_tx1\t0x%x\n", tp->tx_ring[1].dirty_tx); ++ seq_printf(m, "rx_buf_sz\t0x%x\n", tp->rx_buf_sz); ++#ifdef ENABLE_PAGE_REUSE ++ seq_printf(m, "rx_buf_page_order\t0x%x\n", tp->rx_buf_page_order); ++ seq_printf(m, "rx_buf_page_size\t0x%x\n", tp->rx_buf_page_size); ++ seq_printf(m, "page_reuse_fail_cnt\t0x%x\n", tp->page_reuse_fail_cnt); ++#endif //ENABLE_PAGE_REUSE ++ seq_printf(m, "esd_flag\t0x%x\n", tp->esd_flag); ++ seq_printf(m, "pci_cfg_is_read\t0x%x\n", tp->pci_cfg_is_read); ++ seq_printf(m, "rtl8125_rx_config\t0x%x\n", tp->rtl8125_rx_config); ++ seq_printf(m, "cp_cmd\t0x%x\n", tp->cp_cmd); ++ seq_printf(m, "intr_mask\t0x%x\n", tp->intr_mask); ++ seq_printf(m, "timer_intr_mask\t0x%x\n", tp->timer_intr_mask); ++ seq_printf(m, "wol_enabled\t0x%x\n", tp->wol_enabled); ++ seq_printf(m, "wol_opts\t0x%x\n", tp->wol_opts); ++ seq_printf(m, "efuse_ver\t0x%x\n", tp->efuse_ver); ++ seq_printf(m, "eeprom_type\t0x%x\n", tp->eeprom_type); ++ seq_printf(m, "autoneg\t0x%x\n", tp->autoneg); ++ seq_printf(m, "duplex\t0x%x\n", tp->duplex); ++ seq_printf(m, "speed\t%d\n", tp->speed); ++ seq_printf(m, "advertising\t0x%llx\n", tp->advertising); ++ seq_printf(m, "eeprom_len\t0x%x\n", tp->eeprom_len); ++ seq_printf(m, "cur_page\t0x%x\n", tp->cur_page); ++ seq_printf(m, "features\t0x%x\n", tp->features); ++ seq_printf(m, "org_pci_offset_99\t0x%x\n", tp->org_pci_offset_99); ++ seq_printf(m, "org_pci_offset_180\t0x%x\n", tp->org_pci_offset_180); ++ seq_printf(m, "issue_offset_99_event\t0x%x\n", tp->issue_offset_99_event); ++ seq_printf(m, "org_pci_offset_80\t0x%x\n", tp->org_pci_offset_80); ++ seq_printf(m, "org_pci_offset_81\t0x%x\n", tp->org_pci_offset_81); ++ seq_printf(m, "use_timer_interrupt\t0x%x\n", tp->use_timer_interrupt); ++ seq_printf(m, "HwIcVerUnknown\t0x%x\n", tp->HwIcVerUnknown); ++ seq_printf(m, "NotWrRamCodeToMicroP\t0x%x\n", tp->NotWrRamCodeToMicroP); ++ seq_printf(m, "NotWrMcuPatchCode\t0x%x\n", tp->NotWrMcuPatchCode); ++ seq_printf(m, "HwHasWrRamCodeToMicroP\t0x%x\n", tp->HwHasWrRamCodeToMicroP); ++ seq_printf(m, "sw_ram_code_ver\t0x%x\n", tp->sw_ram_code_ver); ++ seq_printf(m, "hw_ram_code_ver\t0x%x\n", tp->hw_ram_code_ver); ++ seq_printf(m, "rtk_enable_diag\t0x%x\n", tp->rtk_enable_diag); ++ seq_printf(m, "ShortPacketSwChecksum\t0x%x\n", tp->ShortPacketSwChecksum); ++ seq_printf(m, "UseSwPaddingShortPkt\t0x%x\n", tp->UseSwPaddingShortPkt); ++ seq_printf(m, "RequireAdcBiasPatch\t0x%x\n", tp->RequireAdcBiasPatch); ++ seq_printf(m, "AdcBiasPatchIoffset\t0x%x\n", tp->AdcBiasPatchIoffset); ++ seq_printf(m, 
"RequireAdjustUpsTxLinkPulseTiming\t0x%x\n", tp->RequireAdjustUpsTxLinkPulseTiming); ++ seq_printf(m, "SwrCnt1msIni\t0x%x\n", tp->SwrCnt1msIni); ++ seq_printf(m, "HwSuppNowIsOobVer\t0x%x\n", tp->HwSuppNowIsOobVer); ++ seq_printf(m, "HwFiberModeVer\t0x%x\n", tp->HwFiberModeVer); ++ seq_printf(m, "HwFiberStat\t0x%x\n", tp->HwFiberStat); ++ seq_printf(m, "HwSwitchMdiToFiber\t0x%x\n", tp->HwSwitchMdiToFiber); ++ seq_printf(m, "Led0\t0x%x\n", tp->BackupLedSel[0]); ++ seq_printf(m, "RequiredSecLanDonglePatch\t0x%x\n", tp->RequiredSecLanDonglePatch); ++ seq_printf(m, "RequiredPfmPatch\t0x%x\n", tp->RequiredPfmPatch); ++ seq_printf(m, "HwSuppDashVer\t0x%x\n", tp->HwSuppDashVer); ++ seq_printf(m, "DASH\t0x%x\n", tp->DASH); ++ seq_printf(m, "DashFirmwareVersion\t0x%x\n", tp->DashFirmwareVersion); ++ seq_printf(m, "HwSuppKCPOffloadVer\t0x%x\n", tp->HwSuppKCPOffloadVer); ++ seq_printf(m, "speed_mode\t0x%x\n", speed_mode); ++ seq_printf(m, "duplex_mode\t0x%x\n", duplex_mode); ++ seq_printf(m, "autoneg_mode\t0x%x\n", autoneg_mode); ++ seq_printf(m, "aspm\t0x%x\n", aspm); ++ seq_printf(m, "s5wol\t0x%x\n", s5wol); ++ seq_printf(m, "s5_keep_curr_mac\t0x%x\n", s5_keep_curr_mac); ++ seq_printf(m, "eee_enable\t0x%x\n", tp->eee.eee_enabled); ++ seq_printf(m, "hwoptimize\t0x%lx\n", hwoptimize); ++ seq_printf(m, "proc_init_num\t0x%x\n", proc_init_num); ++ seq_printf(m, "s0_magic_packet\t0x%x\n", s0_magic_packet); ++ seq_printf(m, "disable_wol_support\t0x%x\n", disable_wol_support); ++ seq_printf(m, "enable_double_vlan\t0x%x\n", enable_double_vlan); ++ seq_printf(m, "eee_giga_lite\t0x%x\n", eee_giga_lite); ++ seq_printf(m, "HwSuppMagicPktVer\t0x%x\n", tp->HwSuppMagicPktVer); ++ seq_printf(m, "HwSuppEsdVer\t0x%x\n", tp->HwSuppEsdVer); ++ seq_printf(m, "HwSuppLinkChgWakeUpVer\t0x%x\n", tp->HwSuppLinkChgWakeUpVer); ++ seq_printf(m, "HwSuppD0SpeedUpVer\t0x%x\n", tp->HwSuppD0SpeedUpVer); ++ seq_printf(m, "D0SpeedUpSpeed\t0x%x\n", tp->D0SpeedUpSpeed); ++ seq_printf(m, "HwSuppCheckPhyDisableModeVer\t0x%x\n", tp->HwSuppCheckPhyDisableModeVer); ++ seq_printf(m, "HwPkgDet\t0x%x\n", tp->HwPkgDet); ++ seq_printf(m, "HwSuppTxNoCloseVer\t0x%x\n", tp->HwSuppTxNoCloseVer); ++ seq_printf(m, "EnableTxNoClose\t0x%x\n", tp->EnableTxNoClose); ++ seq_printf(m, "NextHwDesCloPtr0\t0x%x\n", tp->tx_ring[0].NextHwDesCloPtr); ++ seq_printf(m, "BeginHwDesCloPtr0\t0x%x\n", tp->tx_ring[0].BeginHwDesCloPtr); ++ seq_printf(m, "hw_clo_ptr_reg0\t0x%x\n", rtl8125_get_hw_clo_ptr(&tp->tx_ring[0])); ++ seq_printf(m, "sw_tail_ptr_reg0\t0x%x\n", rtl8125_get_sw_tail_ptr(&tp->tx_ring[0])); ++ seq_printf(m, "NextHwDesCloPtr1\t0x%x\n", tp->tx_ring[1].NextHwDesCloPtr); ++ seq_printf(m, "BeginHwDesCloPtr1\t0x%x\n", tp->tx_ring[1].BeginHwDesCloPtr); ++ seq_printf(m, "hw_clo_ptr_reg1\t0x%x\n", rtl8125_get_hw_clo_ptr(&tp->tx_ring[1])); ++ seq_printf(m, "sw_tail_ptr_reg1\t0x%x\n", rtl8125_get_sw_tail_ptr(&tp->tx_ring[1])); ++ seq_printf(m, "InitRxDescType\t0x%x\n", tp->InitRxDescType); ++ seq_printf(m, "RxDescLength\t0x%x\n", tp->RxDescLength); ++ seq_printf(m, "num_rx_rings\t0x%x\n", tp->num_rx_rings); ++ seq_printf(m, "num_tx_rings\t0x%x\n", tp->num_tx_rings); ++ seq_printf(m, "tot_rx_rings\t0x%x\n", rtl8125_tot_rx_rings(tp)); ++ seq_printf(m, "tot_tx_rings\t0x%x\n", rtl8125_tot_tx_rings(tp)); ++ seq_printf(m, "HwSuppNumRxQueues\t0x%x\n", tp->HwSuppNumRxQueues); ++ seq_printf(m, "HwSuppNumTxQueues\t0x%x\n", tp->HwSuppNumTxQueues); ++ seq_printf(m, "EnableRss\t0x%x\n", tp->EnableRss); ++ seq_printf(m, "EnablePtp\t0x%x\n", tp->EnablePtp); ++ seq_printf(m, 
"ptp_master_mode\t0x%x\n", tp->ptp_master_mode); ++ seq_printf(m, "min_irq_nvecs\t0x%x\n", tp->min_irq_nvecs); ++ seq_printf(m, "irq_nvecs\t0x%x\n", tp->irq_nvecs); ++ seq_printf(m, "hw_supp_irq_nvecs\t0x%x\n", tp->hw_supp_irq_nvecs); ++ seq_printf(m, "ring_lib_enabled\t0x%x\n", tp->ring_lib_enabled); ++ seq_printf(m, "HwSuppIsrVer\t0x%x\n", tp->HwSuppIsrVer); ++ seq_printf(m, "HwCurrIsrVer\t0x%x\n", tp->HwCurrIsrVer); ++ seq_printf(m, "HwSuppMacMcuVer\t0x%x\n", tp->HwSuppMacMcuVer); ++ seq_printf(m, "MacMcuPageSize\t0x%x\n", tp->MacMcuPageSize); ++ seq_printf(m, "hw_mcu_patch_code_ver\t0x%llx\n", tp->hw_mcu_patch_code_ver); ++ seq_printf(m, "bin_mcu_patch_code_ver\t0x%llx\n", tp->bin_mcu_patch_code_ver); ++#ifdef ENABLE_PTP_SUPPORT ++ seq_printf(m, "tx_hwtstamp_timeouts\t0x%x\n", tp->tx_hwtstamp_timeouts); ++ seq_printf(m, "tx_hwtstamp_skipped\t0x%x\n", tp->tx_hwtstamp_skipped); ++#endif ++ seq_printf(m, "random_mac\t0x%x\n", tp->random_mac); ++ seq_printf(m, "org_mac_addr\t%pM\n", tp->org_mac_addr); ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ seq_printf(m, "perm_addr\t%pM\n", dev->perm_addr); ++#endif ++ seq_printf(m, "dev_addr\t%pM\n", dev->dev_addr); ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_tally_counter(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_counters *counters; ++ dma_addr_t paddr; ++ ++ seq_puts(m, "\nDump Tally Counter\n"); ++ ++ rtnl_lock(); ++ ++ counters = tp->tally_vaddr; ++ paddr = tp->tally_paddr; ++ if (!counters) { ++ seq_puts(m, "\nDump Tally Counter Fail\n"); ++ goto out_unlock; ++ } ++ ++ rtl8125_dump_tally_counter(tp, paddr); ++ ++ seq_puts(m, "Statistics\tValue\n----------\t-----\n"); ++ seq_printf(m, "tx_packets\t%lld\n", le64_to_cpu(counters->tx_packets)); ++ seq_printf(m, "rx_packets\t%lld\n", le64_to_cpu(counters->rx_packets)); ++ seq_printf(m, "tx_errors\t%lld\n", le64_to_cpu(counters->tx_errors)); ++ seq_printf(m, "rx_errors\t%d\n", le32_to_cpu(counters->rx_errors)); ++ seq_printf(m, "rx_missed\t%d\n", le16_to_cpu(counters->rx_missed)); ++ seq_printf(m, "align_errors\t%d\n", le16_to_cpu(counters->align_errors)); ++ seq_printf(m, "tx_one_collision\t%d\n", le32_to_cpu(counters->tx_one_collision)); ++ seq_printf(m, "tx_multi_collision\t%d\n", le32_to_cpu(counters->tx_multi_collision)); ++ seq_printf(m, "rx_unicast\t%lld\n", le64_to_cpu(counters->rx_unicast)); ++ seq_printf(m, "rx_broadcast\t%lld\n", le64_to_cpu(counters->rx_broadcast)); ++ seq_printf(m, "rx_multicast\t%d\n", le32_to_cpu(counters->rx_multicast)); ++ seq_printf(m, "tx_aborted\t%d\n", le16_to_cpu(counters->tx_aborted)); ++ seq_printf(m, "tx_underrun\t%d\n", le16_to_cpu(counters->tx_underrun)); ++ ++ seq_printf(m, "tx_octets\t%lld\n", le64_to_cpu(counters->tx_octets)); ++ seq_printf(m, "rx_octets\t%lld\n", le64_to_cpu(counters->rx_octets)); ++ seq_printf(m, "rx_multicast64\t%lld\n", le64_to_cpu(counters->rx_multicast64)); ++ seq_printf(m, "tx_unicast64\t%lld\n", le64_to_cpu(counters->tx_unicast64)); ++ seq_printf(m, "tx_broadcast64\t%lld\n", le64_to_cpu(counters->tx_broadcast64)); ++ seq_printf(m, "tx_multicast64\t%lld\n", le64_to_cpu(counters->tx_multicast64)); ++ seq_printf(m, "tx_pause_on\t%d\n", le32_to_cpu(counters->tx_pause_on)); ++ seq_printf(m, "tx_pause_off\t%d\n", le32_to_cpu(counters->tx_pause_off)); ++ seq_printf(m, "tx_pause_all\t%d\n", le32_to_cpu(counters->tx_pause_all)); ++ seq_printf(m, "tx_deferred\t%d\n", 
le32_to_cpu(counters->tx_deferred)); ++ seq_printf(m, "tx_late_collision\t%d\n", le32_to_cpu(counters->tx_late_collision)); ++ seq_printf(m, "tx_all_collision\t%d\n", le32_to_cpu(counters->tx_all_collision)); ++ seq_printf(m, "tx_aborted32\t%d\n", le32_to_cpu(counters->tx_aborted32)); ++ seq_printf(m, "align_errors32\t%d\n", le32_to_cpu(counters->align_errors32)); ++ seq_printf(m, "rx_frame_too_long\t%d\n", le32_to_cpu(counters->rx_frame_too_long)); ++ seq_printf(m, "rx_runt\t%d\n", le32_to_cpu(counters->rx_runt)); ++ seq_printf(m, "rx_pause_on\t%d\n", le32_to_cpu(counters->rx_pause_on)); ++ seq_printf(m, "rx_pause_off\t%d\n", le32_to_cpu(counters->rx_pause_off)); ++ seq_printf(m, "rx_pause_all\t%d\n", le32_to_cpu(counters->rx_pause_all)); ++ seq_printf(m, "rx_unknown_opcode\t%d\n", le32_to_cpu(counters->rx_unknown_opcode)); ++ seq_printf(m, "rx_mac_error\t%d\n", le32_to_cpu(counters->rx_mac_error)); ++ seq_printf(m, "tx_underrun32\t%d\n", le32_to_cpu(counters->tx_underrun32)); ++ seq_printf(m, "rx_mac_missed\t%d\n", le32_to_cpu(counters->rx_mac_missed)); ++ seq_printf(m, "rx_tcam_dropped\t%d\n", le32_to_cpu(counters->rx_tcam_dropped)); ++ seq_printf(m, "tdu\t%d\n", le32_to_cpu(counters->tdu)); ++ seq_printf(m, "rdu\t%d\n", le32_to_cpu(counters->rdu)); ++ ++ seq_putc(m, '\n'); ++ ++out_unlock: ++ rtnl_unlock(); ++ ++ return 0; ++} ++ ++static int proc_get_registers(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_MAC_REGS_SIZE; ++ u8 byte_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ ++ seq_puts(m, "\nDump MAC Registers\n"); ++ seq_puts(m, "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ max = 0xB00; ++ for (n = 0xA00; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ max = 0xD40; ++ for (n = 0xD00; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ max = 0x2840; ++ for (n = 0x2800; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_all_registers(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max; ++ u8 byte_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ struct pci_dev *pdev = tp->pci_dev; ++ ++ seq_puts(m, "\nDump All MAC Registers\n"); ++ seq_puts(m, "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ max = pci_resource_len(pdev, 2); ++ max = min(max, 0x8000); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ seq_printf(m, "\nTotal length:0x%X", max); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_pcie_phy(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_EPHY_REGS_SIZE/2; ++ u16 word_rd; ++ 
struct rtl8125_private *tp = netdev_priv(dev); ++ ++ seq_puts(m, "\nDump PCIE PHY\n"); ++ seq_puts(m, "\nOffset\tValue\n------\t-----\n "); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%02x:\t", n); ++ ++ for (i = 0; i < 8 && n < max; i++, n++) { ++ word_rd = rtl8125_ephy_read(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_eth_phy(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_PHY_REGS_SIZE/2; ++ u16 word_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++ seq_puts(m, "\nDump Ethernet PHY\n"); ++ seq_puts(m, "\nOffset\tValue\n------\t-----\n "); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ seq_puts(m, "\n####################page 0##################\n "); ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%02x:\t", n); ++ ++ for (i = 0; i < 8 && n < max; i++, n++) { ++ word_rd = rtl8125_mdio_read(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ } ++ ++ seq_puts(m, "\n####################extra reg##################\n "); ++ n = 0xA400; ++ seq_printf(m, "\n0x%02x:\t", n); ++ for (i = 0; i < 8; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ ++ n = 0xA410; ++ seq_printf(m, "\n0x%02x:\t", n); ++ for (i = 0; i < 3; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ ++ n = 0xA434; ++ seq_printf(m, "\n0x%02x:\t", n); ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ ++ n = 0xA5D0; ++ seq_printf(m, "\n0x%02x:\t", n); ++ for (i = 0; i < 4; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ ++ n = 0xA61A; ++ seq_printf(m, "\n0x%02x:\t", n); ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ ++ n = 0xA6D0; ++ seq_printf(m, "\n0x%02x:\t", n); ++ for (i = 0; i < 3; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_extended_registers(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_ERI_REGS_SIZE; ++ u32 dword_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ seq_puts(m, "\nDump Extended Registers\n"); ++ seq_puts(m, "\nOffset\tValue\n------\t-----\n "); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%02x:\t", n); ++ ++ for (i = 0; i < 4 && n < max; i++, n+=4) { ++ dword_rd = rtl8125_eri_read(tp, n, 4, ERIAR_ExGMAC); ++ seq_printf(m, "%08x ", dword_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_pci_registers(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_PCI_REGS_SIZE; ++ u32 dword_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ seq_puts(m, "\nDump PCI Registers\n"); ++ seq_puts(m, "\nOffset\tValue\n------\t-----\n "); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%03x:\t", n); ++ ++ for (i = 0; i < 4 && n < max; i++, n+=4) { ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ seq_printf(m, "%08x ", dword_rd); ++ } ++ } ++ ++ n = 0x110; ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ 
seq_printf(m, "\n0x%03x:\t%08x ", n, dword_rd); ++ n = 0x70c; ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ seq_printf(m, "\n0x%03x:\t%08x ", n, dword_rd); ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_temperature(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int cel, fah; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ seq_puts(m, "\nChip Temperature\n"); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ rtnl_lock(); ++ ++ if (!rtl8125_sysfs_testmode_on(tp)) { ++ seq_puts(m, "\nPlease turn on ""/sys/class/net//rtk_adv/testmode"".\n\n"); ++ rtnl_unlock(); ++ return 0; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ netif_testing_on(dev); ++ cel = rtl8125_read_thermal_sensor(tp); ++ netif_testing_off(dev); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ rtnl_unlock(); ++ ++ fah = rtl8125_cel_to_fah(cel); ++ ++ seq_printf(m, "Cel:%d\n", cel); ++ seq_printf(m, "Fah:%d\n", fah); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int _proc_get_cable_info(struct seq_file *m, void *v, bool poe_mode) ++{ ++ int i; ++ u32 status; ++ int cp_status[RTL8125_CP_NUM]; ++ int cp_len[RTL8125_CP_NUM] = {0}; ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ const char *pair_str[RTL8125_CP_NUM] = {"1-2", "3-6", "4-5", "7-8"}; ++ unsigned long flags; ++ int ret; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2 ... CFG_METHOD_7: ++ /* support */ ++ break; ++ default: ++ ret = -EOPNOTSUPP; ++ goto error_out; ++ } ++ ++ rtnl_lock(); ++ ++ if (!rtl8125_sysfs_testmode_on(tp)) { ++ seq_puts(m, "\nPlease turn on ""/sys/class/net//rtk_adv/testmode"".\n\n"); ++ ret = 0; ++ goto error_unlock; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ if (rtl8125_mdio_read(tp, MII_BMCR) & BMCR_PDOWN) { ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ret = -EIO; ++ goto error_unlock; ++ } ++ ++ netif_testing_on(dev); ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus) ++ seq_printf(m, "\nlink speed:%d", ++ rtl8125_convert_link_speed(status)); ++ else ++ seq_puts(m, "\nlink status:off"); ++ ++ rtl8125_get_cp_len(tp, cp_len); ++ ++ rtl8125_get_cp_status(tp, cp_status, poe_mode); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ seq_puts(m, "\npair\tlength\tstatus \tpp\n"); ++ ++ for (i=0; iprivate; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ ++ if (!ring) ++ continue; ++ ++ seq_printf(m, "\ndump rx %d desc:%d\n", i, ring->num_rx_desc); ++ ++ _proc_dump_desc(m, (void*)ring->RxDescArray, ring->RxDescAllocSize); ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ if (rtl8125_num_lib_rx_rings(tp) > 0) { ++ for (i = 0; i < tp->HwSuppNumRxQueues; i++) { ++ struct rtl8125_ring *lib_ring = &tp->lib_rx_ring[i]; ++ if (lib_ring->enabled) { ++ seq_printf(m, "\ndump lib rx %d desc:%d\n", i, ++ lib_ring->ring_size); ++ _proc_dump_desc(m, (void*)lib_ring->desc_addr, ++ lib_ring->desc_size); ++ } ++ } ++ } ++#endif //ENABLE_LIB_SUPPORT ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_dump_tx_desc(struct seq_file *m, void *v) ++{ 
++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ ++ if (!ring) ++ continue; ++ ++ seq_printf(m, "\ndump tx %d desc:%d\n", i, ring->num_tx_desc); ++ ++ _proc_dump_desc(m, (void*)ring->TxDescArray, ring->TxDescAllocSize); ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ if (rtl8125_num_lib_tx_rings(tp) > 0) { ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ struct rtl8125_ring *lib_ring = &tp->lib_tx_ring[i]; ++ if (lib_ring->enabled) { ++ seq_printf(m, "\ndump lib tx %d desc:%d\n", i, ++ lib_ring->ring_size); ++ _proc_dump_desc(m, (void*)lib_ring->desc_addr, ++ lib_ring->desc_size); ++ } ++ } ++ } ++#endif //ENABLE_LIB_SUPPORT ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_dump_msix_tbl(struct seq_file *m, void *v) ++{ ++ int i, j; ++ void __iomem *ioaddr; ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ /* ioremap MMIO region */ ++ ioaddr = ioremap(pci_resource_start(tp->pci_dev, 4), pci_resource_len(tp->pci_dev, 4)); ++ if (!ioaddr) ++ return -EFAULT; ++ ++ rtnl_lock(); ++ ++ seq_printf(m, "\ndump MSI-X Table. Total Entry %d. \n", tp->hw_supp_irq_nvecs); ++ ++ for (i=0; i<tp->hw_supp_irq_nvecs; i++) { ++ seq_printf(m, "\n%04x ", i); ++ for (j=0; j<4; j++) ++ seq_printf(m, "%08x ", ++ readl(ioaddr + i*0x10 + 4*j)); ++ } ++ ++ rtnl_unlock(); ++ ++ iounmap(ioaddr); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++#else //LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ ++static int proc_get_driver_variable(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump Driver Driver\n"); ++ ++ rtnl_lock(); ++ ++ len += snprintf(page + len, count - len, ++ "Variable\tValue\n----------\t-----\n"); ++ ++ len += snprintf(page + len, count - len, ++ "MODULENAME\t%s\n" ++ "driver version\t%s\n" ++ "mcfg\t%d\n" ++ "chipset\t%d\n" ++ "chipset_name\t%s\n" ++ "mtu\t%d\n" ++ "NUM_RX_DESC\t0x%x\n" ++ "cur_rx0\t0x%x\n" ++ "dirty_rx0\t0x%x\n" ++ "cur_rx1\t0x%x\n" ++ "dirty_rx1\t0x%x\n" ++ "cur_rx2\t0x%x\n" ++ "dirty_rx2\t0x%x\n" ++ "cur_rx3\t0x%x\n" ++ "dirty_rx3\t0x%x\n" ++ "NUM_TX_DESC\t0x%x\n" ++ "cur_tx0\t0x%x\n" ++ "dirty_tx0\t0x%x\n" ++ "cur_tx1\t0x%x\n" ++ "dirty_tx1\t0x%x\n" ++ "rx_buf_sz\t0x%x\n" ++#ifdef ENABLE_PAGE_REUSE ++ "rx_buf_page_order\t0x%x\n" ++ "rx_buf_page_size\t0x%x\n" ++ "page_reuse_fail_cnt\t0x%x\n" ++#endif //ENABLE_PAGE_REUSE ++ "esd_flag\t0x%x\n" ++ "pci_cfg_is_read\t0x%x\n" ++ "rtl8125_rx_config\t0x%x\n" ++ "cp_cmd\t0x%x\n" ++ "intr_mask\t0x%x\n" ++ "timer_intr_mask\t0x%x\n" ++ "wol_enabled\t0x%x\n" ++ "wol_opts\t0x%x\n" ++ "efuse_ver\t0x%x\n" ++ "eeprom_type\t0x%x\n" ++ "autoneg\t0x%x\n" ++ "duplex\t0x%x\n" ++ "speed\t%d\n" ++ "advertising\t0x%llx\n" ++ "eeprom_len\t0x%x\n" ++ "cur_page\t0x%x\n" ++ "features\t0x%x\n" ++ "org_pci_offset_99\t0x%x\n" ++ "org_pci_offset_180\t0x%x\n" ++ "issue_offset_99_event\t0x%x\n" ++ "org_pci_offset_80\t0x%x\n" ++ "org_pci_offset_81\t0x%x\n" ++ "use_timer_interrupt\t0x%x\n" ++ "HwIcVerUnknown\t0x%x\n" ++ "NotWrRamCodeToMicroP\t0x%x\n" ++ "NotWrMcuPatchCode\t0x%x\n" ++ "HwHasWrRamCodeToMicroP\t0x%x\n" ++ "sw_ram_code_ver\t0x%x\n" ++ "hw_ram_code_ver\t0x%x\n" ++ "rtk_enable_diag\t0x%x\n" ++ "ShortPacketSwChecksum\t0x%x\n" ++ 
"UseSwPaddingShortPkt\t0x%x\n" ++ "RequireAdcBiasPatch\t0x%x\n" ++ "AdcBiasPatchIoffset\t0x%x\n" ++ "RequireAdjustUpsTxLinkPulseTiming\t0x%x\n" ++ "SwrCnt1msIni\t0x%x\n" ++ "HwSuppNowIsOobVer\t0x%x\n" ++ "HwFiberModeVer\t0x%x\n" ++ "HwFiberStat\t0x%x\n" ++ "HwSwitchMdiToFiber\t0x%x\n" ++ "Led0\t0x%x\n" ++ "RequiredSecLanDonglePatch\t0x%x\n" ++ "RequiredPfmPatch\t0x%x\n" ++ "HwSuppDashVer\t0x%x\n" ++ "DASH\t0x%x\n" ++ "DashFirmwareVersion\t0x%x\n" ++ "HwSuppKCPOffloadVer\t0x%x\n" ++ "speed_mode\t0x%x\n" ++ "duplex_mode\t0x%x\n" ++ "autoneg_mode\t0x%x\n" ++ "aspm\t0x%x\n" ++ "s5wol\t0x%x\n" ++ "s5_keep_curr_mac\t0x%x\n" ++ "eee_enable\t0x%x\n" ++ "hwoptimize\t0x%lx\n" ++ "proc_init_num\t0x%x\n" ++ "s0_magic_packet\t0x%x\n" ++ "disable_wol_support\t0x%x\n" ++ "enable_double_vlan\t0x%x\n" ++ "eee_giga_lite\t0x%x\n" ++ "HwSuppMagicPktVer\t0x%x\n" ++ "HwSuppEsdVer\t0x%x\n" ++ "HwSuppLinkChgWakeUpVer\t0x%x\n" ++ "HwSuppD0SpeedUpVer\t0x%x\n" ++ "D0SpeedUpSpeed\t0x%x\n" ++ "HwSuppCheckPhyDisableModeVer\t0x%x\n" ++ "HwPkgDet\t0x%x\n" ++ "HwSuppTxNoCloseVer\t0x%x\n" ++ "EnableTxNoClose\t0x%x\n" ++ "NextHwDesCloPtr0\t0x%x\n" ++ "BeginHwDesCloPtr0\t0x%x\n" ++ "hw_clo_ptr_reg0\t0x%x\n" ++ "sw_tail_ptr_reg0\t0x%x\n" ++ "NextHwDesCloPtr1\t0x%x\n" ++ "BeginHwDesCloPtr1\t0x%x\n" ++ "hw_clo_ptr_reg1\t0x%x\n" ++ "sw_tail_ptr_reg1\t0x%x\n" ++ "InitRxDescType\t0x%x\n" ++ "RxDescLength\t0x%x\n" ++ "num_rx_rings\t0x%x\n" ++ "num_tx_rings\t0x%x\n" ++ "tot_rx_rings\t0x%x\n" ++ "tot_tx_rings\t0x%x\n" ++ "HwSuppNumRxQueues\t0x%x\n" ++ "HwSuppNumTxQueues\t0x%x\n" ++ "EnableRss\t0x%x\n" ++ "EnablePtp\t0x%x\n" ++ "ptp_master_mode\t0x%x\n" ++ "min_irq_nvecs\t0x%x\n" ++ "irq_nvecs\t0x%x\n" ++ "hw_supp_irq_nvecs\t0x%x\n" ++ "ring_lib_enabled\t0x%x\n" ++ "HwSuppIsrVer\t0x%x\n" ++ "HwCurrIsrVer\t0x%x\n" ++ "HwSuppMacMcuVer\t0x%x\n" ++ "MacMcuPageSize\t0x%x\n" ++ "hw_mcu_patch_code_ver\t0x%llx\n" ++ "bin_mcu_patch_code_ver\t0x%llx\n" ++#ifdef ENABLE_PTP_SUPPORT ++ "tx_hwtstamp_timeouts\t0x%x\n" ++ "tx_hwtstamp_skipped\t0x%x\n" ++#endif ++ "random_mac\t0x%x\n" ++ "org_mac_addr\t%pM\n" ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ "perm_addr\t%pM\n" ++#endif ++ "dev_addr\t%pM\n", ++ MODULENAME, ++ RTL8125_VERSION, ++ tp->mcfg, ++ tp->chipset, ++ rtl_chip_info[tp->chipset].name, ++ dev->mtu, ++ tp->rx_ring[0].num_rx_desc, ++ tp->rx_ring[0].cur_rx, ++ tp->rx_ring[0].dirty_rx, ++ tp->rx_ring[1].cur_rx, ++ tp->rx_ring[1].dirty_rx, ++ tp->rx_ring[2].cur_rx, ++ tp->rx_ring[2].dirty_rx, ++ tp->rx_ring[3].cur_rx, ++ tp->rx_ring[3].dirty_rx, ++ tp->tx_ring[0].num_tx_desc, ++ tp->tx_ring[0].cur_tx, ++ tp->tx_ring[0].dirty_tx, ++ tp->tx_ring[1].cur_tx, ++ tp->tx_ring[1].dirty_tx, ++ tp->rx_buf_sz, ++#ifdef ENABLE_PAGE_REUSE ++ tp->rx_buf_page_order, ++ tp->rx_buf_page_size, ++ tp->page_reuse_fail_cnt, ++#endif //ENABLE_PAGE_REUSE ++ tp->esd_flag, ++ tp->pci_cfg_is_read, ++ tp->rtl8125_rx_config, ++ tp->cp_cmd, ++ tp->intr_mask, ++ tp->timer_intr_mask, ++ tp->wol_enabled, ++ tp->wol_opts, ++ tp->efuse_ver, ++ tp->eeprom_type, ++ tp->autoneg, ++ tp->duplex, ++ tp->speed, ++ tp->advertising, ++ tp->eeprom_len, ++ tp->cur_page, ++ tp->features, ++ tp->org_pci_offset_99, ++ tp->org_pci_offset_180, ++ tp->issue_offset_99_event, ++ tp->org_pci_offset_80, ++ tp->org_pci_offset_81, ++ tp->use_timer_interrupt, ++ tp->HwIcVerUnknown, ++ tp->NotWrRamCodeToMicroP, ++ tp->NotWrMcuPatchCode, ++ tp->HwHasWrRamCodeToMicroP, ++ tp->sw_ram_code_ver, ++ tp->hw_ram_code_ver, ++ tp->rtk_enable_diag, ++ tp->ShortPacketSwChecksum, ++ 
tp->UseSwPaddingShortPkt, ++ tp->RequireAdcBiasPatch, ++ tp->AdcBiasPatchIoffset, ++ tp->RequireAdjustUpsTxLinkPulseTiming, ++ tp->SwrCnt1msIni, ++ tp->HwSuppNowIsOobVer, ++ tp->HwFiberModeVer, ++ tp->HwFiberStat, ++ tp->HwSwitchMdiToFiber, ++ tp->BackupLedSel[0], ++ tp->RequiredSecLanDonglePatch, ++ tp->RequiredPfmPatch, ++ tp->HwSuppDashVer, ++ tp->DASH, ++ tp->DashFirmwareVersion, ++ tp->HwSuppKCPOffloadVer, ++ speed_mode, ++ duplex_mode, ++ autoneg_mode, ++ aspm, ++ s5wol, ++ s5_keep_curr_mac, ++ tp->eee.eee_enabled, ++ hwoptimize, ++ proc_init_num, ++ s0_magic_packet, ++ disable_wol_support, ++ enable_double_vlan, ++ eee_giga_lite, ++ tp->HwSuppMagicPktVer, ++ tp->HwSuppEsdVer, ++ tp->HwSuppLinkChgWakeUpVer, ++ tp->HwSuppD0SpeedUpVer, ++ tp->D0SpeedUpSpeed, ++ tp->HwSuppCheckPhyDisableModeVer, ++ tp->HwPkgDet, ++ tp->HwSuppTxNoCloseVer, ++ tp->EnableTxNoClose, ++ tp->tx_ring[0].NextHwDesCloPtr, ++ tp->tx_ring[0].BeginHwDesCloPtr, ++ rtl8125_get_hw_clo_ptr(&tp->tx_ring[0]), ++ rtl8125_get_sw_tail_ptr(&tp->tx_ring[0]), ++ tp->tx_ring[1].NextHwDesCloPtr, ++ tp->tx_ring[1].BeginHwDesCloPtr, ++ rtl8125_get_hw_clo_ptr(&tp->tx_ring[1]), ++ rtl8125_get_sw_tail_ptr(&tp->tx_ring[1]), ++ tp->InitRxDescType, ++ tp->RxDescLength, ++ tp->num_rx_rings, ++ tp->num_tx_rings, ++ rtl8125_tot_rx_rings(tp), ++ rtl8125_tot_tx_rings(tp), ++ tp->HwSuppNumRxQueues, ++ tp->HwSuppNumTxQueues, ++ tp->EnableRss, ++ tp->EnablePtp, ++ tp->ptp_master_mode, ++ tp->min_irq_nvecs, ++ tp->irq_nvecs, ++ tp->hw_supp_irq_nvecs, ++ tp->ring_lib_enabled, ++ tp->HwSuppIsrVer, ++ tp->HwCurrIsrVer, ++ tp->HwSuppMacMcuVer, ++ tp->MacMcuPageSize, ++ tp->hw_mcu_patch_code_ver, ++ tp->bin_mcu_patch_code_ver, ++#ifdef ENABLE_PTP_SUPPORT ++ tp->tx_hwtstamp_timeouts, ++ tp->tx_hwtstamp_skipped, ++#endif ++ tp->random_mac, ++ tp->org_mac_addr, ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ dev->perm_addr, ++#endif ++ dev->dev_addr); ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_tally_counter(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_counters *counters; ++ dma_addr_t paddr; ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump Tally Counter\n"); ++ ++ rtnl_lock(); ++ ++ counters = tp->tally_vaddr; ++ paddr = tp->tally_paddr; ++ if (!counters) { ++ len += snprintf(page + len, count - len, ++ "\nDump Tally Counter Fail\n"); ++ goto out_unlock; ++ } ++ ++ rtl8125_dump_tally_counter(tp, paddr); ++ ++ len += snprintf(page + len, count - len, ++ "Statistics\tValue\n----------\t-----\n"); ++ ++ len += snprintf(page + len, count - len, ++ "tx_packets\t%lld\n" ++ "rx_packets\t%lld\n" ++ "tx_errors\t%lld\n" ++ "rx_errors\t%d\n" ++ "rx_missed\t%d\n" ++ "align_errors\t%d\n" ++ "tx_one_collision\t%d\n" ++ "tx_multi_collision\t%d\n" ++ "rx_unicast\t%lld\n" ++ "rx_broadcast\t%lld\n" ++ "rx_multicast\t%d\n" ++ "tx_aborted\t%d\n" ++ "tx_underrun\t%d\n" ++ ++ "tx_octets\t%lld\n" ++ "rx_octets\t%lld\n" ++ "rx_multicast64\t%lld\n" ++ "tx_unicast64\t%lld\n" ++ "tx_broadcast64\t%lld\n" ++ "tx_multicast64\t%lld\n" ++ "tx_pause_on\t%d\n" ++ "tx_pause_off\t%d\n" ++ "tx_pause_all\t%d\n" ++ "tx_deferred\t%d\n" ++ "tx_late_collision\t%d\n" ++ "tx_all_collision\t%d\n" ++ "tx_aborted32\t%d\n" ++ "align_errors32\t%d\n" ++ "rx_frame_too_long\t%d\n" ++ "rx_runt\t%d\n" ++ "rx_pause_on\t%d\n" ++ "rx_pause_off\t%d\n" 
++ "rx_pause_all\t%d\n" ++ "rx_unknown_opcode\t%d\n" ++ "rx_mac_error\t%d\n" ++ "tx_underrun32\t%d\n" ++ "rx_mac_missed\t%d\n" ++ "rx_tcam_dropped\t%d\n" ++ "tdu\t%d\n" ++ "rdu\t%d\n", ++ le64_to_cpu(counters->tx_packets), ++ le64_to_cpu(counters->rx_packets), ++ le64_to_cpu(counters->tx_errors), ++ le32_to_cpu(counters->rx_errors), ++ le16_to_cpu(counters->rx_missed), ++ le16_to_cpu(counters->align_errors), ++ le32_to_cpu(counters->tx_one_collision), ++ le32_to_cpu(counters->tx_multi_collision), ++ le64_to_cpu(counters->rx_unicast), ++ le64_to_cpu(counters->rx_broadcast), ++ le32_to_cpu(counters->rx_multicast), ++ le16_to_cpu(counters->tx_aborted), ++ le16_to_cpu(counters->tx_underrun), ++ ++ le64_to_cpu(counters->tx_octets), ++ le64_to_cpu(counters->rx_octets), ++ le64_to_cpu(counters->rx_multicast64), ++ le64_to_cpu(counters->tx_unicast64), ++ le64_to_cpu(counters->tx_broadcast64), ++ le64_to_cpu(counters->tx_multicast64), ++ le32_to_cpu(counters->tx_pause_on), ++ le32_to_cpu(counters->tx_pause_off), ++ le32_to_cpu(counters->tx_pause_all), ++ le32_to_cpu(counters->tx_deferred), ++ le32_to_cpu(counters->tx_late_collision), ++ le32_to_cpu(counters->tx_all_collision), ++ le32_to_cpu(counters->tx_aborted32), ++ le32_to_cpu(counters->align_errors32), ++ le32_to_cpu(counters->rx_frame_too_long), ++ le32_to_cpu(counters->rx_runt), ++ le32_to_cpu(counters->rx_pause_on), ++ le32_to_cpu(counters->rx_pause_off), ++ le32_to_cpu(counters->rx_pause_all), ++ le32_to_cpu(counters->rx_unknown_opcode), ++ le32_to_cpu(counters->rx_mac_error), ++ le32_to_cpu(counters->tx_underrun32), ++ le32_to_cpu(counters->rx_mac_missed), ++ le32_to_cpu(counters->rx_tcam_dropped), ++ le32_to_cpu(counters->tdu), ++ le32_to_cpu(counters->rdu)); ++ ++ len += snprintf(page + len, count - len, "\n"); ++out_unlock: ++ rtnl_unlock(); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_registers(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_MAC_REGS_SIZE; ++ u8 byte_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump MAC Registers\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ max = 0xB00; ++ for (n = 0xA00; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ max = 0xD40; ++ for (n = 0xD00; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ max = 0x2840; ++ for (n = 0x2800; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int 
proc_get_all_registers(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max; ++ u8 byte_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ struct pci_dev *pdev = tp->pci_dev; ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump All MAC Registers\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ max = pci_resource_len(pdev, 2); ++ max = min(max, 0x8000); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\nTotal length:0x%X", max); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_pcie_phy(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_EPHY_REGS_SIZE/2; ++ u16 word_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump PCIE PHY\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ ++ for (i = 0; i < 8 && n < max; i++, n++) { ++ word_rd = rtl8125_ephy_read(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_eth_phy(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_PHY_REGS_SIZE/2; ++ u16 word_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump Ethernet PHY\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ len += snprintf(page + len, count - len, ++ "\n####################page 0##################\n"); ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ ++ for (i = 0; i < 8 && n < max; i++, n++) { ++ word_rd = rtl8125_mdio_read(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ } ++ ++ len += snprintf(page + len, count - len, ++ "\n####################extra reg##################\n"); ++ n = 0xA400; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ for (i = 0; i < 8; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ ++ n = 0xA410; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ for (i = 0; i < 3; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ ++ n = 0xA434; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ ++ n = 0xA5D0; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ for (i = 0; i < 4; i++, n+=2) { ++ word_rd = 
rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ ++ n = 0xA61A; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ ++ n = 0xA6D0; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ for (i = 0; i < 3; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_extended_registers(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_ERI_REGS_SIZE; ++ u32 dword_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump Extended Registers\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ ++ for (i = 0; i < 4 && n < max; i++, n+=4) { ++ dword_rd = rtl8125_eri_read(tp, n, 4, ERIAR_ExGMAC); ++ len += snprintf(page + len, count - len, ++ "%08x ", ++ dword_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_pci_registers(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_PCI_REGS_SIZE; ++ u32 dword_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump PCI Registers\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%03x:\t", ++ n); ++ ++ for (i = 0; i < 4 && n < max; i++, n+=4) { ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ len += snprintf(page + len, count - len, ++ "%08x ", ++ dword_rd); ++ } ++ } ++ ++ n = 0x110; ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ len += snprintf(page + len, count - len, ++ "\n0x%03x:\t%08x ", ++ n, ++ dword_rd); ++ n = 0x70c; ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ len += snprintf(page + len, count - len, ++ "\n0x%03x:\t%08x ", ++ n, ++ dword_rd); ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_temperature(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int cel, fah; ++ int len = 0; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ len += snprintf(page + len, count - len, ++ "\nChip Temperature\n"); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ rtnl_lock(); ++ ++ if (!rtl8125_sysfs_testmode_on(tp)) { ++ len += snprintf(page + len, count - len, ++ "\nPlease turn on ""/sys/class/net//rtk_adv/testmode"".\n\n"); ++ goto out_unlock; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ cel = rtl8125_read_thermal_sensor(tp); ++ 
r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ fah = rtl8125_cel_to_fah(cel); ++ ++ len += snprintf(page + len, count - len, ++ "Cel:%d\n", ++ cel); ++ len += snprintf(page + len, count - len, ++ "Fah:%d\n", ++ fah); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++out_unlock: ++ rtnl_unlock(); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int _proc_get_cable_info(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data, ++ bool poe_mode) ++{ ++ int i; ++ u32 status; ++ int len = 0; ++ struct net_device *dev = data; ++ int cp_status[RTL8125_CP_NUM] = {0}; ++ int cp_len[RTL8125_CP_NUM] = {0}; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ const char *pair_str[RTL8125_CP_NUM] = {"1-2", "3-6", "4-5", "7-8"}; ++ unsigned long flags; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2 ... CFG_METHOD_7: ++ /* support */ ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ rtnl_lock(); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (!rtl8125_sysfs_testmode_on(tp)) { ++ len += snprintf(page + len, count - len, ++ "\nPlease turn on ""/sys/class/net//rtk_adv/testmode"".\n\n"); ++ goto out_unlock; ++ } ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus) ++ len += snprintf(page + len, count - len, ++ "\nlink speed:%d", ++ rtl8125_convert_link_speed(status)); ++ else ++ len += snprintf(page + len, count - len, ++ "\nlink status:off"); ++ ++ rtl8125_get_cp_len(tp, cp_len); ++ ++ rtl8125_get_cp_status(tp, cp_status, poe_mode); ++ ++ len += snprintf(page + len, count - len, ++ "\npair\tlength\tstatus \tpp\n"); ++ ++ for (i=0; iphy_lock, flags); ++ ++ rtnl_unlock(); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_cable_info(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ return _proc_get_cable_info(page, start, offset, count, eof, data, 0); ++} ++ ++static int proc_get_poe_cable_info(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ return _proc_get_cable_info(page, start, offset, count, eof, data, 1); ++} ++ ++static void _proc_dump_desc(char *page, int *page_len, int *count, void *desc_base, ++ u32 alloc_size) ++{ ++ u32 *pdword; ++ int i, len; ++ ++ if (desc_base == NULL || ++ alloc_size == 0) ++ return; ++ ++ len = *page_len; ++ pdword = (u32*)desc_base; ++ for (i=0; i<(alloc_size/4); i++) { ++ if (!(i % 4)) ++ len += snprintf(page + len, *count - len, ++ "\n%04x ", ++ i); ++ len += snprintf(page + len, *count - len, ++ "%08x ", ++ pdword[i]); ++ } ++ ++ len += snprintf(page + len, *count - len, "\n"); ++ ++ *page_len = len; ++ return; ++} ++ ++static int proc_dump_rx_desc(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ int i; ++ int len = 0; ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ ++ if (!ring) ++ continue; ++ ++ len += snprintf(page + len, count - len, ++ "\ndump rx %d desc:%d", ++ i, ring->num_rx_desc); ++ ++ _proc_dump_desc(page, &len, &count, ++ ring->RxDescArray, ++ ring->RxDescAllocSize); ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ if (rtl8125_num_lib_rx_rings(tp) > 0) { ++ for (i = 0; i < tp->HwSuppNumRxQueues; i++) { ++ struct rtl8125_ring *lib_ring = &tp->lib_rx_ring[i]; ++ if (lib_ring->enabled) { ++ len += snprintf(page + len, count - len, ++ "\ndump lib rx %d desc:%d", ++ i, ++ ring->ring_size); ++ _proc_dump_desc(page, &len, &count, ++ 
(void*)lib_ring->desc_addr, ++ lib_ring->desc_size); ++ } ++ } ++ } ++#endif //ENABLE_LIB_SUPPORT ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ ++ return len; ++} ++ ++static int proc_dump_tx_desc(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ int len = 0; ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ ++ if (!ring) ++ continue; ++ ++ len += snprintf(page + len, count - len, ++ "\ndump tx desc:%d", ++ ring->num_tx_desc); ++ ++ _proc_dump_desc(page, &len, &count, ++ ring->TxDescArray, ++ ring->TxDescAllocSize); ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ if (rtl8125_num_lib_tx_rings(tp) > 0) { ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ struct rtl8125_ring *lib_ring = &tp->lib_tx_ring[i]; ++ if (lib_ring->enabled) { ++ len += snprintf(page + len, count - len, ++ "\ndump lib tx %d desc:%d", ++ i, ++ ring->ring_size); ++ _proc_dump_desc(page, &len, &count, ++ (void*)lib_ring->desc_addr, ++ lib_ring->desc_size); ++ } ++ } ++ } ++#endif //ENABLE_LIB_SUPPORT ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ ++ return len; ++} ++ ++static int proc_dump_msix_tbl(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ int i, j; ++ int len = 0; ++ void __iomem *ioaddr; ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ /* ioremap MMIO region */ ++ ioaddr = ioremap(pci_resource_start(tp->pci_dev, 4), pci_resource_len(tp->pci_dev, 4)); ++ if (!ioaddr) ++ return -EFAULT; ++ ++ rtnl_lock(); ++ ++ len += snprintf(page + len, count - len, ++ "\ndump MSI-X Table. Total Entry %d. \n", ++ tp->hw_supp_irq_nvecs); ++ ++ for (i=0; i<tp->hw_supp_irq_nvecs; i++) { ++ len += snprintf(page + len, count - len, ++ "\n%04x ", i); ++ for (j=0; j<4; j++) ++ len += snprintf(page + len, count - len, "%08x ", ++ readl(ioaddr + i*0x10 + 4*j)); ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return 0; ++} ++ ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ ++static void rtl8125_proc_module_init(void) ++{ ++ //create /proc/net/r8125 ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) ++ rtl8125_proc = proc_mkdir(MODULENAME, init_net.proc_net); ++#else ++ rtl8125_proc = proc_mkdir(MODULENAME, proc_net); ++#endif ++ if (!rtl8125_proc) ++ dprintk("cannot create %s proc entry \n", MODULENAME); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++/* ++ * seq_file wrappers for procfile show routines. 
++ */ ++static int rtl8125_proc_open(struct inode *inode, struct file *file) ++{ ++ struct net_device *dev = proc_get_parent_data(inode); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++ int (*show)(struct seq_file *, void *) = pde_data(inode); ++#else ++ int (*show)(struct seq_file *, void *) = PDE_DATA(inode); ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++ ++ return single_open(file, show, dev); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) ++static const struct proc_ops rtl8125_proc_fops = { ++ .proc_open = rtl8125_proc_open, ++ .proc_read = seq_read, ++ .proc_lseek = seq_lseek, ++ .proc_release = single_release, ++}; ++#else ++static const struct file_operations rtl8125_proc_fops = { ++ .open = rtl8125_proc_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++#endif ++ ++#endif ++ ++/* ++ * Table of proc files we need to create. ++ */ ++struct rtl8125_proc_file { ++ char name[16]; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ int (*show)(struct seq_file *, void *); ++#else ++ int (*show)(char *, char **, off_t, int, int *, void *); ++#endif ++}; ++ ++static const struct rtl8125_proc_file rtl8125_debug_proc_files[] = { ++ { "driver_var", &proc_get_driver_variable }, ++ { "tally", &proc_get_tally_counter }, ++ { "registers", &proc_get_registers }, ++ { "registers2", &proc_get_all_registers }, ++ { "pcie_phy", &proc_get_pcie_phy }, ++ { "eth_phy", &proc_get_eth_phy }, ++ { "ext_regs", &proc_get_extended_registers }, ++ { "pci_regs", &proc_get_pci_registers }, ++ { "tx_desc", &proc_dump_tx_desc }, ++ { "rx_desc", &proc_dump_rx_desc }, ++ { "msix_tbl", &proc_dump_msix_tbl }, ++ { "", NULL } ++}; ++ ++static const struct rtl8125_proc_file rtl8125_test_proc_files[] = { ++ { "temp", &proc_get_temperature }, ++ { "cdt", &proc_get_cable_info }, ++ { "cdt_poe", &proc_get_poe_cable_info }, ++ { "", NULL } ++}; ++ ++#define R8125_PROC_DEBUG_DIR "debug" ++#define R8125_PROC_TEST_DIR "test" ++ ++static void rtl8125_proc_init(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ const struct rtl8125_proc_file *f; ++ struct proc_dir_entry *dir; ++ ++ if (!rtl8125_proc) ++ return; ++ ++ if (tp->proc_dir_debug || tp->proc_dir_test) ++ return; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ dir = proc_mkdir_data(dev->name, 0, rtl8125_proc, dev); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s\n", ++ MODULENAME, dev->name); ++ return; ++ } ++ tp->proc_dir = dir; ++ proc_init_num++; ++ ++ /* create debug entry */ ++ dir = proc_mkdir_data(R8125_PROC_DEBUG_DIR, 0, tp->proc_dir, dev); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_DEBUG_DIR); ++ return; ++ } ++ ++ tp->proc_dir_debug = dir; ++ for (f = rtl8125_debug_proc_files; f->name[0]; f++) { ++ if (!proc_create_data(f->name, S_IFREG | S_IRUGO, dir, ++ &rtl8125_proc_fops, f->show)) { ++ printk("Unable to initialize " ++ "/proc/net/%s/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_DEBUG_DIR, ++ f->name); ++ return; ++ } ++ } ++ ++ /* create test entry */ ++ dir = proc_mkdir_data(R8125_PROC_TEST_DIR, 0, tp->proc_dir, dev); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_TEST_DIR); ++ return; ++ } ++ ++ tp->proc_dir_test = dir; ++ for (f = rtl8125_test_proc_files; f->name[0]; f++) { ++ if (!proc_create_data(f->name, S_IFREG | S_IRUGO, dir, ++ &rtl8125_proc_fops, f->show)) { ++ printk("Unable to initialize " ++ 
"/proc/net/%s/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_TEST_DIR, ++ f->name); ++ return; ++ } ++ } ++#else ++ dir = proc_mkdir(dev->name, rtl8125_proc); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s\n", ++ MODULENAME, dev->name); ++ return; ++ } ++ ++ tp->proc_dir = dir; ++ proc_init_num++; ++ ++ /* create debug entry */ ++ dir = proc_mkdir(R8125_PROC_DEBUG_DIR, tp->proc_dir); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_DEBUG_DIR); ++ return; ++ } ++ ++ tp->proc_dir_debug = dir; ++ for (f = rtl8125_debug_proc_files; f->name[0]; f++) { ++ if (!create_proc_read_entry(f->name, S_IFREG | S_IRUGO, ++ dir, f->show, dev)) { ++ printk("Unable to initialize " ++ "/proc/net/%s/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_DEBUG_DIR, ++ f->name); ++ return; ++ } ++ } ++ ++ /* create test entry */ ++ dir = proc_mkdir(R8125_PROC_TEST_DIR, tp->proc_dir); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_TEST_DIR); ++ return; ++ } ++ ++ tp->proc_dir_test = dir; ++ for (f = rtl8125_test_proc_files; f->name[0]; f++) { ++ if (!create_proc_read_entry(f->name, S_IFREG | S_IRUGO, ++ dir, f->show, dev)) { ++ printk("Unable to initialize " ++ "/proc/net/%s/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_TEST_DIR, ++ f->name); ++ return; ++ } ++ } ++#endif ++} ++ ++static void rtl8125_proc_remove(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->proc_dir) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ remove_proc_subtree(dev->name, rtl8125_proc); ++#else ++ const struct rtl8125_proc_file *f; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->proc_dir_debug) { ++ for (f = rtl8125_debug_proc_files; f->name[0]; f++) ++ remove_proc_entry(f->name, tp->proc_dir_debug); ++ remove_proc_entry(R8125_PROC_DEBUG_DIR, tp->proc_dir); ++ } ++ ++ if (tp->proc_dir_test) { ++ for (f = rtl8125_test_proc_files; f->name[0]; f++) ++ remove_proc_entry(f->name, tp->proc_dir_test); ++ remove_proc_entry(R8125_PROC_TEST_DIR, tp->proc_dir); ++ } ++ ++ remove_proc_entry(dev->name, rtl8125_proc); ++#endif ++ proc_init_num--; ++ ++ tp->proc_dir_debug = NULL; ++ tp->proc_dir_test = NULL; ++ tp->proc_dir = NULL; ++ } ++} ++ ++#endif //ENABLE_R8125_PROCFS ++ ++#ifdef ENABLE_R8125_SYSFS ++/**************************************************************************** ++* -----------------------------SYSFS STUFF------------------------- ++***************************************************************************** ++*/ ++static ssize_t testmode_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct net_device *netdev = to_net_dev(dev); ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ ++ sprintf(buf, "%u\n", tp->testmode); ++ ++ return strlen(buf); ++} ++ ++static ssize_t testmode_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct net_device *netdev = to_net_dev(dev); ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ u32 testmode; ++ ++ if (sscanf(buf, "%u\n", &testmode) != 1) ++ return -EINVAL; ++ ++ if (tp->testmode != testmode) { ++ rtnl_lock(); ++ tp->testmode = testmode; ++ rtnl_unlock(); ++ } ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(testmode); ++ ++static struct attribute *rtk_adv_attrs[] = { ++ &dev_attr_testmode.attr, ++ NULL ++}; ++ ++static struct attribute_group rtk_adv_grp = { ++ .name = "rtl_adv", ++ .attrs = 
rtk_adv_attrs, ++}; ++ ++static void rtl8125_sysfs_init(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret; ++ ++ /* init rtl_adv */ ++#ifdef ENABLE_LIB_SUPPORT ++ tp->testmode = 0; ++#else ++ tp->testmode = 1; ++#endif //ENABLE_LIB_SUPPORT ++ ++ ret = sysfs_create_group(&dev->dev.kobj, &rtk_adv_grp); ++ if (ret < 0) ++ netif_warn(tp, probe, dev, "create rtk_adv_grp fail\n"); ++ else ++ set_bit(R8125_SYSFS_RTL_ADV, tp->sysfs_flag); ++} ++ ++static void rtl8125_sysfs_remove(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (test_and_clear_bit(R8125_SYSFS_RTL_ADV, tp->sysfs_flag)) ++ sysfs_remove_group(&dev->dev.kobj, &rtk_adv_grp); ++} ++#endif //ENABLE_R8125_SYSFS ++ ++static inline u16 map_phy_ocp_addr(u16 PageNum, u8 RegNum) ++{ ++ u16 OcpPageNum = 0; ++ u8 OcpRegNum = 0; ++ u16 OcpPhyAddress = 0; ++ ++ if (PageNum == 0) { ++ OcpPageNum = OCP_STD_PHY_BASE_PAGE + (RegNum / 8); ++ OcpRegNum = 0x10 + (RegNum % 8); ++ } else { ++ OcpPageNum = PageNum; ++ OcpRegNum = RegNum; ++ } ++ ++ OcpPageNum <<= 4; ++ ++ if (OcpRegNum < 16) { ++ OcpPhyAddress = 0; ++ } else { ++ OcpRegNum -= 16; ++ OcpRegNum <<= 1; ++ ++ OcpPhyAddress = OcpPageNum + OcpRegNum; ++ } ++ ++ ++ return OcpPhyAddress; ++} ++ ++static void mdio_real_direct_write_phy_ocp(struct rtl8125_private *tp, ++ u16 RegAddr, ++ u16 value) ++{ ++ u32 data32; ++ int i; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(RegAddr % 2); ++#endif ++ data32 = RegAddr/2; ++ data32 <<= OCPR_Addr_Reg_shift; ++ data32 |= OCPR_Write | value; ++ ++ RTL_W32(tp, PHYOCP, data32); ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(RTL_R32(tp, PHYOCP) & OCPR_Flag)) ++ break; ++ } ++} ++ ++void rtl8125_mdio_direct_write_phy_ocp(struct rtl8125_private *tp, ++ u16 RegAddr, ++ u16 value) ++{ ++ if (tp->rtk_enable_diag) ++ return; ++ ++ mdio_real_direct_write_phy_ocp(tp, RegAddr, value); ++} ++ ++/* ++void rtl8125_mdio_write_phy_ocp(struct rtl8125_private *tp, ++ u16 PageNum, ++ u32 RegAddr, ++ u32 value) ++{ ++ u16 ocp_addr; ++ ++ ocp_addr = map_phy_ocp_addr(PageNum, RegAddr); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, ocp_addr, value); ++} ++*/ ++ ++static void rtl8125_mdio_real_write_phy_ocp(struct rtl8125_private *tp, ++ u16 PageNum, ++ u32 RegAddr, ++ u32 value) ++{ ++ u16 ocp_addr; ++ ++ ocp_addr = map_phy_ocp_addr(PageNum, RegAddr); ++ ++ mdio_real_direct_write_phy_ocp(tp, ocp_addr, value); ++} ++ ++static void mdio_real_write(struct rtl8125_private *tp, ++ u16 RegAddr, ++ u16 value) ++{ ++ if (RegAddr == 0x1F) { ++ tp->cur_page = value; ++ return; ++ } ++ rtl8125_mdio_real_write_phy_ocp(tp, tp->cur_page, RegAddr, value); ++} ++ ++void rtl8125_mdio_write(struct rtl8125_private *tp, ++ u16 RegAddr, ++ u16 value) ++{ ++ if (tp->rtk_enable_diag) ++ return; ++ ++ mdio_real_write(tp, RegAddr, value); ++} ++ ++void rtl8125_mdio_prot_write(struct rtl8125_private *tp, ++ u32 RegAddr, ++ u32 value) ++{ ++ mdio_real_write(tp, RegAddr, value); ++} ++ ++void rtl8125_mdio_prot_direct_write_phy_ocp(struct rtl8125_private *tp, ++ u32 RegAddr, ++ u32 value) ++{ ++ mdio_real_direct_write_phy_ocp(tp, RegAddr, value); ++} ++ ++static u32 mdio_real_direct_read_phy_ocp(struct rtl8125_private *tp, ++ u16 RegAddr) ++{ ++ u32 data32; ++ int i, value = 0; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(RegAddr % 2); ++#endif ++ data32 = RegAddr/2; ++ data32 <<= OCPR_Addr_Reg_shift; ++ ++ RTL_W32(tp, PHYOCP, data32); 
++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (RTL_R32(tp, PHYOCP) & OCPR_Flag) ++ break; ++ } ++ value = RTL_R32(tp, PHYOCP) & OCPDR_Data_Mask; ++ ++ return value; ++} ++ ++u32 rtl8125_mdio_direct_read_phy_ocp(struct rtl8125_private *tp, ++ u16 RegAddr) ++{ ++ if (tp->rtk_enable_diag) ++ return 0xffffffff; ++ ++ return mdio_real_direct_read_phy_ocp(tp, RegAddr); ++} ++ ++/* ++static u32 rtl8125_mdio_read_phy_ocp(struct rtl8125_private *tp, ++ u16 PageNum, ++ u32 RegAddr) ++{ ++ u16 ocp_addr; ++ ++ ocp_addr = map_phy_ocp_addr(PageNum, RegAddr); ++ ++ return rtl8125_mdio_direct_read_phy_ocp(tp, ocp_addr); ++} ++*/ ++ ++static u32 rtl8125_mdio_real_read_phy_ocp(struct rtl8125_private *tp, ++ u16 PageNum, ++ u32 RegAddr) ++{ ++ u16 ocp_addr; ++ ++ ocp_addr = map_phy_ocp_addr(PageNum, RegAddr); ++ ++ return mdio_real_direct_read_phy_ocp(tp, ocp_addr); ++} ++ ++static u32 mdio_real_read(struct rtl8125_private *tp, ++ u16 RegAddr) ++{ ++ return rtl8125_mdio_real_read_phy_ocp(tp, tp->cur_page, RegAddr); ++} ++ ++u32 rtl8125_mdio_read(struct rtl8125_private *tp, ++ u16 RegAddr) ++{ ++ if (tp->rtk_enable_diag) ++ return 0xffffffff; ++ ++ return mdio_real_read(tp, RegAddr); ++} ++ ++u32 rtl8125_mdio_prot_read(struct rtl8125_private *tp, ++ u32 RegAddr) ++{ ++ return mdio_real_read(tp, RegAddr); ++} ++ ++u32 rtl8125_mdio_prot_direct_read_phy_ocp(struct rtl8125_private *tp, ++ u32 RegAddr) ++{ ++ return mdio_real_direct_read_phy_ocp(tp, RegAddr); ++} ++ ++static void rtl8125_clear_and_set_eth_phy_bit(struct rtl8125_private *tp, u8 addr, u16 clearmask, u16 setmask) ++{ ++ u16 PhyRegValue; ++ ++ PhyRegValue = rtl8125_mdio_read(tp, addr); ++ PhyRegValue &= ~clearmask; ++ PhyRegValue |= setmask; ++ rtl8125_mdio_write(tp, addr, PhyRegValue); ++} ++ ++void rtl8125_clear_eth_phy_bit(struct rtl8125_private *tp, u8 addr, u16 mask) ++{ ++ rtl8125_clear_and_set_eth_phy_bit(tp, ++ addr, ++ mask, ++ 0); ++} ++ ++void rtl8125_set_eth_phy_bit(struct rtl8125_private *tp, u8 addr, u16 mask) ++{ ++ rtl8125_clear_and_set_eth_phy_bit(tp, ++ addr, ++ 0, ++ mask); ++} ++ ++void rtl8125_clear_and_set_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 clearmask, u16 setmask) ++{ ++ u16 PhyRegValue; ++ ++ PhyRegValue = rtl8125_mdio_direct_read_phy_ocp(tp, addr); ++ PhyRegValue &= ~clearmask; ++ PhyRegValue |= setmask; ++ rtl8125_mdio_direct_write_phy_ocp(tp, addr, PhyRegValue); ++} ++ ++void rtl8125_clear_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask) ++{ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ addr, ++ mask, ++ 0); ++} ++ ++void rtl8125_set_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask) ++{ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ addr, ++ 0, ++ mask); ++} ++ ++void rtl8125_mac_ocp_write(struct rtl8125_private *tp, u16 reg_addr, u16 value) ++{ ++ u32 data32; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(reg_addr % 2); ++#endif ++ ++ data32 = reg_addr/2; ++ data32 <<= OCPR_Addr_Reg_shift; ++ data32 += value; ++ data32 |= OCPR_Write; ++ ++ RTL_W32(tp, MACOCP, data32); ++} ++ ++u16 rtl8125_mac_ocp_read(struct rtl8125_private *tp, u16 reg_addr) ++{ ++ u32 data32; ++ u16 data16 = 0; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(reg_addr % 2); ++#endif ++ ++ data32 = reg_addr/2; ++ data32 <<= OCPR_Addr_Reg_shift; ++ ++ RTL_W32(tp, MACOCP, data32); ++ data16 = (u16)RTL_R32(tp, MACOCP); ++ ++ return data16; ++} ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++static void 
mac_mcu_write(struct rtl8125_private *tp, u16 reg, u16 value) ++{ ++ if (reg == 0x1f) { ++ tp->ocp_base = value << 4; ++ return; ++ } ++ ++ rtl8125_mac_ocp_write(tp, tp->ocp_base + reg, value); ++} ++ ++static u32 mac_mcu_read(struct rtl8125_private *tp, u16 reg) ++{ ++ return rtl8125_mac_ocp_read(tp, tp->ocp_base + reg); ++} ++#endif ++ ++static void ++rtl8125_clear_set_mac_ocp_bit( ++ struct rtl8125_private *tp, ++ u16 addr, ++ u16 clearmask, ++ u16 setmask ++) ++{ ++ u16 PhyRegValue; ++ ++ PhyRegValue = rtl8125_mac_ocp_read(tp, addr); ++ PhyRegValue &= ~clearmask; ++ PhyRegValue |= setmask; ++ rtl8125_mac_ocp_write(tp, addr, PhyRegValue); ++} ++ ++void ++rtl8125_clear_mac_ocp_bit( ++ struct rtl8125_private *tp, ++ u16 addr, ++ u16 mask ++) ++{ ++ rtl8125_clear_set_mac_ocp_bit(tp, ++ addr, ++ mask, ++ 0); ++} ++ ++void ++rtl8125_set_mac_ocp_bit( ++ struct rtl8125_private *tp, ++ u16 addr, ++ u16 mask ++) ++{ ++ rtl8125_clear_set_mac_ocp_bit(tp, ++ addr, ++ 0, ++ mask); ++} ++ ++u32 rtl8125_ocp_read_with_oob_base_address(struct rtl8125_private *tp, u16 addr, u8 len, const u32 base_address) ++{ ++ return rtl8125_eri_read_with_oob_base_address(tp, addr, len, ERIAR_OOB, base_address); ++} ++ ++u32 rtl8125_ocp_read(struct rtl8125_private *tp, u16 addr, u8 len) ++{ ++ if (!tp->AllowAccessDashOcp || tp->HwSuppOcpChannelVer != 2) ++ return 0xffffffff; ++ ++ return rtl8125_ocp_read_with_oob_base_address(tp, addr, len, ++ NO_BASE_ADDRESS); ++} ++ ++u32 rtl8125_ocp_write_with_oob_base_address(struct rtl8125_private *tp, u16 addr, u8 len, u32 value, const u32 base_address) ++{ ++ return rtl8125_eri_write_with_oob_base_address(tp, addr, len, value, ++ ERIAR_OOB, base_address); ++} ++ ++void rtl8125_ocp_write(struct rtl8125_private *tp, u16 addr, u8 len, u32 value) ++{ ++ if (!tp->AllowAccessDashOcp || tp->HwSuppOcpChannelVer != 2) ++ return; ++ ++ rtl8125_ocp_write_with_oob_base_address(tp, addr, len, value, NO_BASE_ADDRESS); ++} ++ ++void rtl8125_oob_mutex_lock(struct rtl8125_private *tp) ++{ ++ u8 reg_16, reg_a0; ++ u32 wait_cnt_0, wait_Cnt_1; ++ u16 ocp_reg_mutex_ib; ++ u16 ocp_reg_mutex_oob; ++ u16 ocp_reg_mutex_prio; ++ ++ if (!HW_DASH_SUPPORT_DASH(tp)) ++ return; ++ ++ if (!tp->DASH) ++ return; ++ ++ ocp_reg_mutex_oob = 0x110; ++ ocp_reg_mutex_ib = 0x114; ++ ocp_reg_mutex_prio = 0x11C; ++ ++ rtl8125_ocp_write(tp, ocp_reg_mutex_ib, 1, BIT_0); ++ reg_16 = rtl8125_ocp_read(tp, ocp_reg_mutex_oob, 1); ++ wait_cnt_0 = 0; ++ while(reg_16) { ++ reg_a0 = rtl8125_ocp_read(tp, ocp_reg_mutex_prio, 1); ++ if (reg_a0) { ++ rtl8125_ocp_write(tp, ocp_reg_mutex_ib, 1, 0x00); ++ reg_a0 = rtl8125_ocp_read(tp, ocp_reg_mutex_prio, 1); ++ wait_Cnt_1 = 0; ++ while(reg_a0) { ++ reg_a0 = rtl8125_ocp_read(tp, ocp_reg_mutex_prio, 1); ++ ++ wait_Cnt_1++; ++ ++ if (wait_Cnt_1 > 2000) ++ break; ++ }; ++ rtl8125_ocp_write(tp, ocp_reg_mutex_ib, 1, BIT_0); ++ ++ } ++ reg_16 = rtl8125_ocp_read(tp, ocp_reg_mutex_oob, 1); ++ ++ wait_cnt_0++; ++ ++ if (wait_cnt_0 > 2000) ++ break; ++ }; ++} ++ ++void rtl8125_oob_mutex_unlock(struct rtl8125_private *tp) ++{ ++ u16 ocp_reg_mutex_ib; ++ u16 ocp_reg_mutex_prio; ++ ++ if (!HW_DASH_SUPPORT_DASH(tp)) ++ return; ++ ++ if (!tp->DASH) ++ return; ++ ++ ocp_reg_mutex_ib = 0x114; ++ ocp_reg_mutex_prio = 0x11C; ++ ++ rtl8125_ocp_write(tp, ocp_reg_mutex_prio, 1, BIT_0); ++ rtl8125_ocp_write(tp, ocp_reg_mutex_ib, 1, 0x00); ++} ++ ++static bool rtl8125_is_allow_access_dash_ocp(struct rtl8125_private *tp) ++{ ++ bool allow_access = false; ++ u16 mac_ocp_data; ++ ++ if (!HW_DASH_SUPPORT_DASH(tp)) ++ 
goto exit; ++ ++ allow_access = true; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xd460); ++ if (mac_ocp_data == 0xffff || !(mac_ocp_data & BIT_0)) ++ allow_access = false; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xd4c0); ++ if (mac_ocp_data == 0xffff || (mac_ocp_data & BIT_3)) ++ allow_access = false; ++ break; ++ default: ++ goto exit; ++ } ++exit: ++ return allow_access; ++} ++ ++static u32 rtl8125_get_dash_fw_ver(struct rtl8125_private *tp) ++{ ++ u32 ver = 0xffffffff; ++ ++ if (FALSE == HW_DASH_SUPPORT_GET_FIRMWARE_VERSION(tp)) ++ goto exit; ++ ++ ver = rtl8125_ocp_read(tp, OCP_REG_FIRMWARE_MAJOR_VERSION, 4); ++ ++exit: ++ return ver; ++} ++ ++static int _rtl8125_check_dash(struct rtl8125_private *tp) ++{ ++ if (!tp->AllowAccessDashOcp) ++ return 0; ++ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return 0; ++ ++ if (rtl8125_ocp_read(tp, 0x128, 1) & BIT_0) ++ return 1; ++ ++ return 0; ++} ++ ++static int rtl8125_check_dash(struct rtl8125_private *tp) ++{ ++ if (HW_DASH_SUPPORT_DASH(tp) && _rtl8125_check_dash(tp)) { ++ u32 ver = rtl8125_get_dash_fw_ver(tp); ++ if (!(ver == 0 || ver == 0xffffffff)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static int rtl8125_wait_dash_fw_ready(struct rtl8125_private *tp) ++{ ++ int rc = -1; ++ int timeout; ++ ++ if (!tp->DASH) ++ goto out; ++ ++ for (timeout = 0; timeout < 10; timeout++) { ++ fsleep(10000); ++ if (rtl8125_ocp_read(tp, 0x124, 1) & BIT_0) { ++ rc = 1; ++ goto out; ++ } ++ } ++ ++ rc = 0; ++ ++out: ++ return rc; ++} ++ ++static void ++rtl8125_notify_dash_oob_cmac(struct rtl8125_private *tp, u32 cmd) ++{ ++ u32 val; ++ ++ if (!HW_DASH_SUPPORT_CMAC(tp)) ++ return; ++ ++ rtl8125_ocp_write(tp, 0x180, 4, cmd); ++ val = rtl8125_ocp_read(tp, 0x30, 4); ++ val |= BIT_0; ++ rtl8125_ocp_write(tp, 0x30, 4, val); ++} ++ ++static void ++rtl8125_notify_dash_oob_ipc2(struct rtl8125_private *tp, u32 cmd) ++{ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ rtl8125_ocp_write(tp, IB2SOC_DATA, 4, cmd); ++ rtl8125_ocp_write(tp, IB2SOC_CMD, 4, 0x00); ++ rtl8125_ocp_write(tp, IB2SOC_SET, 4, 0x01); ++} ++ ++static void ++rtl8125_notify_dash_oob(struct rtl8125_private *tp, u32 cmd) ++{ ++ if (HW_DASH_SUPPORT_CMAC(tp)) ++ return rtl8125_notify_dash_oob_cmac(tp, cmd); ++ else if (HW_DASH_SUPPORT_IPC2(tp)) ++ return rtl8125_notify_dash_oob_ipc2(tp, cmd); ++ else ++ return; ++} ++ ++static void rtl8125_driver_start(struct rtl8125_private *tp) ++{ ++ if (!tp->AllowAccessDashOcp) ++ return; ++ ++ rtl8125_notify_dash_oob(tp, OOB_CMD_DRIVER_START); ++ ++ rtl8125_wait_dash_fw_ready(tp); ++} ++ ++static void rtl8125_driver_stop(struct rtl8125_private *tp) ++{ ++ if (!tp->AllowAccessDashOcp) ++ return; ++ ++ rtl8125_notify_dash_oob(tp, OOB_CMD_DRIVER_STOP); ++ ++ rtl8125_wait_dash_fw_ready(tp); ++} ++ ++void rtl8125_ephy_write(struct rtl8125_private *tp, int RegAddr, int value) ++{ ++ int i; ++ ++ RTL_W32(tp, EPHYAR, ++ EPHYAR_Write | ++ (RegAddr & EPHYAR_Reg_Mask_v2) << EPHYAR_Reg_shift | ++ (value & EPHYAR_Data_Mask)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed EPHY write */ ++ if (!(RTL_R32(tp, EPHYAR) & EPHYAR_Flag)) ++ break; ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++} ++ ++u16 rtl8125_ephy_read(struct rtl8125_private *tp, int RegAddr) ++{ ++ int i; ++ u16 value = 0xffff; ++ ++ RTL_W32(tp, EPHYAR, ++ EPHYAR_Read | (RegAddr & EPHYAR_Reg_Mask_v2) << 
EPHYAR_Reg_shift); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed EPHY read */ ++ if (RTL_R32(tp, EPHYAR) & EPHYAR_Flag) { ++ value = (u16) (RTL_R32(tp, EPHYAR) & EPHYAR_Data_Mask); ++ break; ++ } ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++ return value; ++} ++ ++static void ClearAndSetPCIePhyBit(struct rtl8125_private *tp, u8 addr, u16 clearmask, u16 setmask) ++{ ++ u16 EphyValue; ++ ++ EphyValue = rtl8125_ephy_read(tp, addr); ++ EphyValue &= ~clearmask; ++ EphyValue |= setmask; ++ rtl8125_ephy_write(tp, addr, EphyValue); ++} ++ ++static void ClearPCIePhyBit(struct rtl8125_private *tp, u8 addr, u16 mask) ++{ ++ ClearAndSetPCIePhyBit(tp, ++ addr, ++ mask, ++ 0); ++} ++ ++static void SetPCIePhyBit(struct rtl8125_private *tp, u8 addr, u16 mask) ++{ ++ ClearAndSetPCIePhyBit(tp, ++ addr, ++ 0, ++ mask); ++} ++ ++static u32 ++rtl8125_csi_other_fun_read(struct rtl8125_private *tp, ++ u8 multi_fun_sel_bit, ++ u32 addr) ++{ ++ u32 cmd; ++ int i; ++ u32 value = 0xffffffff; ++ ++ cmd = CSIAR_Read | CSIAR_ByteEn << CSIAR_ByteEn_shift | (addr & CSIAR_Addr_Mask); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ multi_fun_sel_bit = 0; ++ ++ if (multi_fun_sel_bit > 7) ++ goto exit; ++ ++ cmd |= multi_fun_sel_bit << 16; ++ ++ RTL_W32(tp, CSIAR, cmd); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed CSI read */ ++ if (RTL_R32(tp, CSIAR) & CSIAR_Flag) { ++ value = (u32)RTL_R32(tp, CSIDR); ++ break; ++ } ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++exit: ++ return value; ++} ++ ++static void ++rtl8125_csi_other_fun_write(struct rtl8125_private *tp, ++ u8 multi_fun_sel_bit, ++ u32 addr, ++ u32 value) ++{ ++ u32 cmd; ++ int i; ++ ++ RTL_W32(tp, CSIDR, value); ++ cmd = CSIAR_Write | CSIAR_ByteEn << CSIAR_ByteEn_shift | (addr & CSIAR_Addr_Mask); ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ multi_fun_sel_bit = 0; ++ ++ if (multi_fun_sel_bit > 7) ++ return; ++ ++ cmd |= multi_fun_sel_bit << 16; ++ ++ RTL_W32(tp, CSIAR, cmd); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed CSI write */ ++ if (!(RTL_R32(tp, CSIAR) & CSIAR_Flag)) ++ break; ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++} ++ ++static u32 ++rtl8125_csi_read(struct rtl8125_private *tp, ++ u32 addr) ++{ ++ u8 multi_fun_sel_bit; ++ ++ multi_fun_sel_bit = 0; ++ ++ return rtl8125_csi_other_fun_read(tp, multi_fun_sel_bit, addr); ++} ++ ++static void ++rtl8125_csi_write(struct rtl8125_private *tp, ++ u32 addr, ++ u32 value) ++{ ++ u8 multi_fun_sel_bit; ++ ++ multi_fun_sel_bit = 0; ++ ++ rtl8125_csi_other_fun_write(tp, multi_fun_sel_bit, addr, value); ++} ++ ++static u8 ++rtl8125_csi_fun0_read_byte(struct rtl8125_private *tp, ++ u32 addr) ++{ ++ u8 RetVal = 0; ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) { ++ struct pci_dev *pdev = tp->pci_dev; ++ ++ pci_read_config_byte(pdev, addr, &RetVal); ++ } else { ++ u32 TmpUlong; ++ u16 RegAlignAddr; ++ u8 ShiftByte; ++ ++ RegAlignAddr = addr & ~(0x3); ++ ShiftByte = addr & (0x3); ++ TmpUlong = rtl8125_csi_other_fun_read(tp, 0, RegAlignAddr); ++ TmpUlong >>= (8*ShiftByte); ++ RetVal = (u8)TmpUlong; ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++ return RetVal; ++} ++ ++static void ++rtl8125_csi_fun0_write_byte(struct rtl8125_private *tp, ++ u32 addr, ++ u8 value) ++{ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) { ++ struct pci_dev *pdev = 
tp->pci_dev; ++ ++ pci_write_config_byte(pdev, addr, value); ++ } else { ++ u32 TmpUlong; ++ u16 RegAlignAddr; ++ u8 ShiftByte; ++ ++ RegAlignAddr = addr & ~(0x3); ++ ShiftByte = addr & (0x3); ++ TmpUlong = rtl8125_csi_other_fun_read(tp, 0, RegAlignAddr); ++ TmpUlong &= ~(0xFF << (8*ShiftByte)); ++ TmpUlong |= (value << (8*ShiftByte)); ++ rtl8125_csi_other_fun_write(tp, 0, RegAlignAddr, TmpUlong); ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++} ++ ++u32 rtl8125_eri_read_with_oob_base_address(struct rtl8125_private *tp, int addr, int len, int type, const u32 base_address) ++{ ++ int i, val_shift, shift = 0; ++ u32 value1 = 0, value2 = 0, mask; ++ u32 eri_cmd; ++ const u32 transformed_base_address = ((base_address & 0x00FFF000) << 6) | (base_address & 0x000FFF); ++ ++ if (len > 4 || len <= 0) ++ return -1; ++ ++ while (len > 0) { ++ val_shift = addr % ERIAR_Addr_Align; ++ addr = addr & ~0x3; ++ ++ eri_cmd = ERIAR_Read | ++ transformed_base_address | ++ type << ERIAR_Type_shift | ++ ERIAR_ByteEn << ERIAR_ByteEn_shift | ++ (addr & 0x0FFF); ++ if (addr & 0xF000) { ++ u32 tmp; ++ ++ tmp = addr & 0xF000; ++ tmp >>= 12; ++ eri_cmd |= (tmp << 20) & 0x00F00000; ++ } ++ ++ RTL_W32(tp, ERIAR, eri_cmd); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed ERI read */ ++ if (RTL_R32(tp, ERIAR) & ERIAR_Flag) ++ break; ++ } ++ ++ if (len == 1) mask = (0xFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else if (len == 2) mask = (0xFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else if (len == 3) mask = (0xFFFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else mask = (0xFFFFFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ ++ value1 = RTL_R32(tp, ERIDR) & mask; ++ value2 |= (value1 >> val_shift * 8) << shift * 8; ++ ++ if (len <= 4 - val_shift) { ++ len = 0; ++ } else { ++ len -= (4 - val_shift); ++ shift = 4 - val_shift; ++ addr += 4; ++ } ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++ return value2; ++} ++ ++u32 rtl8125_eri_read(struct rtl8125_private *tp, int addr, int len, int type) ++{ ++ return rtl8125_eri_read_with_oob_base_address(tp, addr, len, type, 0); ++} ++ ++int rtl8125_eri_write_with_oob_base_address(struct rtl8125_private *tp, int addr, int len, u32 value, int type, const u32 base_address) ++{ ++ int i, val_shift, shift = 0; ++ u32 value1 = 0, mask; ++ u32 eri_cmd; ++ const u32 transformed_base_address = ((base_address & 0x00FFF000) << 6) | (base_address & 0x000FFF); ++ ++ if (len > 4 || len <= 0) ++ return -1; ++ ++ while (len > 0) { ++ val_shift = addr % ERIAR_Addr_Align; ++ addr = addr & ~0x3; ++ ++ if (len == 1) mask = (0xFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else if (len == 2) mask = (0xFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else if (len == 3) mask = (0xFFFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else mask = (0xFFFFFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ ++ value1 = rtl8125_eri_read_with_oob_base_address(tp, addr, 4, type, base_address) & ~mask; ++ value1 |= ((value << val_shift * 8) >> shift * 8); ++ ++ RTL_W32(tp, ERIDR, value1); ++ ++ eri_cmd = ERIAR_Write | ++ transformed_base_address | ++ type << ERIAR_Type_shift | ++ ERIAR_ByteEn << ERIAR_ByteEn_shift | ++ (addr & 0x0FFF); ++ if (addr & 0xF000) { ++ u32 tmp; ++ ++ tmp = addr & 0xF000; ++ tmp >>= 12; ++ eri_cmd |= (tmp << 20) & 0x00F00000; ++ } ++ ++ RTL_W32(tp, ERIAR, eri_cmd); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed ERI write */ ++ if (!(RTL_R32(tp, ERIAR) & 
ERIAR_Flag)) ++ break; ++ } ++ ++ if (len <= 4 - val_shift) { ++ len = 0; ++ } else { ++ len -= (4 - val_shift); ++ shift = 4 - val_shift; ++ addr += 4; ++ } ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++ return 0; ++} ++ ++int rtl8125_eri_write(struct rtl8125_private *tp, int addr, int len, u32 value, int type) ++{ ++ return rtl8125_eri_write_with_oob_base_address(tp, addr, len, value, type, NO_BASE_ADDRESS); ++} ++ ++static void ++rtl8125_enable_rxdvgate(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) | BIT_3); ++} ++ ++static void ++rtl8125_disable_rxdvgate(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) & ~BIT_3); ++} ++ ++static u8 ++rtl8125_is_gpio_low(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u8 gpio_low = FALSE; ++ ++ switch (tp->HwSuppCheckPhyDisableModeVer) { ++ case 3: ++ if (!(rtl8125_mac_ocp_read(tp, 0xDC04) & BIT_13)) ++ gpio_low = TRUE; ++ break; ++ } ++ ++ if (gpio_low) ++ dprintk("gpio is low.\n"); ++ ++ return gpio_low; ++} ++ ++static u8 ++rtl8125_is_phy_disable_mode_enabled(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u8 phy_disable_mode_enabled = FALSE; ++ ++ switch (tp->HwSuppCheckPhyDisableModeVer) { ++ case 3: ++ if (RTL_R8(tp, 0xF2) & BIT_5) ++ phy_disable_mode_enabled = TRUE; ++ break; ++ } ++ ++ if (phy_disable_mode_enabled) ++ dprintk("phy disable mode enabled.\n"); ++ ++ return phy_disable_mode_enabled; ++} ++ ++static u8 ++rtl8125_is_in_phy_disable_mode(struct net_device *dev) ++{ ++ u8 in_phy_disable_mode = FALSE; ++ ++ if (rtl8125_is_phy_disable_mode_enabled(dev) && rtl8125_is_gpio_low(dev)) ++ in_phy_disable_mode = TRUE; ++ ++ if (in_phy_disable_mode) ++ dprintk("Hardware is in phy disable mode.\n"); ++ ++ return in_phy_disable_mode; ++} ++ ++static bool ++rtl8125_stop_all_request(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ for (i = 0; i < 20; i++) { ++ udelay(10); ++ if (!(RTL_R8(tp, ChipCmd) & StopReq)) ++ break; ++ } ++ ++ if (i == 20) ++ return false; ++ break; ++ default: ++ udelay(200); ++ break; ++ } ++ ++ return true; ++} ++ ++static void ++rtl8125_clear_stop_all_request(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) & (CmdTxEnb | CmdRxEnb)); ++} ++ ++void ++rtl8125_wait_txrx_fifo_empty(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ /* Txfifo_empty require StopReq been set */ ++ for (i = 0; i < 3000; i++) { ++ udelay(50); ++ if ((RTL_R8(tp, MCUCmd_reg) & (Txfifo_empty | Rxfifo_empty)) == (Txfifo_empty | Rxfifo_empty)) ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ for (i = 0; i < 3000; i++) { ++ udelay(50); ++ if ((RTL_R16(tp, IntrMitigate) & (BIT_0 | BIT_1 | BIT_8)) == (BIT_0 | BIT_1 | BIT_8)) ++ break; ++ } ++ break; ++ } ++} ++ ++#ifdef ENABLE_DASH_SUPPORT ++ ++static inline void ++rtl8125_enable_dash2_interrupt(struct rtl8125_private *tp) ++{ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ if (!tp->DASH) ++ 
return; ++ ++ rtl8125_set_ipc2_soc_imr_bit(tp, RISC_IPC2_INTR); ++} ++ ++static inline void ++rtl8125_disable_dash2_interrupt(struct rtl8125_private *tp) ++{ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ rtl8125_clear_ipc2_soc_imr_bit(tp, RISC_IPC2_INTR); ++} ++#endif ++ ++void ++rtl8125_enable_hw_linkchg_interrupt(struct rtl8125_private *tp) ++{ ++ switch (tp->HwCurrIsrVer) { ++ case 7: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V7_LINKCHG); ++ break; ++ case 5: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V5_LINKCHG); ++ break; ++ case 4: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V4_LINKCHG); ++ break; ++ case 2: ++ case 3: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V2_LINKCHG); ++ break; ++ case 1: ++ RTL_W32(tp, tp->imr_reg[0], LinkChg | RTL_R32(tp, tp->imr_reg[0])); ++ break; ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if (tp->DASH) ++ rtl8125_enable_dash2_interrupt(tp); ++#endif ++} ++ ++static inline void ++rtl8125_enable_hw_interrupt(struct rtl8125_private *tp) ++{ ++ switch (tp->HwCurrIsrVer) { ++ case 2: ++ case 3: ++ case 4: ++ case 5: ++ case 7: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, tp->intr_mask); ++ break; ++ case 1: ++ RTL_W32(tp, tp->imr_reg[0], tp->intr_mask); ++ ++ if (R8125_MULTI_RX_Q(tp)) { ++ int i; ++ for (i=1; i<tp->num_rx_rings; i++) ++ RTL_W16(tp, tp->imr_reg[i], other_q_intr_mask); ++ } ++ break; ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if (tp->DASH) ++ rtl8125_enable_dash2_interrupt(tp); ++#endif ++} ++ ++static inline void rtl8125_clear_hw_isr_v2(struct rtl8125_private *tp, ++ u32 message_id) ++{ ++ RTL_W32(tp, ISR_V2_8125, BIT(message_id)); ++} ++ ++static inline void ++rtl8125_disable_hw_interrupt(struct rtl8125_private *tp) ++{ ++ if (tp->HwCurrIsrVer > 1) { ++ RTL_W32(tp, IMR_V2_CLEAR_REG_8125, 0xFFFFFFFF); ++ if (tp->HwCurrIsrVer > 3) ++ RTL_W32(tp, IMR_V4_L2_CLEAR_REG_8125, 0xFFFFFFFF); ++ } else { ++ RTL_W32(tp, tp->imr_reg[0], 0x0000); ++ ++ if (R8125_MULTI_RX_Q(tp)) { ++ int i; ++ for (i=1; i<tp->num_rx_rings; i++) ++ RTL_W16(tp, tp->imr_reg[i], 0); ++ } ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_disable_dash2_interrupt(tp); ++#endif ++} ++ ++static inline void ++rtl8125_switch_to_hw_interrupt(struct rtl8125_private *tp) ++{ ++ RTL_W32(tp, TIMER_INT0_8125, 0x0000); ++ ++ rtl8125_enable_hw_interrupt(tp); ++} ++ ++static inline void ++rtl8125_switch_to_timer_interrupt(struct rtl8125_private *tp) ++{ ++ if (tp->use_timer_interrupt) { ++ RTL_W32(tp, TIMER_INT0_8125, timer_count); ++ RTL_W32(tp, TCTR0_8125, timer_count); ++ RTL_W32(tp, tp->imr_reg[0], tp->timer_intr_mask); ++ } else { ++ rtl8125_switch_to_hw_interrupt(tp); ++ } ++} ++ ++static void ++rtl8125_irq_mask_and_ack(struct rtl8125_private *tp) ++{ ++ rtl8125_disable_hw_interrupt(tp); ++ ++ if (tp->HwCurrIsrVer > 1) { ++ RTL_W32(tp, ISR_V2_8125, 0xFFFFFFFF); ++ if (tp->HwCurrIsrVer > 3) ++ RTL_W32(tp, ISR_V4_L2_8125, 0xFFFFFFFF); ++ } else { ++ RTL_W32(tp, tp->isr_reg[0], RTL_R32(tp, tp->isr_reg[0])); ++ ++ if (R8125_MULTI_RX_Q(tp)) { ++ int i; ++ for (i=1; i<tp->num_rx_rings; i++) ++ RTL_W16(tp, tp->isr_reg[i], RTL_R16(tp, tp->isr_reg[i])); ++ } ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_clear_ipc2_isr(tp); ++#endif ++} ++ ++static void ++rtl8125_disable_rx_packet_filter(struct rtl8125_private *tp) ++{ ++ ++ RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) & ++ ~(AcceptErr | AcceptRunt |AcceptBroadcast | AcceptMulticast | ++ AcceptMyPhys | AcceptAllPhys)); ++} ++ ++static void ++rtl8125_nic_reset(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++
rtl8125_disable_rx_packet_filter(tp); ++ ++ rtl8125_enable_rxdvgate(dev); ++ ++ rtl8125_stop_all_request(dev); ++ ++ rtl8125_wait_txrx_fifo_empty(dev); ++ ++ rtl8125_clear_stop_all_request(dev); ++ ++ /* Soft reset the chip. */ ++ RTL_W8(tp, ChipCmd, CmdReset); ++ ++ /* Check that the chip has finished the reset. */ ++ for (i = 100; i > 0; i--) { ++ udelay(100); ++ if ((RTL_R8(tp, ChipCmd) & CmdReset) == 0) ++ break; ++ } ++ ++ /* reset rcr */ ++ RTL_W32(tp, RxConfig, (RX_DMA_BURST_512 << RxCfgDMAShift)); ++} ++ ++static void ++rtl8125_hw_set_interrupt_type(struct rtl8125_private *tp, u8 isr_ver) ++{ ++ u8 tmp; ++ ++ if (tp->HwSuppIsrVer < 2) ++ return; ++ ++ tmp = RTL_R8(tp, INT_CFG0_8125); ++ ++ switch (tp->HwSuppIsrVer) { ++ case 7: ++ tmp &= ~INT_CFG0_AVOID_MISS_INTR; ++ fallthrough; ++ case 4: ++ case 5: ++ if (tp->HwSuppIsrVer == 7) ++ tmp &= ~INT_CFG0_AUTO_CLEAR_IMR; ++ else ++ tmp &= ~INT_CFG0_MSIX_ENTRY_NUM_MODE; ++ fallthrough; ++ case 2: ++ case 3: ++ tmp &= ~(INT_CFG0_ENABLE_8125); ++ if (isr_ver > 1) ++ tmp |= INT_CFG0_ENABLE_8125; ++ break; ++ default: ++ return; ++ } ++ ++ RTL_W8(tp, INT_CFG0_8125, tmp); ++} ++ ++static void ++rtl8125_hw_clear_timer_int(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W32(tp, TIMER_INT0_8125, 0x0000); ++ RTL_W32(tp, TIMER_INT1_8125, 0x0000); ++ RTL_W32(tp, TIMER_INT2_8125, 0x0000); ++ RTL_W32(tp, TIMER_INT3_8125, 0x0000); ++} ++ ++static void ++rtl8125_hw_clear_int_miti(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ switch (tp->HwSuppIntMitiVer) { ++ case 3: ++ case 6: ++ //IntMITI_0-IntMITI_31 ++ for (i=0xA00; i<0xB00; i+=4) ++ RTL_W32(tp, i, 0x0000); ++ break; ++ case 4: ++ case 5: ++ //IntMITI_0-IntMITI_15 ++ for (i = 0xA00; i < 0xA80; i += 4) ++ RTL_W32(tp, i, 0x0000); ++ ++ if (tp->HwSuppIntMitiVer == 5) ++ RTL_W8(tp, INT_CFG0_8125, RTL_R8(tp, INT_CFG0_8125) & ++ ~(INT_CFG0_TIMEOUT0_BYPASS_8125 | ++ INT_CFG0_MITIGATION_BYPASS_8125 | ++ INT_CFG0_RDU_BYPASS_8126)); ++ else ++ RTL_W8(tp, INT_CFG0_8125, RTL_R8(tp, INT_CFG0_8125) & ++ ~(INT_CFG0_TIMEOUT0_BYPASS_8125 | INT_CFG0_MITIGATION_BYPASS_8125)); ++ ++ RTL_W16(tp, INT_CFG1_8125, 0x0000); ++ break; ++ } ++} ++ ++static bool ++rtl8125_vec_2_tx_q_num( ++ struct rtl8125_private *tp, ++ u32 messageId, ++ u32 *qnum ++) ++{ ++ u32 whichQ = 0xffffffff; ++ bool rc = false; ++ ++ switch (tp->HwSuppIsrVer) { ++ case 2: ++ if (messageId == 0x10) ++ whichQ = 0; ++ else if (messageId == 0x12 && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ case 3: ++ case 4: ++ if (messageId == 0x00) ++ whichQ = 0; ++ else if (messageId == 0x01 && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ case 5: ++ if (messageId == 0x10) ++ whichQ = 0; ++ else if (messageId == 0x11 && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ case 6: ++ if (messageId == 0x08) ++ whichQ = 0; ++ else if (messageId == 0x09 && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ case 7: ++ if (messageId == 0x1B) ++ whichQ = 0; ++ else if (messageId == 0x1C && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ } ++ ++ if (whichQ != 0xffffffff) { ++ *qnum = whichQ; ++ rc = true; ++ } ++ ++ return rc; ++} ++ ++static bool ++rtl8125_vec_2_rx_q_num( ++ struct rtl8125_private *tp, ++ u32 messageId, ++ u32 *qnum ++) ++{ ++ u32 whichQ = 0xffffffff; ++ bool rc = false; ++ ++ switch (tp->HwSuppIsrVer) { ++ case 2: ++ case 3: ++ case 4: ++ case 5: ++ case 6: ++ case 7: ++ if (messageId < tp->HwSuppNumRxQueues) ++ whichQ = messageId; ++ break; ++ } ++ ++ if 
(whichQ != 0xffffffff) { ++ *qnum = whichQ; ++ rc = true; ++ } ++ ++ return rc; ++} ++ ++void ++rtl8125_hw_set_timer_int(struct rtl8125_private *tp, ++ u32 message_id, ++ u8 timer_intmiti_val) ++{ ++ u32 qnum; ++ ++ switch (tp->HwSuppIntMitiVer) { ++ case 4: ++ case 5: ++ case 6: ++#ifdef ENABLE_LIB_SUPPORT ++ if (message_id < R8125_MAX_RX_QUEUES_VEC_V3) ++ timer_intmiti_val = 0; ++#else ++ if ((tp->HwCurrIsrVer == 2) && (message_id < R8125_MAX_RX_QUEUES_VEC_V3)) ++ timer_intmiti_val = 0; ++#endif //ENABLE_LIB_SUPPORT ++ //ROK ++ if (rtl8125_vec_2_rx_q_num(tp, message_id, &qnum)) ++ RTL_W8(tp,INT_MITI_V2_0_RX + 8 * qnum, timer_intmiti_val); ++ //TOK ++ if (rtl8125_vec_2_tx_q_num(tp, message_id, &qnum)) ++ RTL_W8(tp,INT_MITI_V2_0_TX + 8 * qnum, timer_intmiti_val); ++ break; ++ } ++} ++ ++void ++rtl8125_hw_reset(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_lib_reset_prepare(tp); ++ ++ /* Disable interrupts */ ++ rtl8125_irq_mask_and_ack(tp); ++ ++ rtl8125_hw_clear_timer_int(dev); ++ ++ rtl8125_nic_reset(dev); ++} ++ ++static unsigned int ++rtl8125_xmii_reset_pending(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ unsigned int retval; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ retval = rtl8125_mdio_read(tp, MII_BMCR) & BMCR_RESET; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return retval; ++} ++ ++static unsigned int ++_rtl8125_xmii_link_ok(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 status; ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status == UINT_MAX) ++ return 0; ++ ++ return (status & LinkStatus) ? 1 : 0; ++} ++ ++static unsigned int ++rtl8125_xmii_link_ok(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned int link_state; ++ ++ link_state = _rtl8125_xmii_link_ok(dev); ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp) && ++ link_state == R8125_LINK_STATE_ON) ++ return rtl8125_fiber_link_ok(dev); ++#else ++ (void)tp; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ return link_state; ++} ++ ++static int ++rtl8125_wait_phy_reset_complete(struct rtl8125_private *tp) ++{ ++ int i, val; ++ ++ for (i = 0; i < 2500; i++) { ++ val = rtl8125_mdio_read(tp, MII_BMCR) & BMCR_RESET; ++ if (!val) ++ return 0; ++ ++ mdelay(1); ++ } ++ ++ return -1; ++} ++ ++static void ++rtl8125_xmii_reset_enable(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int ret; ++ ++ if (rtl8125_is_in_phy_disable_mode(dev)) ++ return; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ rtl8125_mdio_write(tp, MII_ADVERTISE, rtl8125_mdio_read(tp, MII_ADVERTISE) & ++ ~(ADVERTISE_10HALF | ADVERTISE_10FULL | ++ ADVERTISE_100HALF | ADVERTISE_100FULL)); ++ rtl8125_mdio_write(tp, MII_CTRL1000, rtl8125_mdio_read(tp, MII_CTRL1000) & ++ ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL)); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA5D4, rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D4) & ++ ~RTK_ADVERTISE_2500FULL); ++ rtl8125_mdio_write(tp, MII_BMCR, BMCR_RESET | BMCR_ANENABLE); ++ ++ ret = rtl8125_wait_phy_reset_complete(tp); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ if (ret != 0 && netif_msg_link(tp)) ++ printk(KERN_ERR "%s: PHY reset failed.\n", dev->name); ++} ++ ++void ++rtl8125_init_ring_indexes(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ 
struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ ring->dirty_tx = ring->cur_tx = 0; ++ ring->NextHwDesCloPtr = 0; ++ ring->BeginHwDesCloPtr = 0; ++ ring->index = i; ++ ring->priv = tp; ++ ring->netdev = tp->dev; ++ ++ /* reset BQL for queue */ ++ netdev_tx_reset_queue(txring_txq(ring)); ++ } ++ ++ for (i = 0; i < tp->HwSuppNumRxQueues; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ ring->dirty_rx = ring->cur_rx = 0; ++ ring->index = i; ++ ring->priv = tp; ++ ring->netdev = tp->dev; ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ struct rtl8125_ring *ring = &tp->lib_tx_ring[i]; ++ ring->direction = RTL8125_CH_DIR_TX; ++ ring->queue_num = i; ++ ring->private = tp; ++ } ++ ++ for (i = 0; i < tp->HwSuppNumRxQueues; i++) { ++ struct rtl8125_ring *ring = &tp->lib_rx_ring[i]; ++ ring->direction = RTL8125_CH_DIR_RX; ++ ring->queue_num = i; ++ ring->private = tp; ++ } ++#endif ++} ++ ++static void ++rtl8125_issue_offset_99_event(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xE09A, rtl8125_mac_ocp_read(tp, 0xE09A) | BIT_0); ++} ++ ++#ifdef ENABLE_DASH_SUPPORT ++static void ++rtl8125_check_and_enable_dash_interrupt(struct rtl8125_private *tp) ++{ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ if (!tp->DASH) ++ return; ++ ++ // ++ // even disconnected, enable dash interrupt mask bits for in-band/out-band communication ++ // ++ rtl8125_enable_dash2_interrupt(tp); ++ if (tp->HwCurrIsrVer > 1) { ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V4_LAYER2_INTR_STS); ++ RTL_W32(tp, IMR_V4_L2_SET_REG_8125, ISRIMR_V4_L2_IPC2); ++ } else { ++ RTL_W16(tp, tp->imr_reg[0], ISRIMR_DASH_INTR_EN); ++ } ++} ++#endif ++ ++static int rtl8125_enable_eee_plus(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xE080, rtl8125_mac_ocp_read(tp, 0xE080)|BIT_1); ++ ++ return 0; ++} ++ ++static int rtl8125_disable_eee_plus(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xE080, rtl8125_mac_ocp_read(tp, 0xE080)&~BIT_1); ++ ++ return 0; ++} ++ ++static void rtl8125_enable_double_vlan(struct rtl8125_private *tp) ++{ ++ RTL_W16(tp, DOUBLE_VLAN_CONFIG, 0xf002); ++} ++ ++static void rtl8125_disable_double_vlan(struct rtl8125_private *tp) ++{ ++ RTL_W16(tp, DOUBLE_VLAN_CONFIG, 0); ++} ++ ++static void ++rtl8125_set_pfm_patch(struct rtl8125_private *tp, bool enable) ++{ ++ if (!tp->RequiredPfmPatch) ++ goto exit; ++ ++ if (enable) { ++ rtl8125_set_mac_ocp_bit(tp, 0xD3F0, BIT_0); ++ rtl8125_set_mac_ocp_bit(tp, 0xD3F2, BIT_0); ++ rtl8125_set_mac_ocp_bit(tp, 0xE85A, BIT_6); ++ } else { ++ rtl8125_clear_mac_ocp_bit(tp, 0xD3F0, BIT_0); ++ rtl8125_clear_mac_ocp_bit(tp, 0xD3F2, BIT_0); ++ rtl8125_clear_mac_ocp_bit(tp, 0xE85A, BIT_6); ++ } ++ ++exit: ++ return; ++} ++ ++static void ++rtl8125_link_on_patch(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ u32 status; ++ ++ rtl8125_hw_config(dev); ++ ++ if ((tp->mcfg == CFG_METHOD_2) && ++ netif_running(dev)) { ++ if (rtl8125_get_phy_status(tp)&FullDup) ++ RTL_W32(tp, TxConfig, (RTL_R32(tp, TxConfig) | (BIT_24 | BIT_25)) & ~BIT_19); ++ else ++ RTL_W32(tp, TxConfig, (RTL_R32(tp, TxConfig) | BIT_25) & ~(BIT_19 | BIT_24)); ++ } ++ ++ status = rtl8125_get_phy_status(tp); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ if (status & _10bps) ++ rtl8125_enable_eee_plus(tp); ++ 
break; ++ default: ++ break; ++ } ++ ++ if (tp->RequiredPfmPatch) ++ rtl8125_set_pfm_patch(tp, (status & _10bps) ? 1 : 0); ++ ++ rtl8125_hw_start(dev); ++ ++ netif_carrier_on(dev); ++ ++ netif_tx_wake_all_queues(dev); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ tp->phy_reg_aner = rtl8125_mdio_read(tp, MII_EXPANSION); ++ tp->phy_reg_anlpar = rtl8125_mdio_read(tp, MII_LPA); ++ tp->phy_reg_gbsr = rtl8125_mdio_read(tp, MII_STAT1000); ++ tp->phy_reg_status_2500 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D6); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->HwSuppPtpVer == 3) ++ rtl8125_set_phy_local_time(tp); ++#endif // ENABLE_PTP_SUPPORT ++} ++ ++static void ++rtl8125_link_down_patch(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ tp->phy_reg_aner = 0; ++ tp->phy_reg_anlpar = 0; ++ tp->phy_reg_gbsr = 0; ++ tp->phy_reg_status_2500 = 0; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ rtl8125_disable_eee_plus(tp); ++ break; ++ default: ++ break; ++ } ++ ++ if (tp->RequiredPfmPatch) ++ rtl8125_set_pfm_patch(tp, 1); ++ ++ netif_carrier_off(dev); ++ ++ netif_tx_disable(dev); ++ ++ rtl8125_hw_reset(dev); ++ ++ rtl8125_tx_clear(tp); ++ ++ rtl8125_rx_clear(tp); ++ ++ rtl8125_init_ring(dev); ++ ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++ ++ //rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_check_and_enable_dash_interrupt(tp); ++#endif ++} ++ ++static void ++_rtl8125_check_link_status(struct net_device *dev, unsigned int link_state) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (link_state != R8125_LINK_STATE_OFF && ++ link_state != R8125_LINK_STATE_ON) ++ link_state = tp->link_ok(dev); ++ ++ if (link_state == R8125_LINK_STATE_ON) { ++ rtl8125_link_on_patch(dev); ++ ++ if (netif_msg_ifup(tp)) ++ printk(KERN_INFO PFX "%s: link up\n", dev->name); ++ } else { ++ if (netif_msg_ifdown(tp)) ++ printk(KERN_INFO PFX "%s: link down\n", dev->name); ++ ++ rtl8125_link_down_patch(dev); ++ } ++} ++ ++static void ++rtl8125_check_link_status(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned int link_status_on; ++ ++ tp->resume_not_chg_speed = 0; ++ ++ link_status_on = tp->link_ok(dev); ++ if (netif_carrier_ok(dev) == link_status_on) ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++ else ++ _rtl8125_check_link_status(dev, link_status_on); ++} ++ ++static bool ++rtl8125_is_autoneg_mode_valid(u32 autoneg) ++{ ++ switch(autoneg) { ++ case AUTONEG_ENABLE: ++ case AUTONEG_DISABLE: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static bool ++rtl8125_is_speed_mode_valid(u32 speed) ++{ ++ switch(speed) { ++ case SPEED_2500: ++ case SPEED_1000: ++ case SPEED_100: ++ case SPEED_10: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static bool ++rtl8125_is_duplex_mode_valid(u8 duplex) ++{ ++ switch(duplex) { ++ case DUPLEX_FULL: ++ case DUPLEX_HALF: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static void ++rtl8125_set_link_option(struct rtl8125_private *tp, ++ u8 autoneg, ++ u32 speed, ++ u8 duplex, ++ enum rtl8125_fc_mode fc) ++{ ++ u64 adv; ++ ++ if (!rtl8125_is_speed_mode_valid(speed)) ++ 
speed = SPEED_2500; ++ ++ if (!rtl8125_is_duplex_mode_valid(duplex)) ++ duplex = DUPLEX_FULL; ++ ++ if (!rtl8125_is_autoneg_mode_valid(autoneg)) ++ autoneg = AUTONEG_ENABLE; ++ ++ speed = min(speed, tp->HwSuppMaxPhyLinkSpeed); ++ ++ adv = 0; ++ switch(speed) { ++ case SPEED_2500: ++ adv |= ADVERTISED_2500baseX_Full; ++ fallthrough; ++ default: ++ adv |= (ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full | ++ ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full | ++ ADVERTISED_1000baseT_Half | ADVERTISED_1000baseT_Full); ++ break; ++ } ++ ++ tp->autoneg = autoneg; ++ tp->speed = speed; ++ tp->duplex = duplex; ++ tp->advertising = adv; ++ tp->fcpause = fc; ++} ++ ++/* ++static void ++rtl8125_enable_ocp_phy_power_saving(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 val; ++ ++ if (tp->mcfg == CFG_METHOD_2 || ++ tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) { ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC416); ++ if (val != 0x0050) { ++ rtl8125_set_phy_mcu_patch_request(tp); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xC416, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xC416, 0x0050); ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ } ++ } ++} ++*/ ++ ++static void ++rtl8125_disable_ocp_phy_power_saving(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 val; ++ ++ if (tp->mcfg == CFG_METHOD_2 || ++ tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) { ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC416); ++ if (val != 0x0500) { ++ rtl8125_set_phy_mcu_patch_request(tp); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xC416, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xC416, 0x0500); ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ } ++ } ++} ++ ++static void ++rtl8125_wait_ll_share_fifo_ready(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ for (i = 0; i < 10; i++) { ++ udelay(100); ++ if (RTL_R16(tp, 0xD2) & BIT_9) ++ break; ++ } ++} ++ ++static void ++rtl8125_disable_pci_offset_99(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xE032, rtl8125_mac_ocp_read(tp, 0xE032) & ~(BIT_0 | BIT_1)); ++ ++ rtl8125_csi_fun0_write_byte(tp, 0x99, 0x00); ++} ++ ++static void ++rtl8125_enable_pci_offset_99(struct rtl8125_private *tp) ++{ ++ u32 csi_tmp; ++ ++ rtl8125_csi_fun0_write_byte(tp, 0x99, tp->org_pci_offset_99); ++ ++ csi_tmp = rtl8125_mac_ocp_read(tp, 0xE032); ++ csi_tmp &= ~(BIT_0 | BIT_1); ++ if (tp->org_pci_offset_99 & (BIT_5 | BIT_6)) ++ csi_tmp |= BIT_1; ++ if (tp->org_pci_offset_99 & BIT_2) ++ csi_tmp |= BIT_0; ++ rtl8125_mac_ocp_write(tp, 0xE032, csi_tmp); ++} ++ ++static void ++rtl8125_init_pci_offset_99(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xCDD0, 0x9003); ++ rtl8125_set_mac_ocp_bit(tp, 0xE034, (BIT_15 | BIT_14)); ++ rtl8125_mac_ocp_write(tp, 0xCDD2, 0x889C); ++ rtl8125_mac_ocp_write(tp, 0xCDD8, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDD4, 0x8C30); ++ rtl8125_mac_ocp_write(tp, 0xCDDA, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDD6, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDDC, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDE8, 0x883E); ++ rtl8125_mac_ocp_write(tp, 0xCDEA, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDEC, 0x889C); ++ rtl8125_mac_ocp_write(tp, 0xCDEE, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDF0, 0x8C09); ++ rtl8125_mac_ocp_write(tp, 0xCDF2, 0x9003); ++ rtl8125_set_mac_ocp_bit(tp, 0xE032, BIT_14); ++ rtl8125_set_mac_ocp_bit(tp, 0xE0A2, BIT_0); ++ ++ rtl8125_enable_pci_offset_99(tp); ++} ++ ++static void 
++rtl8125_disable_pci_offset_180(struct rtl8125_private *tp) ++{ ++ rtl8125_clear_mac_ocp_bit(tp, 0xE092, 0x00FF); ++} ++ ++static void ++rtl8125_enable_pci_offset_180(struct rtl8125_private *tp) ++{ ++ rtl8125_clear_mac_ocp_bit(tp, 0xE094, 0xFF00); ++ ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE092, 0x00FF, BIT_2); ++} ++ ++static void ++rtl8125_init_pci_offset_180(struct rtl8125_private *tp) ++{ ++ rtl8125_enable_pci_offset_180(tp); ++} ++ ++static void ++rtl8125_set_pci_99_exit_driver_para(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->org_pci_offset_99 & BIT_2) ++ rtl8125_issue_offset_99_event(tp); ++ rtl8125_disable_pci_offset_99(tp); ++} ++ ++static void ++rtl8125_enable_cfg9346_write(struct rtl8125_private *tp) ++{ ++ RTL_W8(tp, Cfg9346, RTL_R8(tp, Cfg9346) | Cfg9346_Unlock); ++} ++ ++static void ++rtl8125_disable_cfg9346_write(struct rtl8125_private *tp) ++{ ++ RTL_W8(tp, Cfg9346, RTL_R8(tp, Cfg9346) & ~Cfg9346_Unlock); ++} ++ ++static void ++rtl8125_enable_exit_l1_mask(struct rtl8125_private *tp) ++{ ++ //(1)ERI(0xD4)(OCP 0xC0AC).bit[7:12]=6'b111111, L1 Mask ++ rtl8125_set_mac_ocp_bit(tp, 0xC0AC, (BIT_7 | BIT_8 | BIT_9 | BIT_10 | BIT_11 | BIT_12)); ++} ++ ++static void ++rtl8125_disable_exit_l1_mask(struct rtl8125_private *tp) ++{ ++ //(1)ERI(0xD4)(OCP 0xC0AC).bit[7:12]=6'b000000, L1 Mask ++ rtl8125_clear_mac_ocp_bit(tp, 0xC0AC, (BIT_7 | BIT_8 | BIT_9 | BIT_10 | BIT_11 | BIT_12)); ++} ++ ++static void ++rtl8125_enable_extend_tally_couter(struct rtl8125_private *tp) ++{ ++ switch (tp->HwSuppExtendTallyCounterVer) { ++ case 1: ++ rtl8125_set_mac_ocp_bit(tp, 0xEA84, (BIT_1 | BIT_0)); ++ break; ++ } ++} ++ ++static void ++rtl8125_disable_extend_tally_couter(struct rtl8125_private *tp) ++{ ++ switch (tp->HwSuppExtendTallyCounterVer) { ++ case 1: ++ rtl8125_clear_mac_ocp_bit(tp, 0xEA84, (BIT_1 | BIT_0)); ++ break; ++ } ++} ++ ++static void ++rtl8125_enable_force_clkreq(struct rtl8125_private *tp, bool enable) ++{ ++ if (enable) ++ RTL_W8(tp, 0xF1, RTL_R8(tp, 0xF1) | BIT_7); ++ else ++ RTL_W8(tp, 0xF1, RTL_R8(tp, 0xF1) & ~BIT_7); ++} ++ ++static void ++rtl8125_enable_aspm_clkreq_lock(struct rtl8125_private *tp, bool enable) ++{ ++ bool unlock_cfg_wr; ++ ++ if ((RTL_R8(tp, Cfg9346) & Cfg9346_EEM_MASK) == Cfg9346_Unlock) ++ unlock_cfg_wr = false; ++ else ++ unlock_cfg_wr = true; ++ ++ if (unlock_cfg_wr) ++ rtl8125_enable_cfg9346_write(tp); ++ ++ if (enable) { ++ RTL_W8(tp, Config2, RTL_R8(tp, Config2) | BIT_7); ++ RTL_W8(tp, Config5, RTL_R8(tp, Config5) | BIT_0); ++ } else { ++ RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~BIT_7); ++ RTL_W8(tp, Config5, RTL_R8(tp, Config5) & ~BIT_0); ++ } ++ ++ if (unlock_cfg_wr) ++ rtl8125_disable_cfg9346_write(tp); ++} ++ ++static void ++rtl8125_set_reg_oobs_en_sel(struct rtl8125_private *tp, bool enable) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ if (enable) ++ rtl8125_set_mac_ocp_bit(tp, 0xD434, BIT_1); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xD434, BIT_1); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++rtl8125_hw_d3_para(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W16(tp, RxMaxSize, RX_BUF_SIZE); ++ ++ rtl8125_enable_force_clkreq(tp, 0); ++ rtl8125_enable_aspm_clkreq_lock(tp, 0); ++ ++ rtl8125_disable_exit_l1_mask(tp); ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ rtl8125_set_realwow_d3_para(dev); ++#endif ++ ++ rtl8125_set_pci_99_exit_driver_para(dev); ++ ++ /*disable ocp phy power 
saving*/ ++ if (tp->mcfg == CFG_METHOD_2 || ++ tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) ++ rtl8125_disable_ocp_phy_power_saving(dev); ++ ++ rtl8125_disable_rxdvgate(dev); ++ ++ rtl8125_disable_extend_tally_couter(tp); ++ ++ rtl8125_set_reg_oobs_en_sel(tp, false); ++} ++ ++static void ++rtl8125_enable_magic_packet(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppMagicPktVer) { ++ case WAKEUP_MAGIC_PACKET_V3: ++ rtl8125_mac_ocp_write(tp, 0xC0B6, rtl8125_mac_ocp_read(tp, 0xC0B6) | BIT_0); ++ break; ++ } ++} ++static void ++rtl8125_disable_magic_packet(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppMagicPktVer) { ++ case WAKEUP_MAGIC_PACKET_V3: ++ rtl8125_mac_ocp_write(tp, 0xC0B6, rtl8125_mac_ocp_read(tp, 0xC0B6) & ~BIT_0); ++ break; ++ } ++} ++ ++static void ++rtl8125_enable_linkchg_wakeup(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppLinkChgWakeUpVer) { ++ case 3: ++ RTL_W8(tp, Config3, RTL_R8(tp, Config3) | LinkUp); ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE0C6, (BIT_5 | BIT_3 | BIT_2), (BIT_4 | BIT_1 | BIT_0)); ++ break; ++ } ++} ++ ++static void ++rtl8125_disable_linkchg_wakeup(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppLinkChgWakeUpVer) { ++ case 3: ++ RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~LinkUp); ++ if (!(rtl8125_mac_ocp_read(tp, 0xE0C6) & BIT_0)) ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE0C6, (BIT_5 | BIT_3 | BIT_2 | BIT_1), BIT_4); ++ break; ++ } ++} ++ ++#define WAKE_ANY (WAKE_PHY | WAKE_MAGIC | WAKE_UCAST | WAKE_BCAST | WAKE_MCAST) ++ ++static u32 ++rtl8125_get_hw_wol(struct rtl8125_private *tp) ++{ ++ u8 options; ++ u32 csi_tmp; ++ u32 wol_opts = 0; ++ ++ if (disable_wol_support) ++ goto out; ++ ++ options = RTL_R8(tp, Config1); ++ if (!(options & PMEnable)) ++ goto out; ++ ++ options = RTL_R8(tp, Config3); ++ if (options & LinkUp) ++ wol_opts |= WAKE_PHY; ++ ++ switch (tp->HwSuppMagicPktVer) { ++ case WAKEUP_MAGIC_PACKET_V3: ++ csi_tmp = rtl8125_mac_ocp_read(tp, 0xC0B6); ++ if (csi_tmp & BIT_0) ++ wol_opts |= WAKE_MAGIC; ++ break; ++ } ++ ++ options = RTL_R8(tp, Config5); ++ if (options & UWF) ++ wol_opts |= WAKE_UCAST; ++ if (options & BWF) ++ wol_opts |= WAKE_BCAST; ++ if (options & MWF) ++ wol_opts |= WAKE_MCAST; ++ ++out: ++ return wol_opts; ++} ++ ++static void ++rtl8125_enable_d0_speedup(struct rtl8125_private *tp) ++{ ++ u16 clearmask; ++ u16 setmask; ++ ++ if (FALSE == HW_SUPPORT_D0_SPEED_UP(tp)) ++ return; ++ ++ if (tp->D0SpeedUpSpeed == D0_SPEED_UP_SPEED_DISABLE) ++ return; ++ ++ if (tp->HwSuppD0SpeedUpVer == 1 || tp->HwSuppD0SpeedUpVer == 2) { ++ //speed up speed ++ clearmask = (BIT_10 | BIT_9 | BIT_8 | BIT_7); ++ if (tp->D0SpeedUpSpeed == D0_SPEED_UP_SPEED_2500) ++ setmask = BIT_7; ++ else ++ setmask = 0; ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE10A, clearmask, setmask); ++ ++ //speed up flowcontrol ++ clearmask = (BIT_15 | BIT_14); ++ if (tp->HwSuppD0SpeedUpVer == 2) ++ clearmask |= BIT_13; ++ ++ if (tp->fcpause == rtl8125_fc_full) { ++ setmask = (BIT_15 | BIT_14); ++ if (tp->HwSuppD0SpeedUpVer == 2) ++ setmask |= BIT_13; ++ } else ++ setmask = 0; ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE860, clearmask, setmask); ++ } ++ ++ RTL_W8(tp, 0xD0, RTL_R8(tp, 0xD0) | BIT_3); ++} ++ ++static void ++rtl8125_disable_d0_speedup(struct rtl8125_private *tp) ++{ ++ if (FALSE == HW_SUPPORT_D0_SPEED_UP(tp)) ++ return; ++ ++ RTL_W8(tp, 0xD0, 
RTL_R8(tp, 0xD0) & ~BIT_3); ++} ++ ++static void ++rtl8125_set_hw_wol(struct net_device *dev, u32 wolopts) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i,tmp; ++ static struct { ++ u32 opt; ++ u16 reg; ++ u8 mask; ++ } cfg[] = { ++ { WAKE_PHY, Config3, LinkUp }, ++ { WAKE_UCAST, Config5, UWF }, ++ { WAKE_BCAST, Config5, BWF }, ++ { WAKE_MCAST, Config5, MWF }, ++ { WAKE_ANY, Config5, LanWake }, ++ { WAKE_MAGIC, Config3, MagicPacket }, ++ }; ++ ++ switch (tp->HwSuppMagicPktVer) { ++ case WAKEUP_MAGIC_PACKET_V3: ++ default: ++ tmp = ARRAY_SIZE(cfg) - 1; ++ ++ if (wolopts & WAKE_MAGIC) ++ rtl8125_enable_magic_packet(dev); ++ else ++ rtl8125_disable_magic_packet(dev); ++ break; ++ } ++ ++ rtl8125_enable_cfg9346_write(tp); ++ ++ for (i = 0; i < tmp; i++) { ++ u8 options = RTL_R8(tp, cfg[i].reg) & ~cfg[i].mask; ++ if (wolopts & cfg[i].opt) ++ options |= cfg[i].mask; ++ RTL_W8(tp, cfg[i].reg, options); ++ } ++ ++ switch (tp->HwSuppLinkChgWakeUpVer) { ++ case 3: ++ if (wolopts & WAKE_PHY) ++ rtl8125_enable_linkchg_wakeup(dev); ++ else ++ rtl8125_disable_linkchg_wakeup(dev); ++ break; ++ } ++ ++ rtl8125_disable_cfg9346_write(tp); ++} ++ ++static void ++rtl8125_phy_restart_nway(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (rtl8125_is_in_phy_disable_mode(dev)) ++ return; ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, MII_BMCR, BMCR_ANENABLE | BMCR_ANRESTART); ++} ++ ++static void ++rtl8125_phy_setup_force_mode(struct net_device *dev, u32 speed, u8 duplex) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 bmcr_true_force = 0; ++ ++ if (rtl8125_is_in_phy_disable_mode(dev)) ++ return; ++ ++ if ((speed == SPEED_10) && (duplex == DUPLEX_HALF)) { ++ bmcr_true_force = BMCR_SPEED10; ++ } else if ((speed == SPEED_10) && (duplex == DUPLEX_FULL)) { ++ bmcr_true_force = BMCR_SPEED10 | BMCR_FULLDPLX; ++ } else if ((speed == SPEED_100) && (duplex == DUPLEX_HALF)) { ++ bmcr_true_force = BMCR_SPEED100; ++ } else if ((speed == SPEED_100) && (duplex == DUPLEX_FULL)) { ++ bmcr_true_force = BMCR_SPEED100 | BMCR_FULLDPLX; ++ } else { ++ netif_err(tp, drv, dev, "Failed to set phy force mode!\n"); ++ return; ++ } ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, MII_BMCR, bmcr_true_force); ++} ++ ++static void ++rtl8125_set_pci_pme(struct rtl8125_private *tp, int set) ++{ ++ struct pci_dev *pdev = tp->pci_dev; ++ u16 pmc; ++ ++ if (!pdev->pm_cap) ++ return; ++ ++ pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmc); ++ pmc |= PCI_PM_CTRL_PME_STATUS; ++ if (set) ++ pmc |= PCI_PM_CTRL_PME_ENABLE; ++ else ++ pmc &= ~PCI_PM_CTRL_PME_ENABLE; ++ pci_write_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, pmc); ++} ++ ++static void ++rtl8125_enable_giga_lite(struct rtl8125_private *tp, u64 adv) ++{ ++ if (adv & ADVERTISED_1000baseT_Full) ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA428, BIT_9); ++ else ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_9); ++ ++ if (adv & ADVERTISED_2500baseX_Full) ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA5EA, BIT_0); ++ else ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5EA, BIT_0); ++} ++ ++static void ++rtl8125_disable_giga_lite(struct rtl8125_private *tp) ++{ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_9); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5EA, BIT_0); ++} ++ ++static int ++rtl8125_set_wol_link_speed(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int auto_nego = 0; ++ int giga_ctrl; ++ int ctrl_2500; ++ u64 adv; ++ u16 anlpar; ++ 
u16 gbsr; ++ u16 status_2500; ++ u16 aner; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (tp->autoneg != AUTONEG_ENABLE) ++ goto exit; ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ ++ auto_nego = rtl8125_mdio_read(tp, MII_ADVERTISE); ++ auto_nego &= ~(ADVERTISE_10HALF | ADVERTISE_10FULL ++ | ADVERTISE_100HALF | ADVERTISE_100FULL); ++ ++ giga_ctrl = rtl8125_mdio_read(tp, MII_CTRL1000); ++ giga_ctrl &= ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL); ++ ++ ctrl_2500 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D4); ++ ctrl_2500 &= ~RTK_ADVERTISE_2500FULL; ++ ++ aner = tp->phy_reg_aner; ++ anlpar = tp->phy_reg_anlpar; ++ gbsr = tp->phy_reg_gbsr; ++ status_2500 = tp->phy_reg_status_2500; ++ if (tp->link_ok(dev)) { ++ aner = rtl8125_mdio_read(tp, MII_EXPANSION); ++ anlpar = rtl8125_mdio_read(tp, MII_LPA); ++ gbsr = rtl8125_mdio_read(tp, MII_STAT1000); ++ status_2500 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D6); ++ } ++ ++ adv = tp->advertising; ++ if ((aner | anlpar | gbsr | status_2500) == 0) { ++ int auto_nego_tmp = 0; ++ if (adv & ADVERTISED_10baseT_Half) ++ auto_nego_tmp |= ADVERTISE_10HALF; ++ if (adv & ADVERTISED_10baseT_Full) ++ auto_nego_tmp |= ADVERTISE_10FULL; ++ if (adv & ADVERTISED_100baseT_Half) ++ auto_nego_tmp |= ADVERTISE_100HALF; ++ if (adv & ADVERTISED_100baseT_Full) ++ auto_nego_tmp |= ADVERTISE_100FULL; ++ ++ if (auto_nego_tmp == 0) ++ goto exit; ++ ++ auto_nego |= auto_nego_tmp; ++ goto skip_check_lpa; ++ } ++ if (!(aner & EXPANSION_NWAY)) ++ goto exit; ++ ++ if ((adv & ADVERTISED_10baseT_Half) && (anlpar & LPA_10HALF)) ++ auto_nego |= ADVERTISE_10HALF; ++ else if ((adv & ADVERTISED_10baseT_Full) && (anlpar & LPA_10FULL)) ++ auto_nego |= ADVERTISE_10FULL; ++ else if ((adv & ADVERTISED_100baseT_Half) && (anlpar & LPA_100HALF)) ++ auto_nego |= ADVERTISE_100HALF; ++ else if ((adv & ADVERTISED_100baseT_Full) && (anlpar & LPA_100FULL)) ++ auto_nego |= ADVERTISE_100FULL; ++ else if (adv & ADVERTISED_1000baseT_Half && (gbsr & LPA_1000HALF)) ++ giga_ctrl |= ADVERTISE_1000HALF; ++ else if (adv & ADVERTISED_1000baseT_Full && (gbsr & LPA_1000FULL)) ++ giga_ctrl |= ADVERTISE_1000FULL; ++ else if (adv & ADVERTISED_2500baseX_Full && (status_2500 & RTK_LPA_ADVERTISE_2500FULL)) ++ ctrl_2500 |= RTK_ADVERTISE_2500FULL; ++ else ++ goto exit; ++ ++skip_check_lpa: ++ if (tp->DASH) ++ auto_nego |= (ADVERTISE_100FULL | ADVERTISE_100HALF | ADVERTISE_10HALF | ADVERTISE_10FULL); ++ ++#ifdef CONFIG_DOWN_SPEED_100 ++ auto_nego |= (ADVERTISE_100FULL | ADVERTISE_100HALF | ADVERTISE_10HALF | ADVERTISE_10FULL); ++#endif ++ ++ rtl8125_mdio_write(tp, MII_ADVERTISE, auto_nego); ++ rtl8125_mdio_write(tp, MII_CTRL1000, giga_ctrl); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA5D4, ctrl_2500); ++ ++ rtl8125_disable_giga_lite(tp); ++ ++ rtl8125_phy_restart_nway(dev); ++ ++exit: ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return auto_nego; ++} ++ ++static bool ++rtl8125_keep_wol_link_speed(struct net_device *dev, u8 from_suspend) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (from_suspend && tp->link_ok(dev) && (tp->wol_opts & WAKE_PHY)) ++ return 1; ++ ++ if (!from_suspend && tp->resume_not_chg_speed) ++ return 1; ++ ++ return 0; ++} ++static void ++rtl8125_powerdown_pll(struct net_device *dev, u8 from_suspend) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ /* Reboot not set wol link speed */ ++ if (system_state == SYSTEM_RESTART) ++ return; ++ ++ tp->check_keep_link_speed = 0; ++ if (tp->wol_enabled == WOL_ENABLED || tp->DASH || tp->EnableKCPOffload) { ++ int auto_nego; ++ 
++ rtl8125_set_hw_wol(dev, tp->wol_opts); ++ ++ rtl8125_enable_cfg9346_write(tp); ++ RTL_W8(tp, Config2, RTL_R8(tp, Config2) | PMSTS_En); ++ rtl8125_disable_cfg9346_write(tp); ++ ++ /* Enable the PME and clear the status */ ++ rtl8125_set_pci_pme(tp, 1); ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ return; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ if (rtl8125_keep_wol_link_speed(dev, from_suspend)) { ++ tp->check_keep_link_speed = 1; ++ } else { ++ if (tp->D0SpeedUpSpeed != D0_SPEED_UP_SPEED_DISABLE) { ++ rtl8125_enable_d0_speedup(tp); ++ tp->check_keep_link_speed = 1; ++ } ++ ++ auto_nego = rtl8125_set_wol_link_speed(dev); ++ ++ if (tp->RequiredPfmPatch) ++ rtl8125_set_pfm_patch(tp, ++ (auto_nego & (ADVERTISE_10HALF | ADVERTISE_10FULL)) ? ++ 1 : 0); ++ } ++ ++ RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | AcceptBroadcast | AcceptMulticast | AcceptMyPhys); ++ ++ return; ++ } ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ return; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ if (tp->DASH) ++ return; ++ ++ rtl8125_phy_power_down(dev); ++ ++ if (!tp->HwIcVerUnknown) ++ RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~BIT_7); ++ ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) & ~BIT_6); ++} ++ ++static void rtl8125_powerup_pll(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | BIT_7 | BIT_6); ++ ++ if (tp->resume_not_chg_speed) ++ return; ++ ++ rtl8125_phy_power_up(dev); ++} ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static void ++rtl8125_get_wol(struct net_device *dev, ++ struct ethtool_wolinfo *wol) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u8 options; ++ ++ wol->wolopts = 0; ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT || disable_wol_support) { ++ wol->supported = 0; ++ return; ++ } else { ++ wol->supported = WAKE_ANY; ++ } ++ ++ options = RTL_R8(tp, Config1); ++ if (!(options & PMEnable)) ++ return; ++ ++ wol->wolopts = tp->wol_opts; ++} ++ ++static int ++rtl8125_set_wol(struct net_device *dev, ++ struct ethtool_wolinfo *wol) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT || disable_wol_support) ++ return -EOPNOTSUPP; ++ ++ tp->wol_opts = wol->wolopts; ++ ++ tp->wol_enabled = (tp->wol_opts) ? 
WOL_ENABLED : WOL_DISABLED; ++ ++ device_set_wakeup_enable(tp_to_dev(tp), wol->wolopts); ++ ++ return 0; ++} ++ ++static void ++rtl8125_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_fw *rtl_fw = tp->rtl_fw; ++ ++ strscpy(info->driver, MODULENAME, sizeof(info->driver)); ++ strscpy(info->version, RTL8125_VERSION, sizeof(info->version)); ++ strscpy(info->bus_info, pci_name(tp->pci_dev), sizeof(info->bus_info)); ++ info->regdump_len = R8125_REGS_DUMP_SIZE; ++ info->eedump_len = tp->eeprom_len; ++ BUILD_BUG_ON(sizeof(info->fw_version) < sizeof(rtl_fw->version)); ++ if (rtl_fw) ++ strscpy(info->fw_version, rtl_fw->version, ++ sizeof(info->fw_version)); ++} ++ ++static int ++rtl8125_get_regs_len(struct net_device *dev) ++{ ++ return R8125_REGS_DUMP_SIZE; ++} ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++static void ++rtl8125_set_d0_speedup_speed(struct rtl8125_private *tp) ++{ ++ if (FALSE == HW_SUPPORT_D0_SPEED_UP(tp)) ++ return; ++ ++ tp->D0SpeedUpSpeed = D0_SPEED_UP_SPEED_DISABLE; ++ if (tp->autoneg == AUTONEG_ENABLE) { ++ if (tp->speed == SPEED_2500) ++ tp->D0SpeedUpSpeed = D0_SPEED_UP_SPEED_2500; ++ else if (tp->speed == SPEED_1000) ++ tp->D0SpeedUpSpeed = D0_SPEED_UP_SPEED_1000; ++ } ++} ++ ++static int ++rtl8125_set_speed_xmii(struct net_device *dev, ++ u8 autoneg, ++ u32 speed, ++ u8 duplex, ++ u64 adv) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int auto_nego = 0; ++ int giga_ctrl = 0; ++ int ctrl_2500 = 0; ++ int rc = -EINVAL; ++ ++ if (!rtl8125_is_speed_mode_valid(speed)) { ++ speed = SPEED_2500; ++ duplex = DUPLEX_FULL; ++ adv |= tp->advertising; ++ } ++ ++ if (eee_giga_lite && (autoneg == AUTONEG_ENABLE)) ++ rtl8125_enable_giga_lite(tp, adv); ++ else ++ rtl8125_disable_giga_lite(tp); ++ ++ giga_ctrl = rtl8125_mdio_read(tp, MII_CTRL1000); ++ giga_ctrl &= ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL); ++ ctrl_2500 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D4); ++ ctrl_2500 &= ~RTK_ADVERTISE_2500FULL; ++ ++ if (autoneg == AUTONEG_ENABLE) { ++ /*n-way force*/ ++ auto_nego = rtl8125_mdio_read(tp, MII_ADVERTISE); ++ auto_nego &= ~(ADVERTISE_10HALF | ADVERTISE_10FULL | ++ ADVERTISE_100HALF | ADVERTISE_100FULL | ++ ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM); ++ ++ if (adv & ADVERTISED_10baseT_Half) ++ auto_nego |= ADVERTISE_10HALF; ++ if (adv & ADVERTISED_10baseT_Full) ++ auto_nego |= ADVERTISE_10FULL; ++ if (adv & ADVERTISED_100baseT_Half) ++ auto_nego |= ADVERTISE_100HALF; ++ if (adv & ADVERTISED_100baseT_Full) ++ auto_nego |= ADVERTISE_100FULL; ++ if (adv & ADVERTISED_1000baseT_Half) ++ giga_ctrl |= ADVERTISE_1000HALF; ++ if (adv & ADVERTISED_1000baseT_Full) ++ giga_ctrl |= ADVERTISE_1000FULL; ++ if (adv & ADVERTISED_2500baseX_Full) ++ ctrl_2500 |= RTK_ADVERTISE_2500FULL; ++ ++ //flow control ++ if (tp->fcpause == rtl8125_fc_full) ++ auto_nego |= ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; ++ ++ tp->phy_auto_nego_reg = auto_nego; ++ tp->phy_1000_ctrl_reg = giga_ctrl; ++ ++ tp->phy_2500_ctrl_reg = ctrl_2500; ++ ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ rtl8125_mdio_write(tp, MII_ADVERTISE, auto_nego); ++ rtl8125_mdio_write(tp, MII_CTRL1000, giga_ctrl); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA5D4, ctrl_2500); ++ rtl8125_phy_restart_nway(dev); ++ } else { ++ /*true force*/ ++ if (speed == SPEED_10 || speed == SPEED_100) ++ rtl8125_phy_setup_force_mode(dev, speed, duplex); ++ else ++ goto out; ++ } ++ ++ tp->autoneg = autoneg; ++ tp->speed = speed; ++ tp->duplex = 
duplex; ++ tp->advertising = adv; ++ ++ rtl8125_set_d0_speedup_speed(tp); ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ rtl8125_hw_fiber_phy_config(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ rc = 0; ++out: ++ return rc; ++} ++ ++static int ++rtl8125_set_speed(struct net_device *dev, ++ u8 autoneg, ++ u32 speed, ++ u8 duplex, ++ u64 adv) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret; ++ ++ if (tp->resume_not_chg_speed) ++ return 0; ++ ++ ret = tp->set_speed(dev, autoneg, speed, duplex, adv); ++ ++ return ret; ++} ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static int ++rtl8125_set_settings(struct net_device *dev, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ struct ethtool_cmd *cmd ++#else ++ const struct ethtool_link_ksettings *cmd ++#endif ++ ) ++{ ++ int ret; ++ u8 autoneg; ++ u32 speed; ++ u8 duplex; ++ u64 supported = 0, advertising = 0; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ autoneg = cmd->autoneg; ++ speed = cmd->speed; ++ duplex = cmd->duplex; ++ supported = cmd->supported; ++ advertising = cmd->advertising; ++#else ++ const struct ethtool_link_settings *base = &cmd->base; ++ autoneg = base->autoneg; ++ speed = base->speed; ++ duplex = base->duplex; ++ ethtool_convert_link_mode_to_legacy_u32((u32*)&supported, ++ cmd->link_modes.supported); ++ ethtool_convert_link_mode_to_legacy_u32((u32*)&advertising, ++ cmd->link_modes.advertising); ++ if (test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.supported)) ++ supported |= ADVERTISED_2500baseX_Full; ++ if (test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.advertising)) ++ advertising |= ADVERTISED_2500baseX_Full; ++#endif ++ if (advertising & ~supported) ++ return -EINVAL; ++ ++ ret = rtl8125_set_speed(dev, autoneg, speed, duplex, advertising); ++ ++ return ret; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++static u32 ++rtl8125_get_tx_csum(struct net_device *dev) ++{ ++ u32 ret; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ret = ((dev->features & NETIF_F_IP_CSUM) != 0); ++#else ++ ret = ((dev->features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != 0); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ++ return ret; ++} ++ ++static u32 ++rtl8125_get_rx_csum(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 ret; ++ ++ ret = tp->cp_cmd & RxChkSum; ++ ++ return ret; ++} ++ ++static int ++rtl8125_set_tx_csum(struct net_device *dev, ++ u32 data) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ return -EOPNOTSUPP; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ if (data) ++ dev->features |= NETIF_F_IP_CSUM; ++ else ++ dev->features &= ~NETIF_F_IP_CSUM; ++#else ++ if (data) ++ dev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); ++ else ++ dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ++ return 0; ++} ++ ++static int ++rtl8125_set_rx_csum(struct net_device *dev, ++ u32 data) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ return -EOPNOTSUPP; ++ ++ if (data) ++ tp->cp_cmd |= RxChkSum; ++ else ++ tp->cp_cmd &= ~RxChkSum; ++ ++ RTL_W16(tp, CPlusCmd, tp->cp_cmd); ++ ++ return 0; ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++static u32 ++rtl8125_rx_desc_opts1(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case 
RX_DESC_RING_TYPE_3: ++ return READ_ONCE(((struct RxDescV3 *)desc)->RxDescNormalDDWord4.opts1); ++ case RX_DESC_RING_TYPE_4: ++ return READ_ONCE(((struct RxDescV4 *)desc)->RxDescNormalDDWord2.opts1); ++ default: ++ return READ_ONCE(desc->opts1); ++ } ++} ++ ++static u32 ++rtl8125_rx_desc_opts2(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ return ((struct RxDescV3 *)desc)->RxDescNormalDDWord4.opts2; ++ case RX_DESC_RING_TYPE_4: ++ return ((struct RxDescV4 *)desc)->RxDescNormalDDWord2.opts2; ++ default: ++ return desc->opts2; ++ } ++} ++ ++#ifdef CONFIG_R8125_VLAN ++ ++static void ++rtl8125_clear_rx_desc_opts2(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ ((struct RxDescV3 *)desc)->RxDescNormalDDWord4.opts2 = 0; ++ break; ++ case RX_DESC_RING_TYPE_4: ++ ((struct RxDescV4 *)desc)->RxDescNormalDDWord2.opts2 = 0; ++ break; ++ default: ++ desc->opts2 = 0; ++ break; ++ } ++} ++ ++static inline u32 ++rtl8125_tx_vlan_tag(struct rtl8125_private *tp, ++ struct sk_buff *skb) ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ return (tp->vlgrp && vlan_tx_tag_present(skb)) ? ++ TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00; ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) ++ return (vlan_tx_tag_present(skb)) ? ++ TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00; ++#else ++ return (skb_vlan_tag_present(skb)) ? ++ TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00; ++#endif ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ++static void ++rtl8125_vlan_rx_register(struct net_device *dev, ++ struct vlan_group *grp) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ tp->vlgrp = grp; ++ ++ if (tp->vlgrp) { ++ tp->rtl8125_rx_config |= (EnableInnerVlan | EnableOuterVlan); ++ RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | (EnableInnerVlan | EnableOuterVlan)) ++ } else { ++ tp->rtl8125_rx_config &= ~(EnableInnerVlan | EnableOuterVlan); ++ RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) & ~(EnableInnerVlan | EnableOuterVlan)) ++ } ++} ++ ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++static void ++rtl8125_vlan_rx_kill_vid(struct net_device *dev, ++ unsigned short vid) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) ++ if (tp->vlgrp) ++ tp->vlgrp->vlan_devices[vid] = NULL; ++#else ++ vlan_group_set_device(tp->vlgrp, vid, NULL); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ ++static int ++rtl8125_rx_vlan_skb(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ struct sk_buff *skb) ++{ ++ u32 opts2 = le32_to_cpu(rtl8125_rx_desc_opts2(tp, desc)); ++ int ret = -1; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ if (tp->vlgrp && (opts2 & RxVlanTag)) { ++ rtl8125_rx_hwaccel_skb(skb, tp->vlgrp, ++ swab16(opts2 & 0xffff)); ++ ret = 0; ++ } ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ++ if (opts2 & RxVlanTag) ++ __vlan_hwaccel_put_tag(skb, swab16(opts2 & 0xffff)); ++#else ++ if (opts2 & RxVlanTag) ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), swab16(opts2 & 0xffff)); ++#endif ++ ++ rtl8125_clear_rx_desc_opts2(tp, desc); ++ return ret; ++} ++ ++#else /* !CONFIG_R8125_VLAN */ ++ ++static inline u32 ++rtl8125_tx_vlan_tag(struct rtl8125_private *tp, ++ struct sk_buff *skb) ++{ ++ return 0; ++} ++ ++static int ++rtl8125_rx_vlan_skb(struct rtl8125_private *tp, ++ struct RxDesc 
*desc, ++ struct sk_buff *skb) ++{ ++ return -1; ++} ++ ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++ ++static netdev_features_t rtl8125_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ if (dev->mtu > MSS_MAX || dev->mtu > ETH_DATA_LEN) ++ features &= ~NETIF_F_ALL_TSO; ++#ifndef CONFIG_R8125_VLAN ++ features &= ~NETIF_F_ALL_CSUM; ++#endif ++ ++ return features; ++} ++ ++static int rtl8125_hw_set_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 rx_config; ++ ++ rx_config = RTL_R32(tp, RxConfig); ++ if (features & NETIF_F_RXALL) { ++ tp->rtl8125_rx_config |= (AcceptErr | AcceptRunt); ++ rx_config |= (AcceptErr | AcceptRunt); ++ } else { ++ tp->rtl8125_rx_config &= ~(AcceptErr | AcceptRunt); ++ rx_config &= ~(AcceptErr | AcceptRunt); ++ } ++ ++ if (features & NETIF_F_HW_VLAN_RX) { ++ tp->rtl8125_rx_config |= (EnableInnerVlan | EnableOuterVlan); ++ rx_config |= (EnableInnerVlan | EnableOuterVlan); ++ } else { ++ tp->rtl8125_rx_config &= ~(EnableInnerVlan | EnableOuterVlan); ++ rx_config &= ~(EnableInnerVlan | EnableOuterVlan); ++ } ++ ++ RTL_W32(tp, RxConfig, rx_config); ++ ++ if (features & NETIF_F_RXCSUM) ++ tp->cp_cmd |= RxChkSum; ++ else ++ tp->cp_cmd &= ~RxChkSum; ++ ++ RTL_W16(tp, CPlusCmd, tp->cp_cmd); ++ RTL_R16(tp, CPlusCmd); ++ ++ return 0; ++} ++ ++static int rtl8125_set_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ features &= NETIF_F_RXALL | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_RX; ++ ++ rtl8125_hw_set_features(dev, features); ++ ++ return 0; ++} ++ ++#endif ++ ++static u8 rtl8125_get_mdi_status(struct rtl8125_private *tp) ++{ ++ if (!tp->link_ok(tp->dev)) ++ return ETH_TP_MDI_INVALID; ++ ++ if (rtl8125_mdio_direct_read_phy_ocp(tp, 0xA444) & BIT_1) ++ return ETH_TP_MDI; ++ else ++ return ETH_TP_MDI_X; ++} ++ ++static void rtl8125_gset_xmii(struct net_device *dev, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ struct ethtool_cmd *cmd ++#else ++ struct ethtool_link_ksettings *cmd ++#endif ++ ) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 aner = tp->phy_reg_aner; ++ u16 anlpar = tp->phy_reg_anlpar; ++ u16 gbsr = tp->phy_reg_gbsr; ++ u16 status_2500 = tp->phy_reg_status_2500; ++ u64 lpa_adv = 0; ++ u32 status; ++ u8 autoneg, duplex; ++ u32 speed = 0; ++ u16 bmcr; ++ u64 supported, advertising; ++ unsigned long flags; ++ u8 report_lpa = 0; ++ ++ supported = SUPPORTED_10baseT_Half | ++ SUPPORTED_10baseT_Full | ++ SUPPORTED_100baseT_Half | ++ SUPPORTED_100baseT_Full | ++ SUPPORTED_1000baseT_Full | ++ SUPPORTED_2500baseX_Full | ++ SUPPORTED_Autoneg | ++ SUPPORTED_TP | ++ SUPPORTED_Pause | ++ SUPPORTED_Asym_Pause; ++ ++ if (!HW_SUPP_PHY_LINK_SPEED_2500M(tp)) ++ supported &= ~SUPPORTED_2500baseX_Full; ++ ++ advertising = tp->advertising; ++ if (tp->phy_auto_nego_reg || tp->phy_1000_ctrl_reg || ++ tp->phy_2500_ctrl_reg) { ++ advertising = 0; ++ if (tp->phy_auto_nego_reg & ADVERTISE_10HALF) ++ advertising |= ADVERTISED_10baseT_Half; ++ if (tp->phy_auto_nego_reg & ADVERTISE_10FULL) ++ advertising |= ADVERTISED_10baseT_Full; ++ if (tp->phy_auto_nego_reg & ADVERTISE_100HALF) ++ advertising |= ADVERTISED_100baseT_Half; ++ if (tp->phy_auto_nego_reg & ADVERTISE_100FULL) ++ advertising |= ADVERTISED_100baseT_Full; ++ if (tp->phy_1000_ctrl_reg & ADVERTISE_1000FULL) ++ advertising |= ADVERTISED_1000baseT_Full; ++ if (tp->phy_2500_ctrl_reg & RTK_ADVERTISE_2500FULL) ++ advertising |= ADVERTISED_2500baseX_Full; ++ } ++ ++ 
r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ bmcr = rtl8125_mdio_read(tp, MII_BMCR); ++ if (bmcr & BMCR_ANENABLE) { ++ autoneg = AUTONEG_ENABLE; ++ advertising |= ADVERTISED_Autoneg; ++ } else { ++ autoneg = AUTONEG_DISABLE; ++ } ++ ++ advertising |= ADVERTISED_TP; ++ ++ status = rtl8125_get_phy_status(tp); ++ if (netif_running(dev) && (status & LinkStatus)) ++ report_lpa = 1; ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp) && ++ rtl8125_fiber_link_ok(dev) != R8125_LINK_STATE_ON) ++ report_lpa = 0; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ if (report_lpa) { ++ /*link on*/ ++ speed = rtl8125_convert_link_speed(status); ++ ++ if (status & TxFlowCtrl) ++ advertising |= ADVERTISED_Asym_Pause; ++ ++ if (status & RxFlowCtrl) ++ advertising |= ADVERTISED_Pause; ++ ++ duplex = ((status & (_1000bpsF | _2500bpsF)) || ++ (status & FullDup)) ? ++ DUPLEX_FULL : DUPLEX_HALF; ++ ++ /*link partner*/ ++ if (aner & EXPANSION_NWAY) ++ lpa_adv |= ADVERTISED_Autoneg; ++ if (anlpar & LPA_10HALF) ++ lpa_adv |= ADVERTISED_10baseT_Half; ++ if (anlpar & LPA_10FULL) ++ lpa_adv |= ADVERTISED_10baseT_Full; ++ if (anlpar & LPA_100HALF) ++ lpa_adv |= ADVERTISED_100baseT_Half; ++ if (anlpar & LPA_100FULL) ++ lpa_adv |= ADVERTISED_100baseT_Full; ++ if (anlpar & LPA_PAUSE_CAP) ++ lpa_adv |= ADVERTISED_Pause; ++ if (anlpar & LPA_PAUSE_ASYM) ++ lpa_adv |= ADVERTISED_Asym_Pause; ++ if (gbsr & LPA_1000HALF) ++ lpa_adv |= ADVERTISED_1000baseT_Half; ++ if (gbsr & LPA_1000FULL) ++ lpa_adv |= ADVERTISED_1000baseT_Full; ++ if (status_2500 & RTK_LPA_ADVERTISE_2500FULL) ++ lpa_adv |= ADVERTISED_2500baseX_Full; ++ } else { ++ /*link down*/ ++ speed = SPEED_UNKNOWN; ++ duplex = DUPLEX_UNKNOWN; ++ lpa_adv = 0; ++ } ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ cmd->supported = (u32)supported; ++ cmd->advertising = (u32)advertising; ++ cmd->autoneg = autoneg; ++ cmd->speed = speed; ++ cmd->duplex = duplex; ++ cmd->port = PORT_TP; ++ cmd->lp_advertising = (u32)lpa_adv; ++ cmd->eth_tp_mdix = rtl8125_get_mdi_status(tp); ++#else ++ ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, ++ supported); ++ ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, ++ advertising); ++ ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.lp_advertising, ++ lpa_adv); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++ if (supported & SUPPORTED_2500baseX_Full) { ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, ++ cmd->link_modes.supported, 0); ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.supported, 1); ++ } ++ if (advertising & ADVERTISED_2500baseX_Full) { ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, ++ cmd->link_modes.advertising, 0); ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.advertising, 1); ++ } ++ if (report_lpa) { ++ if (lpa_adv & ADVERTISED_2500baseX_Full) { ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, ++ cmd->link_modes.lp_advertising, 0); ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.lp_advertising, 1); ++ } ++ } ++#endif ++ cmd->base.autoneg = autoneg; ++ cmd->base.speed = speed; ++ cmd->base.duplex = duplex; ++ cmd->base.port = PORT_TP; ++ cmd->base.eth_tp_mdix = rtl8125_get_mdi_status(tp); ++#endif ++ r8125_spin_unlock(&tp->phy_lock, flags); ++} ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static int ++rtl8125_get_settings(struct net_device *dev, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ struct 
ethtool_cmd *cmd ++#else ++ struct ethtool_link_ksettings *cmd ++#endif ++ ) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ tp->get_settings(dev, cmd); ++ ++ return 0; ++} ++ ++static void rtl8125_get_regs(struct net_device *dev, struct ethtool_regs *regs, ++ void *p) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ unsigned int i; ++ u8 *data = p; ++ ++ if (regs->len < R8125_REGS_DUMP_SIZE) ++ return /* -EINVAL */; ++ ++ memset(p, 0, regs->len); ++ ++ for (i = 0; i < R8125_MAC_REGS_SIZE; i++) ++ *data++ = readb(ioaddr + i); ++ data = (u8*)p + 256; ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ for (i = 0; i < R8125_PHY_REGS_SIZE/2; i++) { ++ *(u16*)data = rtl8125_mdio_read(tp, i); ++ data += 2; ++ } ++ data = (u8*)p + 256 * 2; ++ ++ for (i = 0; i < R8125_EPHY_REGS_SIZE/2; i++) { ++ *(u16*)data = rtl8125_ephy_read(tp, i); ++ data += 2; ++ } ++ data = (u8*)p + 256 * 3; ++ ++ for (i = 0; i < R8125_ERI_REGS_SIZE; i+=4) { ++ *(u32*)data = rtl8125_eri_read(tp, i , 4, ERIAR_ExGMAC); ++ data += 4; ++ } ++} ++ ++static void rtl8125_get_pauseparam(struct net_device *dev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ pause->autoneg = (tp->autoneg ? AUTONEG_ENABLE : AUTONEG_DISABLE); ++ if (tp->fcpause == rtl8125_fc_rx_pause) ++ pause->rx_pause = 1; ++ else if (tp->fcpause == rtl8125_fc_tx_pause) ++ pause->tx_pause = 1; ++ else if (tp->fcpause == rtl8125_fc_full) { ++ pause->rx_pause = 1; ++ pause->tx_pause = 1; ++ } ++} ++ ++static int rtl8125_set_pauseparam(struct net_device *dev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ enum rtl8125_fc_mode newfc; ++ ++ if (pause->tx_pause || pause->rx_pause) ++ newfc = rtl8125_fc_full; ++ else ++ newfc = rtl8125_fc_none; ++ ++ if (tp->fcpause != newfc) { ++ tp->fcpause = newfc; ++ ++ rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ } ++ ++ return 0; ++} ++ ++static u32 ++rtl8125_get_msglevel(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ return tp->msg_enable; ++} ++ ++static void ++rtl8125_set_msglevel(struct net_device *dev, ++ u32 value) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ tp->msg_enable = value; ++} ++ ++static const char rtl8125_gstrings[][ETH_GSTRING_LEN] = { ++ /* legacy */ ++ "tx_packets", ++ "rx_packets", ++ "tx_errors", ++ "rx_errors", ++ "rx_missed", ++ "align_errors", ++ "tx_single_collisions", ++ "tx_multi_collisions", ++ "unicast", ++ "broadcast", ++ "multicast", ++ "tx_aborted", ++ "tx_underrun", ++ ++ /* extended */ ++ "tx_octets", ++ "rx_octets", ++ "rx_multicast64", ++ "tx_unicast64", ++ "tx_broadcast64", ++ "tx_multicast64", ++ "tx_pause_on", ++ "tx_pause_off", ++ "tx_pause_all", ++ "tx_deferred", ++ "tx_late_collision", ++ "tx_all_collision", ++ "tx_aborted32", ++ "align_errors32", ++ "rx_frame_too_long", ++ "rx_runt", ++ "rx_pause_on", ++ "rx_pause_off", ++ "rx_pause_all", ++ "rx_unknown_opcode", ++ "rx_mac_error", ++ "tx_underrun32", ++ "rx_mac_missed", ++ "rx_tcam_dropped", ++ "tdu", ++ "rdu", ++}; ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static int rtl8125_get_stats_count(struct net_device *dev) ++{ ++ return ARRAY_SIZE(rtl8125_gstrings); ++} ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++#else ++static int rtl8125_get_sset_count(struct net_device *dev, int 
sset) ++{ ++ switch (sset) { ++ case ETH_SS_STATS: ++ return ARRAY_SIZE(rtl8125_gstrings); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++#endif ++ ++static void ++rtl8125_set_ring_size(struct rtl8125_private *tp, u32 rx, u32 tx) ++{ ++ int i; ++ ++ for (i = 0; i < R8125_MAX_RX_QUEUES; i++) ++ tp->rx_ring[i].num_rx_desc = rx; ++ ++ for (i = 0; i < R8125_MAX_TX_QUEUES; i++) ++ tp->tx_ring[i].num_tx_desc = tx; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++static void rtl8125_get_ringparam(struct net_device *dev, ++ struct ethtool_ringparam *ring, ++ struct kernel_ethtool_ringparam *kernel_ring, ++ struct netlink_ext_ack *extack) ++#else ++static void rtl8125_get_ringparam(struct net_device *dev, ++ struct ethtool_ringparam *ring) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ ring->rx_max_pending = MAX_NUM_TX_DESC; ++ ring->tx_max_pending = MAX_NUM_RX_DESC; ++ ring->rx_pending = tp->rx_ring[0].num_rx_desc; ++ ring->tx_pending = tp->tx_ring[0].num_tx_desc; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++static int rtl8125_set_ringparam(struct net_device *dev, ++ struct ethtool_ringparam *ring, ++ struct kernel_ethtool_ringparam *kernel_ring, ++ struct netlink_ext_ack *extack) ++#else ++static int rtl8125_set_ringparam(struct net_device *dev, ++ struct ethtool_ringparam *ring) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 new_rx_count, new_tx_count; ++ int rc = 0; ++ ++ if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending)) ++ return -EINVAL; ++ ++ new_tx_count = clamp_t(u32, ring->tx_pending, ++ MIN_NUM_TX_DESC, MAX_NUM_TX_DESC); ++ ++ new_rx_count = clamp_t(u32, ring->rx_pending, ++ MIN_NUM_RX_DESC, MAX_NUM_RX_DESC); ++ ++ if ((new_rx_count == tp->rx_ring[0].num_rx_desc) && ++ (new_tx_count == tp->tx_ring[0].num_tx_desc)) { ++ /* nothing to do */ ++ return 0; ++ } ++ ++ if (netif_running(dev)) { ++ rtl8125_wait_for_quiescence(dev); ++ rtl8125_close(dev); ++ } ++ ++ rtl8125_set_ring_size(tp, new_rx_count, new_tx_count); ++ ++ if (netif_running(dev)) ++ rc = rtl8125_open(dev); ++ ++ return rc; ++} ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static void ++rtl8125_get_ethtool_stats(struct net_device *dev, ++ struct ethtool_stats *stats, ++ u64 *data) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_counters *counters; ++ dma_addr_t paddr; ++ ++ ASSERT_RTNL(); ++ ++ counters = tp->tally_vaddr; ++ paddr = tp->tally_paddr; ++ if (!counters) ++ return; ++ ++ rtl8125_dump_tally_counter(tp, paddr); ++ ++ data[0] = le64_to_cpu(counters->tx_packets); ++ data[1] = le64_to_cpu(counters->rx_packets); ++ data[2] = le64_to_cpu(counters->tx_errors); ++ data[3] = le32_to_cpu(counters->rx_errors); ++ data[4] = le16_to_cpu(counters->rx_missed); ++ data[5] = le16_to_cpu(counters->align_errors); ++ data[6] = le32_to_cpu(counters->tx_one_collision); ++ data[7] = le32_to_cpu(counters->tx_multi_collision); ++ data[8] = le64_to_cpu(counters->rx_unicast); ++ data[9] = le64_to_cpu(counters->rx_broadcast); ++ data[10] = le32_to_cpu(counters->rx_multicast); ++ data[11] = le16_to_cpu(counters->tx_aborted); ++ data[12] = le16_to_cpu(counters->tx_underrun); ++ ++ data[13] = le64_to_cpu(counters->tx_octets); ++ data[14] = le64_to_cpu(counters->rx_octets); ++ data[15] = le64_to_cpu(counters->rx_multicast64); ++ data[16] = 
le64_to_cpu(counters->tx_unicast64); ++ data[17] = le64_to_cpu(counters->tx_broadcast64); ++ data[18] = le64_to_cpu(counters->tx_multicast64); ++ data[19] = le32_to_cpu(counters->tx_pause_on); ++ data[20] = le32_to_cpu(counters->tx_pause_off); ++ data[21] = le32_to_cpu(counters->tx_pause_all); ++ data[22] = le32_to_cpu(counters->tx_deferred); ++ data[23] = le32_to_cpu(counters->tx_late_collision); ++ data[24] = le32_to_cpu(counters->tx_all_collision); ++ data[25] = le32_to_cpu(counters->tx_aborted32); ++ data[26] = le32_to_cpu(counters->align_errors32); ++ data[27] = le32_to_cpu(counters->rx_frame_too_long); ++ data[28] = le32_to_cpu(counters->rx_runt); ++ data[29] = le32_to_cpu(counters->rx_pause_on); ++ data[30] = le32_to_cpu(counters->rx_pause_off); ++ data[31] = le32_to_cpu(counters->rx_pause_all); ++ data[32] = le32_to_cpu(counters->rx_unknown_opcode); ++ data[33] = le32_to_cpu(counters->rx_mac_error); ++ data[34] = le32_to_cpu(counters->tx_underrun32); ++ data[35] = le32_to_cpu(counters->rx_mac_missed); ++ data[36] = le32_to_cpu(counters->rx_tcam_dropped); ++ data[37] = le32_to_cpu(counters->tdu); ++ data[38] = le32_to_cpu(counters->rdu); ++} ++ ++static void ++rtl8125_get_strings(struct net_device *dev, ++ u32 stringset, ++ u8 *data) ++{ ++ switch (stringset) { ++ case ETH_SS_STATS: ++ memcpy(data, rtl8125_gstrings, sizeof(rtl8125_gstrings)); ++ break; ++ } ++} ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++static int rtl_get_eeprom_len(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ return tp->eeprom_len; ++} ++ ++static int rtl_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *buf) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i,j,ret; ++ int start_w, end_w; ++ int VPD_addr, VPD_data; ++ u32 *eeprom_buff; ++ u16 tmp; ++ ++ if (tp->eeprom_type == EEPROM_TYPE_NONE) { ++ dev_printk(KERN_DEBUG, tp_to_dev(tp), "Detect none EEPROM\n"); ++ return -EOPNOTSUPP; ++ } else if (eeprom->len == 0 || (eeprom->offset+eeprom->len) > tp->eeprom_len) { ++ dev_printk(KERN_DEBUG, tp_to_dev(tp), "Invalid parameter\n"); ++ return -EINVAL; ++ } ++ ++ VPD_addr = 0xD2; ++ VPD_data = 0xD4; ++ ++ start_w = eeprom->offset >> 2; ++ end_w = (eeprom->offset + eeprom->len - 1) >> 2; ++ ++ eeprom_buff = kmalloc(sizeof(u32)*(end_w - start_w + 1), GFP_KERNEL); ++ if (!eeprom_buff) ++ return -ENOMEM; ++ ++ rtl8125_enable_cfg9346_write(tp); ++ ret = -EFAULT; ++ for (i=start_w; i<=end_w; i++) { ++ pci_write_config_word(tp->pci_dev, VPD_addr, (u16)i*4); ++ ret = -EFAULT; ++ for (j = 0; j < 10; j++) { ++ udelay(400); ++ pci_read_config_word(tp->pci_dev, VPD_addr, &tmp); ++ if (tmp&0x8000) { ++ ret = 0; ++ break; ++ } ++ } ++ ++ if (ret) ++ break; ++ ++ pci_read_config_dword(tp->pci_dev, VPD_data, &eeprom_buff[i-start_w]); ++ } ++ rtl8125_disable_cfg9346_write(tp); ++ ++ if (!ret) ++ memcpy(buf, (u8 *)eeprom_buff + (eeprom->offset & 3), eeprom->len); ++ ++ kfree(eeprom_buff); ++ ++ return ret; ++} ++ ++#undef ethtool_op_get_link ++#define ethtool_op_get_link _kc_ethtool_op_get_link ++static u32 _kc_ethtool_op_get_link(struct net_device *dev) ++{ ++ return netif_carrier_ok(dev) ? 
1 : 0; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++#undef ethtool_op_get_sg ++#define ethtool_op_get_sg _kc_ethtool_op_get_sg ++static u32 _kc_ethtool_op_get_sg(struct net_device *dev) ++{ ++#ifdef NETIF_F_SG ++ return (dev->features & NETIF_F_SG) != 0; ++#else ++ return 0; ++#endif ++} ++ ++#undef ethtool_op_set_sg ++#define ethtool_op_set_sg _kc_ethtool_op_set_sg ++static int _kc_ethtool_op_set_sg(struct net_device *dev, u32 data) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ return -EOPNOTSUPP; ++ ++#ifdef NETIF_F_SG ++ if (data) ++ dev->features |= NETIF_F_SG; ++ else ++ dev->features &= ~NETIF_F_SG; ++#endif ++ ++ return 0; ++} ++#endif ++ ++static void ++rtl8125_set_eee_lpi_timer(struct rtl8125_private *tp) ++{ ++ u16 dev_lpi_timer; ++ ++ dev_lpi_timer = tp->eee.tx_lpi_timer; ++ ++ RTL_W16(tp, EEE_TXIDLE_TIMER_8125, dev_lpi_timer); ++} ++ ++static bool rtl8125_is_adv_eee_enabled(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ //case CFG_METHOD_10: ++ //case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ //case CFG_METHOD_13: ++ if (rtl8125_mdio_direct_read_phy_ocp(tp, 0xA430) & BIT_15) ++ return true; ++ break; ++ default: ++ break; ++ } ++ ++ return false; ++} ++ ++static void _rtl8125_disable_adv_eee(struct rtl8125_private *tp) ++{ ++ bool lock; ++ ++ if (rtl8125_is_adv_eee_enabled(tp)) ++ lock = true; ++ else ++ lock = false; ++ ++ if (lock) ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_clear_mac_ocp_bit(tp, 0xE052, BIT_0); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA442, BIT_12 | BIT_13); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA430, BIT_15); ++ ++ if (lock) ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void rtl8125_disable_adv_eee(struct rtl8125_private *tp) ++{ ++ rtl8125_oob_mutex_lock(tp); ++ ++ _rtl8125_disable_adv_eee(tp); ++ ++ rtl8125_oob_mutex_unlock(tp); ++} ++ ++static int rtl8125_enable_eee(struct rtl8125_private *tp) ++{ ++ struct ethtool_keee *eee = &tp->eee; ++ u16 eee_adv_cap1_t = rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t(eee->advertised); ++ u16 eee_adv_cap2_t = rtl8125_ethtool_adv_to_mmd_eee_adv_cap2_t(eee->advertised); ++ int ret; ++ ++ ret = 0; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_set_mac_ocp_bit(tp, 0xE040, (BIT_1|BIT_0)); ++ rtl8125_set_mac_ocp_bit(tp, 0xEB62, (BIT_2|BIT_1)); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA432, BIT_4); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA5D0, ++ MDIO_EEE_100TX | MDIO_EEE_1000T, ++ eee_adv_cap1_t); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D4, MDIO_EEE_2_5GT); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D8, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4A2, BIT_9); ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ rtl8125_set_mac_ocp_bit(tp, 0xE040, (BIT_1|BIT_0)); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA432, BIT_4); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA5D0, ++ MDIO_EEE_100TX | MDIO_EEE_1000T, ++ eee_adv_cap1_t); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA6D4, ++ MDIO_EEE_2_5GT, ++ eee_adv_cap2_t); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D8, BIT_4); ++ 
rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4A2, BIT_9); ++ break; ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ /*Advanced EEE*/ ++ rtl8125_disable_adv_eee(tp); ++ ++ return ret; ++} ++ ++static int rtl8125_disable_eee(struct rtl8125_private *tp) ++{ ++ int ret; ++ ++ ret = 0; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_clear_mac_ocp_bit(tp, 0xE040, (BIT_1|BIT_0)); ++ rtl8125_clear_mac_ocp_bit(tp, 0xEB62, (BIT_2|BIT_1)); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA432, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5D0, (MDIO_EEE_100TX | MDIO_EEE_1000T)); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D4, BIT_0); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D8, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4A2, BIT_9); ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ rtl8125_clear_mac_ocp_bit(tp, 0xE040, (BIT_1|BIT_0)); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA432, BIT_4); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5D0, (MDIO_EEE_100TX | MDIO_EEE_1000T)); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D4, MDIO_EEE_2_5GT); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D8, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4A2, BIT_9); ++ break; ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ /*Advanced EEE*/ ++ rtl8125_disable_adv_eee(tp); ++ ++ return ret; ++} ++ ++static int rtl_nway_reset(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret, bmcr; ++ ++ if (unlikely(tp->rtk_enable_diag)) ++ return -EBUSY; ++ ++ /* if autoneg is off, it's an error */ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ bmcr = rtl8125_mdio_read(tp, MII_BMCR); ++ ++ if (bmcr & BMCR_ANENABLE) { ++ bmcr |= BMCR_ANRESTART; ++ rtl8125_mdio_write(tp, MII_BMCR, bmcr); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) ++static u32 ++rtl8125_device_lpi_t_to_ethtool_lpi_t(struct rtl8125_private *tp , u32 lpi_timer) ++{ ++ u32 to_us; ++ u32 status; ++ ++ to_us = lpi_timer * 80; ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus) { ++ /*link on*/ ++ //2.5G : lpi_timer * 3.2ns ++ //Giga: lpi_timer * 8ns ++ //100M : lpi_timer * 80ns ++ if (status & _2500bpsF) ++ to_us = (lpi_timer * 32) / 10; ++ else if (status & _1000bpsF) ++ to_us = lpi_timer * 8; ++ } ++ ++ //ns to us ++ to_us /= 1000; ++ ++ return to_us; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,9,0) ++static void ++rtl8125_adv_to_linkmode(unsigned long *mode, u64 adv) ++{ ++ linkmode_zero(mode); ++ ++ if (adv & ADVERTISED_10baseT_Half) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, mode); ++ if (adv & ADVERTISED_10baseT_Full) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, mode); ++ if (adv & ADVERTISED_100baseT_Half) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, mode); ++ if (adv & ADVERTISED_100baseT_Full) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, mode); ++ if (adv & ADVERTISED_1000baseT_Half) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, mode); ++ if (adv & ADVERTISED_1000baseT_Full) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, mode); ++ if (adv & ADVERTISED_2500baseX_Full) ++ 
linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, mode); ++} ++ ++static int ++rtl_ethtool_get_eee(struct net_device *net, struct ethtool_keee *edata) ++{ ++ __ETHTOOL_DECLARE_LINK_MODE_MASK(common); ++ struct rtl8125_private *tp = netdev_priv(net); ++ struct ethtool_keee *eee = &tp->eee; ++ unsigned long flags; ++ u32 tx_lpi_timer; ++ u16 val; ++ ++ if (unlikely(tp->rtk_enable_diag)) ++ return -EBUSY; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* Get LP advertisement EEE */ ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D2); ++ mii_eee_cap1_mod_linkmode_t(edata->lp_advertised, val); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA6D0); ++ mii_eee_cap2_mod_linkmode_sup_t(edata->lp_advertised, val); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ /* Get EEE Tx LPI timer*/ ++ tx_lpi_timer = rtl8125_device_lpi_t_to_ethtool_lpi_t(tp, eee->tx_lpi_timer); ++ ++ val = rtl8125_mac_ocp_read(tp, 0xE040); ++ val &= BIT_1 | BIT_0; ++ ++ edata->eee_enabled = !!val; ++ linkmode_copy(edata->supported, eee->supported); ++ linkmode_copy(edata->advertised, eee->advertised); ++ edata->tx_lpi_enabled = edata->eee_enabled; ++ edata->tx_lpi_timer = tx_lpi_timer; ++ linkmode_and(common, edata->advertised, edata->lp_advertised); ++ edata->eee_active = !linkmode_empty(common); ++ ++ return 0; ++} ++ ++static int ++rtl_ethtool_set_eee(struct net_device *net, struct ethtool_keee *edata) ++{ ++ __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); ++ __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp); ++ struct rtl8125_private *tp = netdev_priv(net); ++ struct ethtool_keee *eee = &tp->eee; ++ unsigned long flags; ++ int rc = 0; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (!HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp) || ++ tp->DASH) { ++ rc = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ if (unlikely(tp->rtk_enable_diag)) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "Diag Enabled\n"); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ if (tp->autoneg != AUTONEG_ENABLE) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE requires autoneg\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ /* ++ if (edata->tx_lpi_enabled) { ++ if (edata->tx_lpi_timer > tp->max_jumbo_frame_size || ++ edata->tx_lpi_timer < ETH_MIN_MTU) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "Valid LPI timer range is %d to %d. 
\n", ++ ETH_MIN_MTU, tp->max_jumbo_frame_size); ++ rc = -EINVAL; ++ goto out; ++ } ++ } ++ */ ++ ++ rtl8125_adv_to_linkmode(advertising, tp->advertising); ++ if (linkmode_empty(edata->advertised)) { ++ linkmode_and(edata->advertised, advertising, eee->supported); ++ } else if (linkmode_andnot(tmp, edata->advertised, advertising)) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE advertised must be a subset of autoneg advertised speeds\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ if (linkmode_andnot(tmp, edata->advertised, eee->supported)) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE advertised must be a subset of support \n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ //tp->eee.eee_enabled = edata->eee_enabled; ++ //tp->eee_adv_t = rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t(edata->advertised); ++ ++ linkmode_copy(eee->advertised, edata->advertised); ++ //eee->tx_lpi_enabled = edata->tx_lpi_enabled; ++ //eee->tx_lpi_timer = edata->tx_lpi_timer; ++ eee->eee_enabled = edata->eee_enabled; ++ ++ if (eee->eee_enabled) ++ rtl8125_enable_eee(tp); ++ else ++ rtl8125_disable_eee(tp); ++ ++ rtl_nway_reset(net); ++ ++out: ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return rc; ++} ++#else ++static int ++rtl_ethtool_get_eee(struct net_device *net, struct ethtool_eee *edata) ++{ ++ struct rtl8125_private *tp = netdev_priv(net); ++ struct ethtool_eee *eee = &tp->eee; ++ u32 lp, adv, tx_lpi_timer, supported = 0; ++ unsigned long flags; ++ u16 val; ++ ++ if (unlikely(tp->rtk_enable_diag)) ++ return -EBUSY; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* Get Supported EEE */ ++ //val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5C4); ++ //supported = mmd_eee_cap_to_ethtool_sup_t(val); ++ supported = eee->supported; ++ ++ /* Get advertisement EEE */ ++ adv = eee->advertised; ++ ++ /* Get LP advertisement EEE */ ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D2); ++ lp = mmd_eee_adv_to_ethtool_adv_t(val); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA6D0); ++ if (val & RTK_LPA_EEE_ADVERTISE_2500FULL) ++ lp |= ADVERTISED_2500baseX_Full; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ /* Get EEE Tx LPI timer*/ ++ tx_lpi_timer = rtl8125_device_lpi_t_to_ethtool_lpi_t(tp, eee->tx_lpi_timer); ++ ++ val = rtl8125_mac_ocp_read(tp, 0xE040); ++ val &= BIT_1 | BIT_0; ++ ++ edata->eee_enabled = !!val; ++ edata->eee_active = !!(supported & adv & lp); ++ edata->supported = supported; ++ edata->advertised = adv; ++ edata->lp_advertised = lp; ++ edata->tx_lpi_enabled = edata->eee_enabled; ++ edata->tx_lpi_timer = tx_lpi_timer; ++ ++ return 0; ++} ++ ++static int ++rtl_ethtool_set_eee(struct net_device *net, struct ethtool_eee *edata) ++{ ++ struct rtl8125_private *tp = netdev_priv(net); ++ struct ethtool_eee *eee = &tp->eee; ++ unsigned long flags; ++ u64 advertising; ++ int rc = 0; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (!HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp) || ++ tp->DASH) { ++ rc = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ if (unlikely(tp->rtk_enable_diag)) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "Diag Enabled\n"); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ if (tp->autoneg != AUTONEG_ENABLE) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE requires autoneg\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ /* ++ if (edata->tx_lpi_enabled) { ++ if (edata->tx_lpi_timer > tp->max_jumbo_frame_size || ++ edata->tx_lpi_timer < ETH_MIN_MTU) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "Valid LPI timer range is %d to %d. 
\n", ++ ETH_MIN_MTU, tp->max_jumbo_frame_size); ++ rc = -EINVAL; ++ goto out; ++ } ++ } ++ */ ++ ++ advertising = tp->advertising; ++ if (!edata->advertised) { ++ edata->advertised = advertising & eee->supported; ++ } else if (edata->advertised & ~advertising) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE advertised %x must be a subset of autoneg advertised speeds %llu\n", ++ edata->advertised, advertising); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ if (edata->advertised & ~eee->supported) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE advertised %x must be a subset of support %x\n", ++ edata->advertised, eee->supported); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ //tp->eee.eee_enabled = edata->eee_enabled; ++ //tp->eee_adv_t = rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t(edata->advertised); ++ ++ eee->advertised = edata->advertised; ++ //eee->tx_lpi_enabled = edata->tx_lpi_enabled; ++ //eee->tx_lpi_timer = edata->tx_lpi_timer; ++ eee->eee_enabled = edata->eee_enabled; ++ ++ if (eee->eee_enabled) ++ rtl8125_enable_eee(tp); ++ else ++ rtl8125_disable_eee(tp); ++ ++ rtl_nway_reset(net); ++ ++out: ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return rc; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,9,0) */ ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++static void rtl8125_get_channels(struct net_device *dev, ++ struct ethtool_channels *channel) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ channel->max_rx = tp->HwSuppNumRxQueues; ++ channel->max_tx = tp->HwSuppNumTxQueues; ++ channel->rx_count = tp->num_rx_rings; ++ channel->tx_count = tp->num_tx_rings; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) */ ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static const struct ethtool_ops rtl8125_ethtool_ops = { ++ .get_drvinfo = rtl8125_get_drvinfo, ++ .get_regs_len = rtl8125_get_regs_len, ++ .get_link = ethtool_op_get_link, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++ .get_ringparam = rtl8125_get_ringparam, ++ .set_ringparam = rtl8125_set_ringparam, ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ .get_settings = rtl8125_get_settings, ++ .set_settings = rtl8125_set_settings, ++#else ++ .get_link_ksettings = rtl8125_get_settings, ++ .set_link_ksettings = rtl8125_set_settings, ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++ .get_pauseparam = rtl8125_get_pauseparam, ++ .set_pauseparam = rtl8125_set_pauseparam, ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++ .get_msglevel = rtl8125_get_msglevel, ++ .set_msglevel = rtl8125_set_msglevel, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++ .get_rx_csum = rtl8125_get_rx_csum, ++ .set_rx_csum = rtl8125_set_rx_csum, ++ .get_tx_csum = rtl8125_get_tx_csum, ++ .set_tx_csum = rtl8125_set_tx_csum, ++ .get_sg = ethtool_op_get_sg, ++ .set_sg = ethtool_op_set_sg, ++#ifdef NETIF_F_TSO ++ .get_tso = ethtool_op_get_tso, ++ .set_tso = ethtool_op_set_tso, ++#endif //NETIF_F_TSO ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++ .get_regs = rtl8125_get_regs, ++ .get_wol = rtl8125_get_wol, ++ .set_wol = rtl8125_set_wol, ++ .get_strings = rtl8125_get_strings, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ++ .get_stats_count = rtl8125_get_stats_count, ++#else ++ .get_sset_count = rtl8125_get_sset_count, ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ++ .get_ethtool_stats = rtl8125_get_ethtool_stats, ++#if 
LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) ++#ifdef ETHTOOL_GPERMADDR ++ .get_perm_addr = ethtool_op_get_perm_addr, ++#endif //ETHTOOL_GPERMADDR ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) ++ .get_eeprom = rtl_get_eeprom, ++ .get_eeprom_len = rtl_get_eeprom_len, ++#ifdef ENABLE_RSS_SUPPORT ++ .get_rxnfc = rtl8125_get_rxnfc, ++ .set_rxnfc = rtl8125_set_rxnfc, ++ .get_rxfh_indir_size = rtl8125_rss_indir_size, ++ .get_rxfh_key_size = rtl8125_get_rxfh_key_size, ++ .get_rxfh = rtl8125_get_rxfh, ++ .set_rxfh = rtl8125_set_rxfh, ++#endif //ENABLE_RSS_SUPPORT ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) ++#ifdef ENABLE_PTP_SUPPORT ++ .get_ts_info = rtl8125_get_ts_info, ++#else ++ .get_ts_info = ethtool_op_get_ts_info, ++#endif //ENABLE_PTP_SUPPORT ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) ++ .get_eee = rtl_ethtool_get_eee, ++ .set_eee = rtl_ethtool_set_eee, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++ .get_channels = rtl8125_get_channels, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) */ ++ .nway_reset = rtl_nway_reset, ++ ++}; ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++static void rtl8125_get_mac_version(struct rtl8125_private *tp) ++{ ++ u32 reg,val32; ++ u32 ICVerID; ++ struct pci_dev *pdev = tp->pci_dev; ++ ++ val32 = RTL_R32(tp, TxConfig); ++ reg = val32 & 0x7c800000; ++ ICVerID = val32 & 0x00700000; ++ ++ switch (reg) { ++ case 0x60800000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_2; ++ } else if (ICVerID == 0x100000) { ++ tp->mcfg = CFG_METHOD_3; ++ } else { ++ tp->mcfg = CFG_METHOD_3; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ case 0x64000000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_4; ++ } else if (ICVerID == 0x100000) { ++ tp->mcfg = CFG_METHOD_5; ++ } else { ++ tp->mcfg = CFG_METHOD_5; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ case 0x68000000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_8; ++ } else if (ICVerID == 0x100000) { ++ tp->mcfg = CFG_METHOD_9; ++ } else { ++ tp->mcfg = CFG_METHOD_9; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ case 0x68800000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_10; ++ } else if (ICVerID == 0x100000) { ++ tp->mcfg = CFG_METHOD_11; ++ } else { ++ tp->mcfg = CFG_METHOD_11; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ case 0x70800000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_12; ++ } else { ++ tp->mcfg = CFG_METHOD_12; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ default: ++ printk("unknown chip version (%x)\n",reg); ++ tp->mcfg = CFG_METHOD_DEFAULT; ++ tp->HwIcVerUnknown = TRUE; ++ tp->efuse_ver = EFUSE_NOT_SUPPORT; ++ break; ++ } ++ ++ if (pdev->device == 0x8162) { ++ if (tp->mcfg == CFG_METHOD_3) ++ tp->mcfg = CFG_METHOD_6; ++ else if (tp->mcfg == CFG_METHOD_5) ++ tp->mcfg = CFG_METHOD_7; ++ else if (tp->mcfg == CFG_METHOD_11) ++ tp->mcfg = CFG_METHOD_13; ++ } ++} ++ ++static void ++rtl8125_print_mac_version(struct rtl8125_private *tp) ++{ ++ int i; ++ for (i = ARRAY_SIZE(rtl_chip_info) - 1; i >= 0; i--) { ++ if (tp->mcfg == rtl_chip_info[i].mcfg) { ++ dprintk("Realtek %s Ethernet controller mcfg = %04d\n", ++ MODULENAME, rtl_chip_info[i].mcfg); ++ return; ++ } ++ } ++ ++ dprintk("mac_version 
== Unknown\n"); ++} ++ ++static void ++rtl8125_tally_counter_addr_fill(struct rtl8125_private *tp) ++{ ++ if (!tp->tally_paddr) ++ return; ++ ++ RTL_W32(tp, CounterAddrHigh, (u64)tp->tally_paddr >> 32); ++ RTL_W32(tp, CounterAddrLow, (u64)tp->tally_paddr & (DMA_BIT_MASK(32))); ++} ++ ++static void ++rtl8125_tally_counter_clear(struct rtl8125_private *tp) ++{ ++ if (!tp->tally_paddr) ++ return; ++ ++ RTL_W32(tp, CounterAddrHigh, (u64)tp->tally_paddr >> 32); ++ RTL_W32(tp, CounterAddrLow, ((u64)tp->tally_paddr & (DMA_BIT_MASK(32))) | CounterReset); ++} ++ ++static void ++rtl8125_clear_phy_ups_reg(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA466, BIT_0); ++ break; ++ }; ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA468, BIT_3 | BIT_1); ++} ++ ++static int ++rtl8125_is_ups_resume(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ return (rtl8125_mac_ocp_read(tp, 0xD42C) & BIT_8); ++} ++ ++static void ++rtl8125_clear_ups_resume_bit(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_clear_mac_ocp_bit(tp, 0xD42C, BIT_8); ++} ++ ++static u8 ++rtl8125_get_phy_state(struct rtl8125_private *tp) ++{ ++ return (rtl8125_mdio_direct_read_phy_ocp(tp, 0xA420) & 0x7); ++} ++ ++static bool ++rtl8125_wait_phy_state_ready(struct rtl8125_private *tp, u16 state, ++ u32 ms) ++{ ++ u16 tmp_state; ++ u32 wait_cnt; ++ bool ready; ++ u32 i; ++ ++ if (ms >= 1000) ++ wait_cnt = ms / 1000; ++ else ++ wait_cnt = 100; ++ ++ i = 0; ++ do { ++ tmp_state = rtl8125_get_phy_state(tp); ++ mdelay(1); ++ i++; ++ } while ((i < wait_cnt) && (tmp_state != state)); ++ ++ ready = (i == wait_cnt && tmp_state != state) ? 
FALSE : TRUE; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(i == wait_cnt); ++#endif ++ return ready; ++} ++ ++static void ++rtl8125_wait_phy_ups_resume(struct net_device *dev, u16 PhyState) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ for (i=0; i< 100; i++) { ++ if (rtl8125_get_phy_state(tp) == PhyState) ++ break; ++ else ++ mdelay(1); ++ } ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(i == 100); ++#endif ++} ++ ++static void ++rtl8125_set_mcu_d3_stack(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x45E0); ++ break; ++ case CFG_METHOD_9: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x4782); ++ break; ++ case CFG_METHOD_10: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x4836); ++ break; ++ case CFG_METHOD_11: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x4848); ++ break; ++ case CFG_METHOD_12: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x4C76); ++ break; ++ default: ++ return; ++ } ++} ++ ++static void ++_rtl8125_enable_now_is_oob(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppNowIsOobVer == 1) ++ RTL_W8(tp, MCUCmd_reg, RTL_R8(tp, MCUCmd_reg) | Now_is_oob); ++} ++ ++void ++rtl8125_enable_now_is_oob(struct rtl8125_private *tp) ++{ ++ rtl8125_set_mcu_d3_stack(tp); ++ _rtl8125_enable_now_is_oob(tp); ++} ++ ++void ++rtl8125_disable_now_is_oob(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppNowIsOobVer == 1) ++ RTL_W8(tp, MCUCmd_reg, RTL_R8(tp, MCUCmd_reg) & ~Now_is_oob); ++} ++ ++static void ++rtl8125_exit_oob(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 data16; ++ ++ rtl8125_disable_rx_packet_filter(tp); ++ ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ rtl8125_driver_start(tp); ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ rtl8125_realwow_hw_init(dev); ++#else ++ //Disable realwow function ++ rtl8125_mac_ocp_write(tp, 0xC0BC, 0x00FF); ++#endif //ENABLE_REALWOW_SUPPORT ++ ++ rtl8125_nic_reset(dev); ++ ++ rtl8125_disable_now_is_oob(tp); ++ ++ data16 = rtl8125_mac_ocp_read(tp, 0xE8DE) & ~BIT_14; ++ rtl8125_mac_ocp_write(tp, 0xE8DE, data16); ++ rtl8125_wait_ll_share_fifo_ready(dev); ++ ++ rtl8125_mac_ocp_write(tp, 0xC0AA, 0x07D0); ++#ifdef ENABLE_LIB_SUPPORT ++ rtl8125_mac_ocp_write(tp, 0xC0A6, 0x04E2); ++#else ++ rtl8125_mac_ocp_write(tp, 0xC0A6, 0x01B5); ++#endif ++ rtl8125_mac_ocp_write(tp, 0xC01E, 0x5555); ++ ++ rtl8125_wait_ll_share_fifo_ready(dev); ++ ++ //wait ups resume (phy state 2) ++ if (rtl8125_is_ups_resume(dev)) { ++ rtl8125_wait_phy_ups_resume(dev, 2); ++ rtl8125_clear_ups_resume_bit(dev); ++ rtl8125_clear_phy_ups_reg(dev); ++ } ++} ++ ++void ++rtl8125_hw_disable_mac_mcu_bps(struct net_device *dev) ++{ ++ u16 regAddr; ++ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_enable_aspm_clkreq_lock(tp, 0); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x0000); ++ ++ for (regAddr = 0xFC28; regAddr < 0xFC48; regAddr += 2) { ++ rtl8125_mac_ocp_write(tp, regAddr, 0x0000); ++ } ++ ++ fsleep(3000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x0000); ++} ++ ++#ifndef ENABLE_USE_FIRMWARE_FILE ++static void ++rtl8125_switch_mac_mcu_ram_code_page(struct rtl8125_private *tp, u16 page) ++{ ++ u16 tmpUshort; ++ ++ page &= (BIT_1 | BIT_0); ++ tmpUshort = rtl8125_mac_ocp_read(tp, 0xE446); ++ tmpUshort &= ~(BIT_1 | 
BIT_0); ++ tmpUshort |= page; ++ rtl8125_mac_ocp_write(tp, 0xE446, tmpUshort); ++} ++ ++static void ++_rtl8125_set_hw_mcu_patch_code_ver(struct rtl8125_private *tp, u64 ver) ++{ ++ int i; ++ ++ /* Switch to page 2 */ ++ rtl8125_switch_mac_mcu_ram_code_page(tp, 2); ++ ++ for (i = 0; i < 8; i += 2) { ++ rtl8125_mac_ocp_write(tp, 0xF9F8 + 6 - i, (u16)ver); ++ ver >>= 16; ++ } ++ ++ /* Switch back to page 0 */ ++ rtl8125_switch_mac_mcu_ram_code_page(tp, 0); ++} ++ ++static void ++rtl8125_set_hw_mcu_patch_code_ver(struct rtl8125_private *tp, u64 ver) ++{ ++ _rtl8125_set_hw_mcu_patch_code_ver(tp, ver); ++ ++ tp->hw_mcu_patch_code_ver = ver; ++} ++ ++static u64 ++rtl8125_get_hw_mcu_patch_code_ver(struct rtl8125_private *tp) ++{ ++ u64 ver; ++ int i; ++ ++ /* Switch to page 2 */ ++ rtl8125_switch_mac_mcu_ram_code_page(tp, 2); ++ ++ ver = 0; ++ for (i = 0; i < 8; i += 2) { ++ ver <<= 16; ++ ver |= rtl8125_mac_ocp_read(tp, 0xF9F8 + i); ++ } ++ ++ /* Switch back to page 0 */ ++ rtl8125_switch_mac_mcu_ram_code_page(tp, 0); ++ ++ return ver; ++} ++ ++static u64 ++rtl8125_get_bin_mcu_patch_code_ver(const u16 *entry, u16 entry_cnt) ++{ ++ u64 ver; ++ int i; ++ ++ if (entry == NULL || entry_cnt == 0 || entry_cnt < 4) ++ return 0; ++ ++ ver = 0; ++ for (i = 0; i < 4; i++) { ++ ver <<= 16; ++ ver |= entry[entry_cnt - 4 + i]; ++ } ++ ++ return ver; ++} ++ ++static void ++_rtl8125_write_mac_mcu_ram_code(struct rtl8125_private *tp, const u16 *entry, u16 entry_cnt) ++{ ++ u16 i; ++ ++ for (i = 0; i < entry_cnt; i++) ++ rtl8125_mac_ocp_write(tp, 0xF800 + i * 2, entry[i]); ++} ++ ++static void ++_rtl8125_write_mac_mcu_ram_code_with_page(struct rtl8125_private *tp, const u16 *entry, u16 entry_cnt, u16 page_size) ++{ ++ u16 i; ++ u16 offset; ++ ++ if (page_size == 0) ++ return; ++ ++ for (i = 0; i < entry_cnt; i++) { ++ offset = i % page_size; ++ if (offset == 0) { ++ u16 page = (i / page_size); ++ rtl8125_switch_mac_mcu_ram_code_page(tp, page); ++ } ++ rtl8125_mac_ocp_write(tp, 0xF800 + offset * 2, entry[i]); ++ } ++} ++ ++static void ++rtl8125_write_mac_mcu_ram_code(struct rtl8125_private *tp, const u16 *entry, u16 entry_cnt) ++{ ++ if (FALSE == HW_SUPPORT_MAC_MCU(tp)) ++ return; ++ ++ if (entry == NULL || entry_cnt == 0) ++ return; ++ ++ if (tp->MacMcuPageSize > 0) ++ _rtl8125_write_mac_mcu_ram_code_with_page(tp, entry, entry_cnt, tp->MacMcuPageSize); ++ else ++ _rtl8125_write_mac_mcu_ram_code(tp, entry, entry_cnt); ++ ++ if (tp->bin_mcu_patch_code_ver > 0) ++ rtl8125_set_hw_mcu_patch_code_ver(tp, tp->bin_mcu_patch_code_ver); ++} ++ ++static void ++rtl8125_set_mac_mcu_8125a_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE062, 0xE072, 0xE074, 0xE079, 0xE07B, 0xE0E4, 0xE0ED, 0xE0EF, ++ 0xE0FA, 0xE105, 0xE116, 0xE11C, 0xE121, 0xE126, 0xE12A, 0xB400, 0xB401, ++ 0xB402, 0xB403, 0xB404, 0xB405, 0xC03F, 0x7206, 0x49AE, 0xF1FE, 0xC13C, ++ 0x9904, 0xC13B, 0x9906, 0x7206, 0x49AE, 0xF1FE, 0x7200, 0x49A0, 0xF10D, ++ 0xC534, 0xC133, 0xC238, 0xC338, 0xE817, 0xC337, 0xE815, 0xC336, 0xE813, ++ 0xC335, 0xE811, 0xE01B, 0xC129, 0xC22D, 0xC528, 0xC32C, 0xE80B, 0xC526, ++ 0xC32A, 0xE808, 0xC524, 0xC328, 0xE805, 0xC522, 0xC326, 0xE802, 0xE00C, ++ 0x740E, 0x49CE, 0xF1FE, 0x9908, 0x9D0A, 0x9A0C, 0x9B0E, 0x740E, 0x49CE, ++ 0xF1FE, 0xFF80, 0xB005, 0xB004, 0xB003, 0xB002, 0xB001, 0xB000, 0xC604, ++ 0xC002, 0xB800, 0x3044, 0xE000, 0xE8E0, 0xF128, 0x0002, 0xFFFF, 0x10EC, ++ 0x816A, 0x816F, 0x8164, 0x816D, 0xF000, 0x8001, 0x8002, 0x8003, 0x8004, ++ 0xC60F, 
0x73C4, 0x49B3, 0xF106, 0x73C2, 0xC608, 0xB406, 0xC609, 0xFF80, ++ 0xC605, 0xB406, 0xC605, 0xFF80, 0x0544, 0x0568, 0xE906, 0xCDE8, 0xC602, ++ 0xBE00, 0x0000, 0x48C1, 0x48C2, 0x9C46, 0xC402, 0xBC00, 0x0A12, 0xC602, ++ 0xBE00, 0x0EBA, 0x1501, 0xF02A, 0x1500, 0xF15D, 0xC661, 0x75C8, 0x49D5, ++ 0xF00A, 0x49D6, 0xF008, 0x49D7, 0xF006, 0x49D8, 0xF004, 0x75D2, 0x49D9, ++ 0xF150, 0xC553, 0x77A0, 0x75C8, 0x4855, 0x4856, 0x4857, 0x4858, 0x48DA, ++ 0x48DB, 0x49FE, 0xF002, 0x485A, 0x49FF, 0xF002, 0x485B, 0x9DC8, 0x75D2, ++ 0x4859, 0x9DD2, 0xC643, 0x75C0, 0x49D4, 0xF033, 0x49D1, 0xF137, 0xE030, ++ 0xC63A, 0x75C8, 0x49D5, 0xF00E, 0x49D6, 0xF00C, 0x49D7, 0xF00A, 0x49D8, ++ 0xF008, 0x75D2, 0x49D9, 0xF005, 0xC62E, 0x75C0, 0x49D7, 0xF125, 0xC528, ++ 0x77A0, 0xC627, 0x75C8, 0x4855, 0x4856, 0x4857, 0x4858, 0x48DA, 0x48DB, ++ 0x49FE, 0xF002, 0x485A, 0x49FF, 0xF002, 0x485B, 0x9DC8, 0x75D2, 0x4859, ++ 0x9DD2, 0xC616, 0x75C0, 0x4857, 0x9DC0, 0xC613, 0x75C0, 0x49DA, 0xF003, ++ 0x49D1, 0xF107, 0xC60B, 0xC50E, 0x48D9, 0x9DC0, 0x4859, 0x9DC0, 0xC608, ++ 0xC702, 0xBF00, 0x3AE0, 0xE860, 0xB400, 0xB5D4, 0xE908, 0xE86C, 0x1200, ++ 0xC409, 0x6780, 0x48F1, 0x8F80, 0xC404, 0xC602, 0xBE00, 0x10AA, 0xC010, ++ 0xEA7C, 0xC602, 0xBE00, 0x0000, 0x740A, 0x4846, 0x4847, 0x9C0A, 0xC607, ++ 0x74C0, 0x48C6, 0x9CC0, 0xC602, 0xBE00, 0x13FE, 0xE054, 0x72CA, 0x4826, ++ 0x4827, 0x9ACA, 0xC607, 0x72C0, 0x48A6, 0x9AC0, 0xC602, 0xBE00, 0x07DC, ++ 0xE054, 0xC60F, 0x74C4, 0x49CC, 0xF109, 0xC60C, 0x74CA, 0x48C7, 0x9CCA, ++ 0xC609, 0x74C0, 0x4846, 0x9CC0, 0xC602, 0xBE00, 0x2480, 0xE092, 0xE0C0, ++ 0xE054, 0x7420, 0x48C0, 0x9C20, 0x7444, 0xC602, 0xBE00, 0x12F8, 0x1BFF, ++ 0x46EB, 0x1BFF, 0xC102, 0xB900, 0x0D5A, 0x1BFF, 0x46EB, 0x1BFF, 0xC102, ++ 0xB900, 0x0E2A, 0xC104, 0xC202, 0xBA00, 0x21DE, 0xD116, 0xC602, 0xBE00, ++ 0x0000, 0x6486, 0x0119, 0x0606, 0x1327 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x0540); ++ rtl8125_mac_ocp_write(tp, 0xFC2E, 0x0A06); ++ rtl8125_mac_ocp_write(tp, 0xFC30, 0x0EB8); ++ rtl8125_mac_ocp_write(tp, 0xFC32, 0x3A5C); ++ rtl8125_mac_ocp_write(tp, 0xFC34, 0x10A8); ++ rtl8125_mac_ocp_write(tp, 0xFC40, 0x0D54); ++ rtl8125_mac_ocp_write(tp, 0xFC42, 0x0E24); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x307A); ++} ++ ++static void ++rtl8125_set_mac_mcu_8125b_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE01B, 0xE026, 0xE037, 0xE03D, 0xE057, 0xE05B, 0xE060, 0xE0B6, ++ 0xE103, 0xE14C, 0xE150, 0xE153, 0xE156, 0xE158, 0xE15A, 0x740A, 0x4846, ++ 0x4847, 0x9C0A, 0xC607, 0x74C0, 0x48C6, 0x9CC0, 0xC602, 0xBE00, 0x13F0, ++ 0xE054, 0x72CA, 0x4826, 0x4827, 0x9ACA, 0xC607, 0x72C0, 0x48A6, 0x9AC0, ++ 0xC602, 0xBE00, 0x081C, 0xE054, 0xC60F, 0x74C4, 0x49CC, 0xF109, 0xC60C, ++ 0x74CA, 0x48C7, 0x9CCA, 0xC609, 0x74C0, 0x4846, 0x9CC0, 0xC602, 0xBE00, ++ 0x2494, 0xE092, 0xE0C0, 0xE054, 0x7420, 0x48C0, 0x9C20, 0x7444, 0xC602, ++ 0xBE00, 0x12DC, 0x733A, 0x21B5, 0x25BC, 0x1304, 0xF111, 0x1B12, 0x1D2A, ++ 0x3168, 0x3ADA, 0x31AB, 0x1A00, 0x9AC0, 0x1300, 0xF1FB, 0x7620, 0x236E, ++ 0x276F, 0x1A3C, 0x22A1, 0x41B5, 0x9EE2, 0x76E4, 0x486F, 0x9EE4, 0xC602, ++ 0xBE00, 0x4A26, 0x733A, 0x49BB, 0xC602, 0xBE00, 0x47A2, 
0x48C1, 0x48C2, ++ 0x9C46, 0xC402, 0xBC00, 0x0A52, 0xC74B, 0x76E2, 0xC54A, 0x402E, 0xF034, ++ 0x76E0, 0x402E, 0xF006, 0xC703, 0xC403, 0xBC00, 0xC0BC, 0x0980, 0x76F0, ++ 0x1601, 0xF023, 0xC741, 0x1E04, 0x9EE0, 0x1E40, 0x9EE4, 0xC63D, 0x9EE8, ++ 0xC73D, 0x76E0, 0x4863, 0x9EE0, 0xC73A, 0x76E0, 0x48EA, 0x48EB, 0x9EE0, ++ 0xC736, 0x1E01, 0x9EE2, 0xC72D, 0x76E0, 0x486F, 0x9EE0, 0xC72D, 0x76E0, ++ 0x48E3, 0x9EE0, 0xC728, 0x1E0E, 0x9EE0, 0xC71D, 0x1E01, 0x9EE4, 0xE00D, ++ 0x1E00, 0x9EF0, 0x1E05, 0xC715, 0x9EE0, 0xE00A, 0x1E00, 0x9EE2, 0xC614, ++ 0x75CC, 0x48D2, 0x9DCC, 0x1E04, 0xC70B, 0x9EE0, 0xB000, 0xB001, 0xB002, ++ 0xB003, 0xB004, 0xB005, 0xB006, 0xB007, 0xFFC0, 0xE428, 0xD3C0, 0xBEEF, ++ 0x473E, 0xDC46, 0xE0CC, 0xE84E, 0xC0A2, 0x0100, 0xC010, 0xE85A, 0xE812, ++ 0xC0B4, 0xC5F4, 0x74A0, 0xC6F3, 0x4026, 0xF107, 0x74A2, 0xC6EF, 0x4026, ++ 0xF107, 0xC6ED, 0xBE00, 0x753A, 0xC602, 0xBE00, 0x462E, 0x7520, 0x49DE, ++ 0xF102, 0xE7F9, 0xC6A1, 0x67C6, 0x7520, 0x22D2, 0x26DD, 0x1500, 0xF002, ++ 0xE7F1, 0x7532, 0x26D5, 0x0530, 0x0D6C, 0xC42D, 0x308D, 0x7540, 0x4025, ++ 0xF11E, 0x7542, 0x4025, 0xF11B, 0x7544, 0x4025, 0xF118, 0xC423, 0x7546, ++ 0x4025, 0xF114, 0x7548, 0x4025, 0xF111, 0x754A, 0x4025, 0xF10E, 0xC5C0, ++ 0xC4C0, 0x9CA2, 0xC6C0, 0x75CC, 0x4852, 0x9DCC, 0xC6B8, 0x1D7D, 0x9DC2, ++ 0x1D01, 0x9DC0, 0xE7C9, 0xC40B, 0x7546, 0x4025, 0xF1FC, 0x7548, 0x4025, ++ 0xF1F9, 0x754A, 0x4025, 0xF1F6, 0xE7C0, 0xFFFF, 0xEEEE, 0xC2A6, 0x7340, ++ 0xC2A5, 0x4013, 0xF013, 0xC2AC, 0x7340, 0x4835, 0x9B40, 0xC240, 0x7358, ++ 0x48B7, 0x48B2, 0x9B58, 0x7346, 0x48B7, 0x48B2, 0x9B46, 0x7340, 0x48B7, ++ 0x48B2, 0x9B40, 0xE012, 0xC29A, 0x7340, 0x48B5, 0x9B40, 0xC22E, 0x7358, ++ 0x4837, 0x4832, 0x9B58, 0x7346, 0x4837, 0x4832, 0x9B46, 0x7340, 0x4837, ++ 0x4832, 0x9B40, 0xC283, 0x7340, 0x49BF, 0xF010, 0xC21B, 0x7344, 0x1300, ++ 0xF104, 0x1B00, 0xC217, 0x9B40, 0x1B01, 0xC213, 0x9B44, 0xC213, 0x734C, ++ 0x48B7, 0x9B4C, 0xE008, 0xC20C, 0x1B00, 0x9B44, 0xC20B, 0x734C, 0x4837, ++ 0x9B4C, 0xC204, 0xC302, 0xBB00, 0x2230, 0xE092, 0xD3C0, 0xE428, 0xDC46, ++ 0xC104, 0xC202, 0xBA00, 0x21F8, 0xD116, 0x49D1, 0xC602, 0xBE00, 0x3E7A, ++ 0x49D1, 0xC602, 0xBE00, 0x3EDA, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, ++ 0x0000, 0xC602, 0xBE00, 0x0000, 0x6637, 0x0119, 0x0604, 0x1203 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x13E6); ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x0812); ++ rtl8125_mac_ocp_write(tp, 0xFC2C, 0x248C); ++ rtl8125_mac_ocp_write(tp, 0xFC2E, 0x12DA); ++ rtl8125_mac_ocp_write(tp, 0xFC30, 0x4A20); ++ rtl8125_mac_ocp_write(tp, 0xFC32, 0x47A0); ++ //rtl8125_mac_ocp_write(tp, 0xFC34, 0x0A46); ++ //rtl8125_mac_ocp_write(tp, 0xFC36, 0x097E); ++ //rtl8125_mac_ocp_write(tp, 0xFC38, 0x462C); ++ //rtl8125_mac_ocp_write(tp, 0xFC3A, 0x222E); ++ rtl8125_mac_ocp_write(tp, 0xFC3C, 0x21F6); ++ rtl8125_mac_ocp_write(tp, 0xFC3E, 0x3E78); ++ rtl8125_mac_ocp_write(tp, 0xFC40, 0x3ED8); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x1C7B); ++} ++ ++static void ++rtl8125_set_mac_mcu_8125bp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE014, 0xE027, 0xE04A, 0xE04D, 0xE050, 0xE052, 0xE054, 0xE056, ++ 0xE058, 0xE05A, 0xE05C, 
0xE05E, 0xE060, 0xE062, 0xE064, 0x1BC8, 0x46EB, ++ 0xC302, 0xBB00, 0x0F14, 0xC211, 0x400A, 0xF00A, 0xC20F, 0x400A, 0xF007, ++ 0x73A4, 0xC20C, 0x400A, 0xF102, 0x48B0, 0x9B20, 0x1B00, 0x9BA0, 0xC602, ++ 0xBE00, 0x4364, 0xE6E0, 0xE6E2, 0xC01C, 0xB406, 0x1000, 0xF016, 0xC61F, ++ 0x400E, 0xF012, 0x218E, 0x25BE, 0x1300, 0xF007, 0x7340, 0xC618, 0x400E, ++ 0xF102, 0x48B0, 0x8320, 0xB400, 0x2402, 0x1000, 0xF003, 0x7342, 0x8322, ++ 0xB000, 0xE007, 0x7322, 0x9B42, 0x7320, 0x9B40, 0x0300, 0x0300, 0xB006, ++ 0xC302, 0xBB00, 0x413E, 0xE6E0, 0xC01C, 0x49D1, 0xC602, 0xBE00, 0x3F94, ++ 0x49D1, 0xC602, 0xBE00, 0x4030, 0xC602, 0xBE00, 0x3FDA, 0xC102, 0xB900, ++ 0x401A, 0xC102, 0xB900, 0x0000, 0xC002, 0xB800, 0x0000, 0xC602, 0xBE00, ++ 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, ++ 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, ++ 0x0000, 0x6936, 0x0A18, 0x0C02, 0x0D21 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x0f10); ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x435c); ++ rtl8125_mac_ocp_write(tp, 0xFC2C, 0x4112); ++ rtl8125_mac_ocp_write(tp, 0xFC2E, 0x3F92); ++ rtl8125_mac_ocp_write(tp, 0xFC30, 0x402E); ++ rtl8125_mac_ocp_write(tp, 0xFC32, 0x3FD6); ++ rtl8125_mac_ocp_write(tp, 0xFC34, 0x4018); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x007F); ++} ++ ++static void ++rtl8125_set_mac_mcu_8125bp_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE033, 0xE046, 0xE04A, 0xE04D, 0xE050, 0xE054, 0xE056, 0xE058, ++ 0xE05A, 0xE05C, 0xE05E, 0xE060, 0xE062, 0xE064, 0xE066, 0xB406, 0x1000, ++ 0xF016, 0xC61F, 0x400E, 0xF012, 0x218E, 0x25BE, 0x1300, 0xF007, 0x7340, ++ 0xC618, 0x400E, 0xF102, 0x48B0, 0x8320, 0xB400, 0x2402, 0x1000, 0xF003, ++ 0x7342, 0x8322, 0xB000, 0xE007, 0x7322, 0x9B42, 0x7320, 0x9B40, 0x0300, ++ 0x0300, 0xB006, 0xC302, 0xBB00, 0x4168, 0xE6E0, 0xC01C, 0xC211, 0x400A, ++ 0xF00A, 0xC20F, 0x400A, 0xF007, 0x73A4, 0xC20C, 0x400A, 0xF102, 0x48B0, ++ 0x9B20, 0x1B00, 0x9BA0, 0xC602, 0xBE00, 0x4392, 0xE6E0, 0xE6E2, 0xC01C, ++ 0x4166, 0x9CF6, 0xC002, 0xB800, 0x143C, 0x49D1, 0xC602, 0xBE00, 0x3FC4, ++ 0x49D1, 0xC602, 0xBE00, 0x405A, 0xC104, 0xC202, 0xBA00, 0x22E6, 0xD116, ++ 0xC602, 0xBE00, 0x0000, 0xC102, 0xB900, 0x0000, 0xC002, 0xB800, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0x6936, 0x0119, 0x030E, 0x0B18 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x413C); ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x438A); ++ rtl8125_mac_ocp_write(tp, 0xFC2C, 0x143A); ++ rtl8125_mac_ocp_write(tp, 0xFC2E, 0x3FC2); ++ rtl8125_mac_ocp_write(tp, 0xFC30, 0x4058); ++ rtl8125_mac_ocp_write(tp, 0xFC32, 0x22E4); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x003F); ++} 
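Aside: every rtl8125_set_mac_mcu_* helper above follows the same pattern — pack the last four u16 words of the bin patch array into a 64-bit version, compare it with the version the hardware reports from page 2 at OCP 0xF9F8..0xF9FE (page select is the low two bits of 0xE446), and rewrite the MCU RAM code at 0xF800 only when the two differ. The sketch below is a minimal, self-contained illustration of that flow under those assumptions; ocp_regs, ocp_read16()/ocp_write16() and maybe_load_patch() are hypothetical stand-ins for the driver's rtl8125_mac_ocp_* accessors and are not part of this patch.

        #include <stdint.h>
        #include <stddef.h>
        #include <stdio.h>

        /* Fake OCP register space so the sketch is self-contained; the driver
         * talks to real hardware via rtl8125_mac_ocp_read()/_write(). */
        static uint16_t ocp_regs[0x10000];
        static uint16_t ocp_read16(uint32_t reg)              { return ocp_regs[reg]; }
        static void     ocp_write16(uint32_t reg, uint16_t v) { ocp_regs[reg] = v; }

        /* The 64-bit patch version sits in the last four words of the bin array,
         * most-significant word first. */
        static uint64_t bin_patch_ver(const uint16_t *code, size_t cnt)
        {
                uint64_t ver = 0;

                if (!code || cnt < 4)
                        return 0;
                for (int i = 0; i < 4; i++)
                        ver = (ver << 16) | code[cnt - 4 + i];
                return ver;
        }

        /* Hardware keeps the same value in page 2 at 0xF9F8..0xF9FE. */
        static uint64_t hw_patch_ver(void)
        {
                uint64_t ver = 0;

                ocp_write16(0xE446, (ocp_read16(0xE446) & ~0x3u) | 2);  /* page 2 */
                for (int i = 0; i < 8; i += 2)
                        ver = (ver << 16) | ocp_read16(0xF9F8 + i);
                ocp_write16(0xE446, ocp_read16(0xE446) & ~0x3u);        /* page 0 */
                return ver;
        }

        /* Rewrite the MCU RAM code at 0xF800 only when the versions differ
         * (non-paged variant, cf. _rtl8125_write_mac_mcu_ram_code above). */
        static void maybe_load_patch(const uint16_t *code, size_t cnt)
        {
                if (hw_patch_ver() == bin_patch_ver(code, cnt))
                        return;
                for (size_t i = 0; i < cnt; i++)
                        ocp_write16(0xF800 + (uint32_t)(i * 2), code[i]);
                /* ...the driver would then store the bin version back into 0xF9F8..0xF9FE. */
        }

        int main(void)
        {
                static const uint16_t demo[] = { 0xC602, 0xBE00, 0x0000,
                                                 0x6936, 0x0119, 0x030E, 0x0B18 };

                maybe_load_patch(demo, sizeof(demo) / sizeof(demo[0]));
                printf("bin patch version: 0x%016llx\n",
                       (unsigned long long)bin_patch_ver(demo, sizeof(demo) / sizeof(demo[0])));
                return 0;
        }

The version gate is why each CFG_METHOD handler can be called unconditionally from rtl8125_hw_mac_mcu_config(): when the NIC already carries the same patch revision, only the jump-vector writes to 0xFC26..0xFC48 are repeated and the RAM upload is skipped.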
++ ++static void ++rtl8125_set_mac_mcu_8125d_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE014, 0xE018, 0xE01A, 0xE01C, 0xE01E, 0xE020, 0xE022, 0xE024, ++ 0xE026, 0xE028, 0xE02A, 0xE02C, 0xE02E, 0xE030, 0xE032, 0x4166, 0x9CF6, ++ 0xC002, 0xB800, 0x14A4, 0xC104, 0xC202, 0xBA00, 0x2378, 0xD116, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x6938, ++ 0x0A19, 0x030E, 0x0B2B ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x14A2); ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x2376); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x0003); ++} ++ ++static void 
++rtl8125_set_mac_mcu_8125d_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE014, 0xE016, 0xE018, 0xE01A, 0xE01C, 0xE01E, 0xE020, 0xE022, ++ 0xE024, 0xE026, 0xE028, 0xE02A, 0xE02C, 0xE02E, 0xE030, 0xC104, 0xC202, ++ 0xBA00, 0x2384, 0xD116, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x6938, ++ 0x0A19, 0x030E, 0x0B2F ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x2382); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x0001); ++} ++ ++ ++static void ++rtl8125_set_mac_mcu_8125cp_1(struct net_device *dev) ++{ ++ struct 
rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE014, 0xE016, 0xE018, 0xE01A, 0xE01C, 0xE01E, 0xE020, 0xE022, ++ 0xE024, 0xE026, 0xE028, 0xE02A, 0xE02C, 0xE02E, 0xE030, 0xC104, 0xC202, ++ 0xBA00, 0x2438, 0xD116, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0x7023, 0x0019, 0x031A, 0x0E20 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x2436); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x0001); ++} ++ ++static void ++rtl8125_hw_mac_mcu_config(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->NotWrMcuPatchCode == TRUE) ++ return; ++ ++ rtl8125_hw_disable_mac_mcu_bps(dev); ++ ++ /* Get H/W mac mcu patch code version */ ++ tp->hw_mcu_patch_code_ver = rtl8125_get_hw_mcu_patch_code_ver(tp); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_set_mac_mcu_8125a_2(dev); ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_set_mac_mcu_8125b_2(dev); ++ break; ++ case CFG_METHOD_8: ++ rtl8125_set_mac_mcu_8125bp_1(dev); ++ break; ++ case CFG_METHOD_9: ++ rtl8125_set_mac_mcu_8125bp_2(dev); ++ break; ++ case CFG_METHOD_10: ++ rtl8125_set_mac_mcu_8125d_1(dev); ++ break; ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ rtl8125_set_mac_mcu_8125d_2(dev); ++ break; ++ case CFG_METHOD_12: ++ rtl8125_set_mac_mcu_8125cp_1(dev); ++ break; ++ case CFG_METHOD_2: ++ case CFG_METHOD_4: ++ /* no mac mcu patch code */ ++ break; ++ default: ++ break; ++ } ++} ++#endif ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++static void rtl8125_release_firmware(struct rtl8125_private *tp) ++{ ++ if (tp->rtl_fw) { ++ rtl8125_fw_release_firmware(tp->rtl_fw); ++ kfree(tp->rtl_fw); ++ tp->rtl_fw = NULL; ++ } ++} ++ ++static void rtl8125_apply_firmware(struct rtl8125_private *tp) ++{ ++ unsigned long flags; ++ ++ /* TODO: release firmware if rtl_fw_write_firmware signals failure. */ ++ if (tp->rtl_fw) { ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_fw_write_firmware(tp, tp->rtl_fw); ++ /* At least one firmware doesn't reset tp->ocp_base. 
*/ ++ tp->ocp_base = OCP_STD_PHY_BASE; ++ ++ /* PHY soft reset may still be in progress */ ++ //phy_read_poll_timeout(tp->phydev, MII_BMCR, val, ++ // !(val & BMCR_RESET), ++ // 50000, 600000, true); ++ rtl8125_wait_phy_reset_complete(tp); ++ ++ tp->hw_ram_code_ver = rtl8125_get_hw_phy_mcu_code_ver(tp); ++ tp->sw_ram_code_ver = tp->hw_ram_code_ver; ++ tp->HwHasWrRamCodeToMicroP = TRUE; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ } ++} ++#endif ++ ++static void ++rtl8125_hw_init(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 csi_tmp; ++ ++ rtl8125_enable_aspm_clkreq_lock(tp, 0); ++ rtl8125_enable_force_clkreq(tp, 0); ++ ++ rtl8125_set_reg_oobs_en_sel(tp, true); ++ ++ //Disable UPS ++ rtl8125_mac_ocp_write(tp, 0xD40A, rtl8125_mac_ocp_read(tp, 0xD40A) & ~(BIT_4)); ++ ++#ifndef ENABLE_USE_FIRMWARE_FILE ++ if (!tp->rtl_fw) ++ rtl8125_hw_mac_mcu_config(dev); ++#endif ++ ++ /*disable ocp phy power saving*/ ++ if (tp->mcfg == CFG_METHOD_2 || ++ tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) ++ rtl8125_disable_ocp_phy_power_saving(dev); ++ ++ //Set PCIE uncorrectable error status mask pcie 0x108 ++ csi_tmp = rtl8125_csi_read(tp, 0x108); ++ csi_tmp |= BIT_20; ++ rtl8125_csi_write(tp, 0x108, csi_tmp); ++ ++ rtl8125_enable_cfg9346_write(tp); ++ rtl8125_disable_linkchg_wakeup(dev); ++ rtl8125_disable_cfg9346_write(tp); ++ rtl8125_disable_magic_packet(dev); ++ rtl8125_disable_d0_speedup(tp); ++ rtl8125_set_pci_pme(tp, 0); ++ if (s0_magic_packet == 1) ++ rtl8125_enable_magic_packet(dev); ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++ if (tp->rtl_fw && !tp->resume_not_chg_speed) ++ rtl8125_apply_firmware(tp); ++#endif ++} ++ ++static void ++rtl8125_hw_ephy_config(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ rtl8125_ephy_write(tp, 0x01, 0xA812); ++ rtl8125_ephy_write(tp, 0x09, 0x520C); ++ rtl8125_ephy_write(tp, 0x04, 0xD000); ++ rtl8125_ephy_write(tp, 0x0D, 0xF702); ++ rtl8125_ephy_write(tp, 0x0A, 0x8653); ++ rtl8125_ephy_write(tp, 0x06, 0x001E); ++ rtl8125_ephy_write(tp, 0x08, 0x3595); ++ rtl8125_ephy_write(tp, 0x20, 0x9455); ++ rtl8125_ephy_write(tp, 0x21, 0x99FF); ++ rtl8125_ephy_write(tp, 0x02, 0x6046); ++ rtl8125_ephy_write(tp, 0x29, 0xFE00); ++ rtl8125_ephy_write(tp, 0x23, 0xAB62); ++ ++ rtl8125_ephy_write(tp, 0x41, 0xA80C); ++ rtl8125_ephy_write(tp, 0x49, 0x520C); ++ rtl8125_ephy_write(tp, 0x44, 0xD000); ++ rtl8125_ephy_write(tp, 0x4D, 0xF702); ++ rtl8125_ephy_write(tp, 0x4A, 0x8653); ++ rtl8125_ephy_write(tp, 0x46, 0x001E); ++ rtl8125_ephy_write(tp, 0x48, 0x3595); ++ rtl8125_ephy_write(tp, 0x60, 0x9455); ++ rtl8125_ephy_write(tp, 0x61, 0x99FF); ++ rtl8125_ephy_write(tp, 0x42, 0x6046); ++ rtl8125_ephy_write(tp, 0x69, 0xFE00); ++ rtl8125_ephy_write(tp, 0x63, 0xAB62); ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_ephy_write(tp, 0x04, 0xD000); ++ rtl8125_ephy_write(tp, 0x0A, 0x8653); ++ rtl8125_ephy_write(tp, 0x23, 0xAB66); ++ rtl8125_ephy_write(tp, 0x20, 0x9455); ++ rtl8125_ephy_write(tp, 0x21, 0x99FF); ++ rtl8125_ephy_write(tp, 0x29, 0xFE04); ++ ++ rtl8125_ephy_write(tp, 0x44, 0xD000); ++ rtl8125_ephy_write(tp, 0x4A, 0x8653); ++ rtl8125_ephy_write(tp, 0x63, 0xAB66); ++ rtl8125_ephy_write(tp, 0x60, 0x9455); ++ rtl8125_ephy_write(tp, 0x61, 0x99FF); ++ rtl8125_ephy_write(tp, 0x69, 0xFE04); ++ ++ ClearAndSetPCIePhyBit(tp, ++ 0x2A, ++ (BIT_14 | BIT_13 | BIT_12), ++ (BIT_13 | BIT_12)); ++ ClearPCIePhyBit(tp, 0x19, BIT_6); ++ SetPCIePhyBit(tp, 0x1B, (BIT_11 | 
BIT_10 | BIT_9)); ++ ClearPCIePhyBit(tp, 0x1B, (BIT_14 | BIT_13 | BIT_12)); ++ rtl8125_ephy_write(tp, 0x02, 0x6042); ++ rtl8125_ephy_write(tp, 0x06, 0x0014); ++ ++ ClearAndSetPCIePhyBit(tp, ++ 0x6A, ++ (BIT_14 | BIT_13 | BIT_12), ++ (BIT_13 | BIT_12)); ++ ClearPCIePhyBit(tp, 0x59, BIT_6); ++ SetPCIePhyBit(tp, 0x5B, (BIT_11 | BIT_10 | BIT_9)); ++ ClearPCIePhyBit(tp, 0x5B, (BIT_14 | BIT_13 | BIT_12)); ++ rtl8125_ephy_write(tp, 0x42, 0x6042); ++ rtl8125_ephy_write(tp, 0x46, 0x0014); ++ break; ++ case CFG_METHOD_4: ++ rtl8125_ephy_write(tp, 0x06, 0x001F); ++ rtl8125_ephy_write(tp, 0x0A, 0xB66B); ++ rtl8125_ephy_write(tp, 0x01, 0xA852); ++ rtl8125_ephy_write(tp, 0x24, 0x0008); ++ rtl8125_ephy_write(tp, 0x2F, 0x6052); ++ rtl8125_ephy_write(tp, 0x0D, 0xF716); ++ rtl8125_ephy_write(tp, 0x20, 0xD477); ++ rtl8125_ephy_write(tp, 0x21, 0x4477); ++ rtl8125_ephy_write(tp, 0x22, 0x0013); ++ rtl8125_ephy_write(tp, 0x23, 0xBB66); ++ rtl8125_ephy_write(tp, 0x0B, 0xA909); ++ rtl8125_ephy_write(tp, 0x29, 0xFF04); ++ rtl8125_ephy_write(tp, 0x1B, 0x1EA0); ++ ++ rtl8125_ephy_write(tp, 0x46, 0x001F); ++ rtl8125_ephy_write(tp, 0x4A, 0xB66B); ++ rtl8125_ephy_write(tp, 0x41, 0xA84A); ++ rtl8125_ephy_write(tp, 0x64, 0x000C); ++ rtl8125_ephy_write(tp, 0x6F, 0x604A); ++ rtl8125_ephy_write(tp, 0x4D, 0xF716); ++ rtl8125_ephy_write(tp, 0x60, 0xD477); ++ rtl8125_ephy_write(tp, 0x61, 0x4477); ++ rtl8125_ephy_write(tp, 0x62, 0x0013); ++ rtl8125_ephy_write(tp, 0x63, 0xBB66); ++ rtl8125_ephy_write(tp, 0x4B, 0xA909); ++ rtl8125_ephy_write(tp, 0x69, 0xFF04); ++ rtl8125_ephy_write(tp, 0x5B, 0x1EA0); ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_ephy_write(tp, 0x0B, 0xA908); ++ rtl8125_ephy_write(tp, 0x1E, 0x20EB); ++ rtl8125_ephy_write(tp, 0x22, 0x0023); ++ rtl8125_ephy_write(tp, 0x02, 0x60C2); ++ rtl8125_ephy_write(tp, 0x29, 0xFF00); ++ ++ rtl8125_ephy_write(tp, 0x4B, 0xA908); ++ rtl8125_ephy_write(tp, 0x5E, 0x28EB); ++ rtl8125_ephy_write(tp, 0x62, 0x0023); ++ rtl8125_ephy_write(tp, 0x42, 0x60C2); ++ rtl8125_ephy_write(tp, 0x69, 0xFF00); ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ /* nothing to do */ ++ break; ++ } ++} ++ ++static u16 ++rtl8125_get_hw_phy_mcu_code_ver(struct rtl8125_private *tp) ++{ ++ u16 hw_ram_code_ver; ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801E); ++ hw_ram_code_ver = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA438); ++ ++ return hw_ram_code_ver; ++} ++ ++static int ++rtl8125_check_hw_phy_mcu_code_ver(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ tp->hw_ram_code_ver = rtl8125_get_hw_phy_mcu_code_ver(tp); ++ ++ if (tp->hw_ram_code_ver == tp->sw_ram_code_ver) { ++ tp->HwHasWrRamCodeToMicroP = TRUE; ++ return 1; ++ } else { ++ tp->HwHasWrRamCodeToMicroP = FALSE; ++ return 0; ++ } ++} ++ ++bool ++rtl8125_set_phy_mcu_patch_request(struct rtl8125_private *tp) ++{ ++ u16 gphy_val; ++ u16 WaitCount; ++ bool bSuccess = TRUE; ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB820, BIT_4); ++ ++ WaitCount = 0; ++ do { ++ gphy_val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xB800); ++ udelay(100); ++ WaitCount++; ++ } while (!(gphy_val & BIT_6) && (WaitCount < 1000)); ++ ++ if (!(gphy_val & BIT_6) && (WaitCount == 1000)) ++ bSuccess = FALSE; ++ ++ if (!bSuccess) ++ dprintk("rtl8125_set_phy_mcu_patch_request fail.\n"); ++ ++ return bSuccess; ++} ++ ++bool ++rtl8125_clear_phy_mcu_patch_request(struct rtl8125_private *tp) ++{ ++ u16 gphy_val; ++ u16 WaitCount; ++ 
bool bSuccess = TRUE; ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB820, BIT_4); ++ ++ WaitCount = 0; ++ do { ++ gphy_val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xB800); ++ udelay(100); ++ WaitCount++; ++ } while ((gphy_val & BIT_6) && (WaitCount < 1000)); ++ ++ if ((gphy_val & BIT_6) && (WaitCount == 1000)) ++ bSuccess = FALSE; ++ ++ if (!bSuccess) ++ dprintk("rtl8125_clear_phy_mcu_patch_request fail.\n"); ++ ++ return bSuccess; ++} ++ ++#ifndef ENABLE_USE_FIRMWARE_FILE ++static void ++rtl8125_write_hw_phy_mcu_code_ver(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, tp->sw_ram_code_ver); ++ tp->hw_ram_code_ver = tp->sw_ram_code_ver; ++} ++ ++static void ++rtl8125_acquire_phy_mcu_patch_key_lock(struct rtl8125_private *tp) ++{ ++ u16 PatchKey; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ PatchKey = 0x8600; ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ PatchKey = 0x8601; ++ break; ++ case CFG_METHOD_4: ++ PatchKey = 0x3700; ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ PatchKey = 0x3701; ++ break; ++ default: ++ return; ++ } ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, PatchKey); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xB82E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0001); ++} ++ ++static void ++rtl8125_release_phy_mcu_patch_key_lock(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB82E, BIT_0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++rtl8125_set_phy_mcu_ram_code(struct net_device *dev, const u16 *ramcode, u16 codesize) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 i; ++ u16 addr; ++ u16 val; ++ ++ if (ramcode == NULL || codesize % 2) { ++ goto out; ++ } ++ ++ for (i = 0; i < codesize; i += 2) { ++ addr = ramcode[i]; ++ val = ramcode[i + 1]; ++ if (addr == 0xFFFF && val == 0xFFFF) { ++ break; ++ } ++ rtl8125_mdio_direct_write_phy_ocp(tp, addr, val); ++ } ++ ++out: ++ return; ++} ++ ++static void ++rtl8125_enable_phy_disable_mode(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppCheckPhyDisableModeVer) { ++ case 3: ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) | BIT_5); ++ break; ++ } ++ ++ dprintk("enable phy disable mode.\n"); ++} ++ ++static void ++rtl8125_disable_phy_disable_mode(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppCheckPhyDisableModeVer) { ++ case 3: ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) & ~BIT_5); ++ break; ++ } ++ ++ mdelay(1); ++ ++ dprintk("disable phy disable mode.\n"); ++} ++ ++static void ++rtl8125_set_hw_phy_before_init_phy_mcu(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 PhyRegValue; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBF86, 0x9000); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ ++ PhyRegValue = rtl8125_mdio_direct_read_phy_ocp(tp, 
0xBF86); ++ PhyRegValue &= (BIT_1 | BIT_0); ++ if (PhyRegValue != 0) ++ dprintk("PHY watch dog not clear, value = 0x%x \n", PhyRegValue); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBD86, 0x1010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBD88, 0x1010); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD4E, ++ BIT_11 | BIT_10, ++ BIT_11); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF46, ++ BIT_11 | BIT_10 | BIT_9 | BIT_8, ++ BIT_10 | BIT_9 | BIT_8); ++ break; ++ } ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125a_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_acquire_phy_mcu_patch_key_lock(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB820, BIT_7); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8013); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8021); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x802f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x803d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8042); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8051); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8051); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa088); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a50); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1a3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x401a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd707); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40c2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60a6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f8b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a6c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8080); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd019); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1a2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x401a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd707); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40c4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60a6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f8b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 
0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a84); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8970); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c07); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0901); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcf09); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd705); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xceff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf0a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1213); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8401); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8580); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1253); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd064); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd181); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4018); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd706); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2c59); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x804d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc60f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc605); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x10fd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA026); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA022); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x10f4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1252); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA006); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1206); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA004); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a78); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a60); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a4f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3f00); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 
0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8066); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x807c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8089); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x808e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80b2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80c2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62db); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x655c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0505); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0509); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x653c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0502); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0506); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x050a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0505); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0506); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x050c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0509); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x050a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x050c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0508); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0321); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0502); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0321); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0321); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0508); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0321); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0346); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8208); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x609d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x001a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x001a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x607d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60fd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaa0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x017b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a05); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x017b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60fd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaa0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01e0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a05); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01e0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60fd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaa0f); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0231); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a05); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0231); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0221); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01ce); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA088); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0169); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA086); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00a6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA084); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x000d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA082); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0308); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA080); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x029f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA090); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x007f); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8017); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8029); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8054); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x805a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8064); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80a7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9430); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9480); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb408); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd120); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd057); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x064b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb80); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9906); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0567); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb94); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8406); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8dff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0773); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb91); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4063); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd139); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd140); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07dc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa110); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa2a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4045); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa180); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x405d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa720); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0742); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07ec); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f74); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0742); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7fb6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07dc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x064b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07c0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5fa7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0481); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x94bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x870c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa00a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa280); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8220); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x078e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb92); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4063); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd140); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd150); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd703); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6121); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61a2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6223); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf02f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d10); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf00f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d20); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf00a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d30); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf005); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d40); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4046); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x405d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa720); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0742); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07f7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f74); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0742); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7fb5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3ad4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0537); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x064b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8301); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa70c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9402); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x890c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x064b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0642); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0686); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0788); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA108); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x047b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA106); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x065c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA104); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0769); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA102); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0565); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x06f9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA110); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ff); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb87c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8530); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb87e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf85); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3caf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8593); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf85); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9caf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x85a5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5afb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe083); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfb0c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x020d); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x021b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x10bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86d7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbe0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x83fc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1b10); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xda02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xdd02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5afb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe083); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfd0c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x020d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x021b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x10bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86dd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86e0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbe0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x83fe); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1b10); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf2f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2cac); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0286); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x65af); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x212b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x022c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86b6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf21); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cd1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8710); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x870d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8716); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x871f); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x871c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8728); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8725); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8707); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbad); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x281c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1302); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2202); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2b02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae1a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd101); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1302); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2202); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2b02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd101); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3402); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3102); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3d02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3a02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4302); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4c02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4902); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2e02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4602); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4f02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf35); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7ff8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfaef); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x69bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86e3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86fb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86e6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86fe); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86ec); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86ef); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0262); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7cbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86f2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0262); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7cbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86f5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0262); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7cbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86f8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0262); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7cef); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x96fe); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfc04); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf8fa); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xef69); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xef02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6273); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf202); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6273); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf502); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6273); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf802); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6273); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xef96); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfefc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0420); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb540); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x53b5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4086); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb540); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb9b5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40c8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb03a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc8b0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbac8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb13a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc8b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xba77); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd26); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffbd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2677); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd28); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffbd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd26); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc8bd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2640); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd28); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc8bd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x28bb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa430); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x98b0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1eba); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb01e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xdcb0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e98); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb09e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbab0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9edc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb09e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x98b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1eba); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb11e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xdcb1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e98); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb19e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbab1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9edc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb19e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x11b0); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e22); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb01e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x33b0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e11); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb09e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x22b0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9e33); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb09e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x11b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e22); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb11e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x33b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e11); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb19e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x22b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9e33); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb19e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb85e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2f71); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb860); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x20d9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb862); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2109); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb864); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x34e7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb878); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x000f); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB820, BIT_7); ++ ++ ++ rtl8125_release_phy_mcu_patch_key_lock(tp); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125a_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125a_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125a_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_acquire_phy_mcu_patch_key_lock(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB820, BIT_7); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x808b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x808f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8093); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8097); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x809d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80a1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80aa); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x607b); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf00e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x42da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf01e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x615b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14a4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f2e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf01c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14a4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f2e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14a4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f2e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf02c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14a4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f2e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf034); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4118); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac11); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa410); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4779); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1444); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf034); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4118); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac22); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa420); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4559); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1444); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf023); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4118); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac44); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa440); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4339); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1444); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4118); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac88); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa480); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4119); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1444); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf001); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5fac); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc48f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x141b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x121a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd0b4); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1bb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0898); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd0b4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1bb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a0e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd064); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd18a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0b7e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x401c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa804); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8804); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x053b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa301); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0648); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc520); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa201); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x252d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1646); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd708); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4006); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1646); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0308); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA026); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0307); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1645); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA022); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0647); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x053a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA006); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0b7c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA004); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a0c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0896); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x11a1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xff00); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8015); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xad02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x02d7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ed); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0509); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x008f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA088); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA086); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA084); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA082); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x008d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA080); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00eb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA090); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0103); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8018); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8051); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8055); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8072); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80dc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfffd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfffd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8301); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa70c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9402); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x890c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa380); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x066e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb91); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4063); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd139); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd140); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa110); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa2a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4085); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa180); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8280); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x405d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa720); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0743); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07f0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f74); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0743); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7fb6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e0); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x066e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd158); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd04d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03d4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x94bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x870c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8380); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd10d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07c4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5fb4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa00a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa280); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa220); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd130); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07c4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5fb4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbb80); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1c4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd074); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa301); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x604b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa90c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0556); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb92); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4063); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd116); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd119); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd703); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6241); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x63e2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6583); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf054); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x611e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d10); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf02f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d50); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf02a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x611e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d20); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf021); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d60); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf01c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x611e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d30); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf013); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d70); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf00e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x611e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d40); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf005); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d80); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x405d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa720); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ff4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4046); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0743); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07fb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd703); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7f6f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7f4e); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7f2d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7f0c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0743); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7fb5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3ad4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0556); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x066e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1f5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd049); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01ec); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01ea); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x06a9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x078a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA108); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03d2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA106); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x067f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA104); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0665); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA102); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA110); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00fc); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb87c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8530); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb87e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf85); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3caf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8545); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf85); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x45af); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8545); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xee82); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf900); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0103); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf03); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7f8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe0a6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00e1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa601); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xef01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x58f0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa080); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x37a1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8402);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae16);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa185);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x02ae);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x11a1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8702);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae0c);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa188);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x02ae);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07a1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8902);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae02);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae1c);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe0b4);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62e1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb463);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6901);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe4b4);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62e5);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb463);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe0b4);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62e1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb463);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6901);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe4b4);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62e5);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb463);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfc04);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb85e);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03b3);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb860);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb862);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb864);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb878);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0001);
++
++
++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB820, BIT_7);
++
++
++ rtl8125_release_phy_mcu_patch_key_lock(tp);
++}
++
++static void
++rtl8125_set_phy_mcu_8125a_2(struct net_device *dev)
++{
++ struct rtl8125_private *tp = netdev_priv(dev);
++
++ rtl8125_set_phy_mcu_patch_request(tp);
++
++ rtl8125_real_set_phy_mcu_8125a_2(dev);
++
++ rtl8125_clear_phy_mcu_patch_request(tp);
++}
++
++static const u16 phy_mcu_ram_code_8125b_1[] = {
++ 0xa436, 0x8024, 0xa438, 0x3700, 0xa436, 0xB82E, 0xa438, 0x0001,
++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012,
++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010,
++ 0xa438, 0x1800, 0xa438, 0x8025, 0xa438, 0x1800, 0xa438, 0x803a,
++ 0xa438, 0x1800, 0xa438, 0x8044, 0xa438, 0x1800, 0xa438, 0x8083,
++ 0xa438, 0x1800, 0xa438, 0x808d, 0xa438, 0x1800, 0xa438, 0x808d,
++ 0xa438, 0x1800, 0xa438, 0x808d, 0xa438, 0xd712, 0xa438, 0x4077,
++ 0xa438, 0xd71e, 0xa438, 0x4159, 0xa438, 0xd71e, 0xa438, 0x6099,
++ 0xa438, 0x7f44, 0xa438, 0x1800, 0xa438, 0x1a14, 0xa438, 0x9040,
++ 0xa438, 0x9201, 0xa438, 0x1800, 0xa438, 0x1b1a, 0xa438, 0xd71e,
++ 0xa438, 0x2425, 0xa438, 0x1a14, 0xa438, 0xd71f, 0xa438, 0x3ce5,
++ 0xa438, 0x1afb, 0xa438, 0x1800, 0xa438, 0x1b00, 0xa438, 0xd712,
++ 0xa438, 0x4077, 0xa438, 0xd71e, 0xa438, 0x4159,
0xa438, 0xd71e, ++ 0xa438, 0x60b9, 0xa438, 0x2421, 0xa438, 0x1c17, 0xa438, 0x1800, ++ 0xa438, 0x1a14, 0xa438, 0x9040, 0xa438, 0x1800, 0xa438, 0x1c2c, ++ 0xa438, 0xd71e, 0xa438, 0x2425, 0xa438, 0x1a14, 0xa438, 0xd71f, ++ 0xa438, 0x3ce5, 0xa438, 0x1c0f, 0xa438, 0x1800, 0xa438, 0x1c13, ++ 0xa438, 0xd702, 0xa438, 0xd501, 0xa438, 0x6072, 0xa438, 0x8401, ++ 0xa438, 0xf002, 0xa438, 0xa401, 0xa438, 0x1000, 0xa438, 0x146e, ++ 0xa438, 0x1800, 0xa438, 0x0b77, 0xa438, 0xd703, 0xa438, 0x665d, ++ 0xa438, 0x653e, 0xa438, 0x641f, 0xa438, 0xd700, 0xa438, 0x62c4, ++ 0xa438, 0x6185, 0xa438, 0x6066, 0xa438, 0x1800, 0xa438, 0x165a, ++ 0xa438, 0xc101, 0xa438, 0xcb00, 0xa438, 0x1000, 0xa438, 0x1945, ++ 0xa438, 0xd700, 0xa438, 0x7fa6, 0xa438, 0x1800, 0xa438, 0x807d, ++ 0xa438, 0xc102, 0xa438, 0xcb00, 0xa438, 0x1000, 0xa438, 0x1945, ++ 0xa438, 0xd700, 0xa438, 0x2569, 0xa438, 0x8058, 0xa438, 0x1800, ++ 0xa438, 0x807d, 0xa438, 0xc104, 0xa438, 0xcb00, 0xa438, 0x1000, ++ 0xa438, 0x1945, 0xa438, 0xd700, 0xa438, 0x7fa4, 0xa438, 0x1800, ++ 0xa438, 0x807d, 0xa438, 0xc120, 0xa438, 0xcb00, 0xa438, 0x1000, ++ 0xa438, 0x1945, 0xa438, 0xd703, 0xa438, 0x7fbf, 0xa438, 0x1800, ++ 0xa438, 0x807d, 0xa438, 0xc140, 0xa438, 0xcb00, 0xa438, 0x1000, ++ 0xa438, 0x1945, 0xa438, 0xd703, 0xa438, 0x7fbe, 0xa438, 0x1800, ++ 0xa438, 0x807d, 0xa438, 0xc180, 0xa438, 0xcb00, 0xa438, 0x1000, ++ 0xa438, 0x1945, 0xa438, 0xd703, 0xa438, 0x7fbd, 0xa438, 0xc100, ++ 0xa438, 0xcb00, 0xa438, 0xd708, 0xa438, 0x6018, 0xa438, 0x1800, ++ 0xa438, 0x165a, 0xa438, 0x1000, 0xa438, 0x14f6, 0xa438, 0xd014, ++ 0xa438, 0xd1e3, 0xa438, 0x1000, 0xa438, 0x1356, 0xa438, 0xd705, ++ 0xa438, 0x5fbe, 0xa438, 0x1800, 0xa438, 0x1559, 0xa436, 0xA026, ++ 0xa438, 0xffff, 0xa436, 0xA024, 0xa438, 0xffff, 0xa436, 0xA022, ++ 0xa438, 0xffff, 0xa436, 0xA020, 0xa438, 0x1557, 0xa436, 0xA006, ++ 0xa438, 0x1677, 0xa436, 0xA004, 0xa438, 0x0b75, 0xa436, 0xA002, ++ 0xa438, 0x1c17, 0xa436, 0xA000, 0xa438, 0x1b04, 0xa436, 0xA008, ++ 0xa438, 0x1f00, 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x817f, 0xa438, 0x1800, 0xa438, 0x82ab, ++ 0xa438, 0x1800, 0xa438, 0x83f8, 0xa438, 0x1800, 0xa438, 0x8444, ++ 0xa438, 0x1800, 0xa438, 0x8454, 0xa438, 0x1800, 0xa438, 0x8459, ++ 0xa438, 0x1800, 0xa438, 0x8465, 0xa438, 0xcb11, 0xa438, 0xa50c, ++ 0xa438, 0x8310, 0xa438, 0xd701, 0xa438, 0x4076, 0xa438, 0x0c03, ++ 0xa438, 0x0903, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d00, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d00, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0x1000, 0xa438, 0x0a4d, ++ 0xa438, 0xcb12, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5f84, 0xa438, 0xd102, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xd701, ++ 0xa438, 0x60f3, 0xa438, 0xd413, 0xa438, 0x1000, 0xa438, 0x0a37, ++ 0xa438, 0xd410, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xcb13, ++ 0xa438, 0xa108, 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8108, ++ 0xa438, 0xa00a, 0xa438, 0xa910, 0xa438, 0xa780, 0xa438, 0xd14a, ++ 0xa438, 0xd048, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd701, ++ 0xa438, 0x6255, 0xa438, 0xd700, 0xa438, 0x5f74, 0xa438, 0x6326, ++ 0xa438, 0xd702, 0xa438, 0x5f07, 0xa438, 0x800a, 0xa438, 0xa004, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8004, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0x0c03, ++ 0xa438, 0x0902, 0xa438, 0xffe2, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x5fab, 0xa438, 0xba08, 
0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f8b, 0xa438, 0x9a08, ++ 0xa438, 0x800a, 0xa438, 0xd702, 0xa438, 0x6535, 0xa438, 0xd40d, ++ 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xcb14, 0xa438, 0xa004, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8004, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0xa00a, ++ 0xa438, 0xa780, 0xa438, 0xd14a, 0xa438, 0xd048, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0x6206, ++ 0xa438, 0xd702, 0xa438, 0x5f47, 0xa438, 0x800a, 0xa438, 0xa004, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8004, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0x0c03, ++ 0xa438, 0x0902, 0xa438, 0x1800, 0xa438, 0x8064, 0xa438, 0x800a, ++ 0xa438, 0xd40e, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0xd701, 0xa438, 0x6073, 0xa438, 0xd701, ++ 0xa438, 0x4216, 0xa438, 0xa004, 0xa438, 0x1000, 0xa438, 0x0a42, ++ 0xa438, 0x8004, 0xa438, 0xa001, 0xa438, 0x1000, 0xa438, 0x0a42, ++ 0xa438, 0x8001, 0xa438, 0xd120, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0x8504, ++ 0xa438, 0xcb21, 0xa438, 0xa301, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5f9f, 0xa438, 0x8301, 0xa438, 0xd704, ++ 0xa438, 0x40e0, 0xa438, 0xd196, 0xa438, 0xd04d, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xcb22, ++ 0xa438, 0x1000, 0xa438, 0x0a6d, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa640, 0xa438, 0x9503, 0xa438, 0x8910, 0xa438, 0x8720, ++ 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d01, ++ 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d01, 0xa438, 0x1000, ++ 0xa438, 0x0a7d, 0xa438, 0x0c1f, 0xa438, 0x0f14, 0xa438, 0xcb23, ++ 0xa438, 0x8fc0, 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0xaf40, ++ 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0x0cc0, 0xa438, 0x0f80, ++ 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0xafc0, 0xa438, 0x1000, ++ 0xa438, 0x0a25, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd701, ++ 0xa438, 0x5dee, 0xa438, 0xcb24, 0xa438, 0x8f1f, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd701, 0xa438, 0x7f6e, 0xa438, 0xa111, ++ 0xa438, 0xa215, 0xa438, 0xa401, 0xa438, 0x8404, 0xa438, 0xa720, ++ 0xa438, 0xcb25, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8640, ++ 0xa438, 0x9503, 0xa438, 0x1000, 0xa438, 0x0b43, 0xa438, 0x1000, ++ 0xa438, 0x0b86, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0xcb26, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x5f82, 0xa438, 0x8111, 0xa438, 0x8205, ++ 0xa438, 0x8404, 0xa438, 0xcb27, 0xa438, 0xd404, 0xa438, 0x1000, ++ 0xa438, 0x0a37, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d02, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0xa710, 0xa438, 0xa104, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8104, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0xa120, ++ 0xa438, 0xaa0f, 0xa438, 0x8110, 0xa438, 0xa284, 0xa438, 0xa404, ++ 0xa438, 0xa00a, 0xa438, 0xd193, 0xa438, 0xd046, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xcb28, ++ 0xa438, 0xa110, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fa8, 0xa438, 0x8110, 0xa438, 0x8284, 
0xa438, 0xa404, ++ 0xa438, 0x800a, 0xa438, 0x8710, 0xa438, 0xb804, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f82, 0xa438, 0x9804, ++ 0xa438, 0xcb29, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5f85, 0xa438, 0xa710, 0xa438, 0xb820, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f65, 0xa438, 0x9820, ++ 0xa438, 0xcb2a, 0xa438, 0xa190, 0xa438, 0xa284, 0xa438, 0xa404, ++ 0xa438, 0xa00a, 0xa438, 0xd13d, 0xa438, 0xd04a, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x3444, 0xa438, 0x8149, ++ 0xa438, 0xa220, 0xa438, 0xd1a0, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x3444, 0xa438, 0x8151, ++ 0xa438, 0xd702, 0xa438, 0x5f51, 0xa438, 0xcb2f, 0xa438, 0xa302, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd708, 0xa438, 0x5f63, ++ 0xa438, 0xd411, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0x8302, ++ 0xa438, 0xd409, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5fa3, 0xa438, 0x8190, 0xa438, 0x82a4, 0xa438, 0x8404, ++ 0xa438, 0x800a, 0xa438, 0xb808, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7fa3, 0xa438, 0x9808, 0xa438, 0x1800, ++ 0xa438, 0x0433, 0xa438, 0xcb15, 0xa438, 0xa508, 0xa438, 0xd700, ++ 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d01, 0xa438, 0xf003, ++ 0xa438, 0x0c1f, 0xa438, 0x0d01, 0xa438, 0x1000, 0xa438, 0x0a7d, ++ 0xa438, 0x1000, 0xa438, 0x0a4d, 0xa438, 0xa301, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5f9f, 0xa438, 0x8301, ++ 0xa438, 0xd704, 0xa438, 0x40e0, 0xa438, 0xd115, 0xa438, 0xd04f, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0xd413, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xcb16, ++ 0xa438, 0x1000, 0xa438, 0x0a6d, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa640, 0xa438, 0x9503, 0xa438, 0x8720, 0xa438, 0xd17a, ++ 0xa438, 0xd04c, 0xa438, 0x0c1f, 0xa438, 0x0f14, 0xa438, 0xcb17, ++ 0xa438, 0x8fc0, 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0xaf40, ++ 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0x0cc0, 0xa438, 0x0f80, ++ 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0xafc0, 0xa438, 0x1000, ++ 0xa438, 0x0a25, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd701, ++ 0xa438, 0x61ce, 0xa438, 0xd700, 0xa438, 0x5db4, 0xa438, 0xcb18, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8640, 0xa438, 0x9503, ++ 0xa438, 0xa720, 0xa438, 0x1000, 0xa438, 0x0b43, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xffd6, 0xa438, 0x8f1f, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd701, 0xa438, 0x7f8e, 0xa438, 0xa131, ++ 0xa438, 0xaa0f, 0xa438, 0xa2d5, 0xa438, 0xa407, 0xa438, 0xa720, ++ 0xa438, 0x8310, 0xa438, 0xa308, 0xa438, 0x8308, 0xa438, 0xcb19, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8640, 0xa438, 0x9503, ++ 0xa438, 0x1000, 0xa438, 0x0b43, 0xa438, 0x1000, 0xa438, 0x0b86, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xb920, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, 0xa438, 0x9920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f8c, ++ 0xa438, 0xcb1a, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5f82, 0xa438, 0x8111, 0xa438, 0x82c5, 0xa438, 0xa404, ++ 0xa438, 0x8402, 0xa438, 0xb804, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7f82, 0xa438, 0x9804, 0xa438, 0xcb1b, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5f85, ++ 0xa438, 0xa710, 0xa438, 0xb820, 0xa438, 0x1000, 
0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7f65, 0xa438, 0x9820, 0xa438, 0xcb1c, ++ 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, 0xa438, 0x1000, ++ 0xa438, 0x0a7d, 0xa438, 0xa110, 0xa438, 0xa284, 0xa438, 0xa404, ++ 0xa438, 0x8402, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fa8, 0xa438, 0xcb1d, 0xa438, 0xa180, 0xa438, 0xa402, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fa8, ++ 0xa438, 0xa220, 0xa438, 0xd1f5, 0xa438, 0xd049, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x3444, 0xa438, 0x8221, ++ 0xa438, 0xd702, 0xa438, 0x5f51, 0xa438, 0xb920, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, 0xa438, 0x9920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f8c, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fa3, ++ 0xa438, 0xa504, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d00, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d00, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0xa00a, 0xa438, 0x8190, ++ 0xa438, 0x82a4, 0xa438, 0x8402, 0xa438, 0xa404, 0xa438, 0xb808, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7fa3, ++ 0xa438, 0x9808, 0xa438, 0xcb2b, 0xa438, 0xcb2c, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5f84, 0xa438, 0xd14a, ++ 0xa438, 0xd048, 0xa438, 0xa780, 0xa438, 0xcb2d, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5f94, 0xa438, 0x6208, ++ 0xa438, 0xd702, 0xa438, 0x5f27, 0xa438, 0x800a, 0xa438, 0xa004, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8004, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0x0c03, ++ 0xa438, 0x0902, 0xa438, 0xa00a, 0xa438, 0xffe9, 0xa438, 0xcb2e, ++ 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, 0xa438, 0x1000, ++ 0xa438, 0x0a7d, 0xa438, 0xa190, 0xa438, 0xa284, 0xa438, 0xa406, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fa8, ++ 0xa438, 0xa220, 0xa438, 0xd1a0, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x3444, 0xa438, 0x827d, ++ 0xa438, 0xd702, 0xa438, 0x5f51, 0xa438, 0xcb2f, 0xa438, 0xa302, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd708, 0xa438, 0x5f63, ++ 0xa438, 0xd411, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0x8302, ++ 0xa438, 0xd409, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5fa3, 0xa438, 0x8190, 0xa438, 0x82a4, 0xa438, 0x8406, ++ 0xa438, 0x800a, 0xa438, 0xb808, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7fa3, 0xa438, 0x9808, 0xa438, 0x1800, ++ 0xa438, 0x0433, 0xa438, 0xcb30, 0xa438, 0x8380, 0xa438, 0xcb31, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5f86, ++ 0xa438, 0x9308, 0xa438, 0xb204, 0xa438, 0xb301, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd701, 0xa438, 0x5fa2, 0xa438, 0xb302, ++ 0xa438, 0x9204, 0xa438, 0xcb32, 0xa438, 0xd408, 0xa438, 0x1000, ++ 0xa438, 0x0a37, 0xa438, 0xd141, 0xa438, 0xd043, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xd704, ++ 0xa438, 0x4ccc, 0xa438, 0xd700, 0xa438, 0x4c81, 0xa438, 0xd702, ++ 0xa438, 0x609e, 0xa438, 0xd1e5, 0xa438, 0xd04d, 0xa438, 0xf003, ++ 0xa438, 0xd1e5, 0xa438, 0xd04d, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xd700, 
0xa438, 0x6083, ++ 0xa438, 0x0c1f, 0xa438, 0x0d01, 0xa438, 0xf003, 0xa438, 0x0c1f, ++ 0xa438, 0x0d01, 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0x8710, ++ 0xa438, 0xa108, 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8108, ++ 0xa438, 0xa203, 0xa438, 0x8120, 0xa438, 0x8a0f, 0xa438, 0xa111, ++ 0xa438, 0x8204, 0xa438, 0xa140, 0xa438, 0x1000, 0xa438, 0x0a42, ++ 0xa438, 0x8140, 0xa438, 0xd17a, 0xa438, 0xd04b, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xa204, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fa7, ++ 0xa438, 0xb920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5fac, 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7f8c, 0xa438, 0xd404, 0xa438, 0x1000, ++ 0xa438, 0x0a37, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d02, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0xa710, 0xa438, 0x8101, ++ 0xa438, 0x8201, 0xa438, 0xa104, 0xa438, 0x1000, 0xa438, 0x0a42, ++ 0xa438, 0x8104, 0xa438, 0xa120, 0xa438, 0xaa0f, 0xa438, 0x8110, ++ 0xa438, 0xa284, 0xa438, 0xa404, 0xa438, 0xa00a, 0xa438, 0xd193, ++ 0xa438, 0xd047, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0xa110, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fa8, 0xa438, 0xa180, 0xa438, 0xd13d, ++ 0xa438, 0xd04a, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0xf024, 0xa438, 0xa710, 0xa438, 0xa00a, ++ 0xa438, 0x8190, 0xa438, 0x8204, 0xa438, 0xa280, 0xa438, 0xa404, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fa7, ++ 0xa438, 0x8710, 0xa438, 0xb920, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x5fac, 0xa438, 0x9920, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f8c, 0xa438, 0x800a, ++ 0xa438, 0x8190, 0xa438, 0x8284, 0xa438, 0x8406, 0xa438, 0xd700, ++ 0xa438, 0x4121, 0xa438, 0xd701, 0xa438, 0x60f3, 0xa438, 0xd1e5, ++ 0xa438, 0xd04d, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0x8710, 0xa438, 0xa00a, 0xa438, 0x8190, ++ 0xa438, 0x8204, 0xa438, 0xa280, 0xa438, 0xa404, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0xcb33, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x5f85, 0xa438, 0xa710, 0xa438, 0xb820, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f65, ++ 0xa438, 0x9820, 0xa438, 0xcb34, 0xa438, 0xa00a, 0xa438, 0xa190, ++ 0xa438, 0xa284, 0xa438, 0xa404, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fa9, 0xa438, 0xd701, 0xa438, 0x6853, ++ 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d00, ++ 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d00, 0xa438, 0x1000, ++ 0xa438, 0x0a7d, 0xa438, 0x8190, 0xa438, 0x8284, 0xa438, 0xcb35, ++ 0xa438, 0xd407, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0x8110, ++ 0xa438, 0x8204, 0xa438, 0xa280, 0xa438, 0xa00a, 0xa438, 0xd704, ++ 0xa438, 0x4215, 0xa438, 0xa304, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fb8, 0xa438, 0xd1c3, 0xa438, 0xd043, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0x8304, 0xa438, 0xd700, 0xa438, 0x4109, 0xa438, 0xf01e, ++ 0xa438, 0xcb36, 0xa438, 0xd412, 0xa438, 0x1000, 0xa438, 0x0a37, ++ 0xa438, 0xd700, 0xa438, 0x6309, 0xa438, 0xd702, 0xa438, 0x42c7, ++ 0xa438, 0x800a, 0xa438, 0x8180, 0xa438, 0x8280, 0xa438, 0x8404, ++ 0xa438, 0xa004, 0xa438, 0x1000, 0xa438, 0x0a42, 
0xa438, 0x8004, ++ 0xa438, 0xa001, 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, ++ 0xa438, 0x0c03, 0xa438, 0x0902, 0xa438, 0xa00a, 0xa438, 0xd14a, ++ 0xa438, 0xd048, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d02, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0xcc55, 0xa438, 0xcb37, ++ 0xa438, 0xa00a, 0xa438, 0xa190, 0xa438, 0xa2a4, 0xa438, 0xa404, ++ 0xa438, 0xd700, 0xa438, 0x6041, 0xa438, 0xa402, 0xa438, 0xd13d, ++ 0xa438, 0xd04a, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fa9, 0xa438, 0xd702, 0xa438, 0x5f71, 0xa438, 0xcb38, ++ 0xa438, 0x8224, 0xa438, 0xa288, 0xa438, 0x8180, 0xa438, 0xa110, ++ 0xa438, 0xa404, 0xa438, 0x800a, 0xa438, 0xd700, 0xa438, 0x6041, ++ 0xa438, 0x8402, 0xa438, 0xd415, 0xa438, 0x1000, 0xa438, 0x0a37, ++ 0xa438, 0xd13d, 0xa438, 0xd04a, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xcb39, 0xa438, 0xa00a, ++ 0xa438, 0xa190, 0xa438, 0xa2a0, 0xa438, 0xa404, 0xa438, 0xd700, ++ 0xa438, 0x6041, 0xa438, 0xa402, 0xa438, 0xd17a, 0xa438, 0xd047, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0x1800, 0xa438, 0x0560, 0xa438, 0xa111, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0xd3f5, ++ 0xa438, 0xd219, 0xa438, 0x1000, 0xa438, 0x0c31, 0xa438, 0xd708, ++ 0xa438, 0x5fa5, 0xa438, 0xa215, 0xa438, 0xd30e, 0xa438, 0xd21a, ++ 0xa438, 0x1000, 0xa438, 0x0c31, 0xa438, 0xd708, 0xa438, 0x63e9, ++ 0xa438, 0xd708, 0xa438, 0x5f65, 0xa438, 0xd708, 0xa438, 0x7f36, ++ 0xa438, 0xa004, 0xa438, 0x1000, 0xa438, 0x0c35, 0xa438, 0x8004, ++ 0xa438, 0xa001, 0xa438, 0x1000, 0xa438, 0x0c35, 0xa438, 0x8001, ++ 0xa438, 0xd708, 0xa438, 0x4098, 0xa438, 0xd102, 0xa438, 0x9401, ++ 0xa438, 0xf003, 0xa438, 0xd103, 0xa438, 0xb401, 0xa438, 0x1000, ++ 0xa438, 0x0c27, 0xa438, 0xa108, 0xa438, 0x1000, 0xa438, 0x0c35, ++ 0xa438, 0x8108, 0xa438, 0x8110, 0xa438, 0x8294, 0xa438, 0xa202, ++ 0xa438, 0x1800, 0xa438, 0x0bdb, 0xa438, 0xd39c, 0xa438, 0xd210, ++ 0xa438, 0x1000, 0xa438, 0x0c31, 0xa438, 0xd708, 0xa438, 0x5fa5, ++ 0xa438, 0xd39c, 0xa438, 0xd210, 0xa438, 0x1000, 0xa438, 0x0c31, ++ 0xa438, 0xd708, 0xa438, 0x5fa5, 0xa438, 0x1000, 0xa438, 0x0c31, ++ 0xa438, 0xd708, 0xa438, 0x29b5, 0xa438, 0x840e, 0xa438, 0xd708, ++ 0xa438, 0x5f4a, 0xa438, 0x0c1f, 0xa438, 0x1014, 0xa438, 0x1000, ++ 0xa438, 0x0c31, 0xa438, 0xd709, 0xa438, 0x7fa4, 0xa438, 0x901f, ++ 0xa438, 0x1800, 0xa438, 0x0c23, 0xa438, 0xcb43, 0xa438, 0xa508, ++ 0xa438, 0xd701, 0xa438, 0x3699, 0xa438, 0x844a, 0xa438, 0xa504, ++ 0xa438, 0xa190, 0xa438, 0xa2a0, 0xa438, 0xa404, 0xa438, 0xa00a, ++ 0xa438, 0xd700, 0xa438, 0x2109, 0xa438, 0x05ea, 0xa438, 0xa402, ++ 0xa438, 0x1800, 0xa438, 0x05ea, 0xa438, 0xcb90, 0xa438, 0x0cf0, ++ 0xa438, 0x0ca0, 0xa438, 0x1800, 0xa438, 0x06db, 0xa438, 0xd1ff, ++ 0xa438, 0xd052, 0xa438, 0xa508, 0xa438, 0x8718, 0xa438, 0xa00a, ++ 0xa438, 0xa190, 0xa438, 0xa2a0, 0xa438, 0xa404, 0xa438, 0x0cf0, ++ 0xa438, 0x0c50, 0xa438, 0x1800, 0xa438, 0x09ef, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd704, 0xa438, 0x2e70, 0xa438, 0x06da, ++ 0xa438, 0xd700, 0xa438, 0x5f55, 0xa438, 0xa90c, 0xa438, 0x1800, ++ 0xa438, 0x0645, 0xa436, 0xA10E, 0xa438, 0x0644, 0xa436, 0xA10C, ++ 0xa438, 0x09e9, 0xa436, 0xA10A, 0xa438, 0x06da, 0xa436, 0xA108, ++ 0xa438, 0x05e1, 0xa436, 0xA106, 0xa438, 0x0be4, 0xa436, 0xA104, ++ 0xa438, 0x0435, 0xa436, 0xA102, 0xa438, 0x0141, 
0xa436, 0xA100, ++ 0xa438, 0x026d, 0xa436, 0xA110, 0xa438, 0x00ff, 0xa436, 0xb87c, ++ 0xa438, 0x85fe, 0xa436, 0xb87e, 0xa438, 0xaf86, 0xa438, 0x16af, ++ 0xa438, 0x8699, 0xa438, 0xaf86, 0xa438, 0xe5af, 0xa438, 0x86f9, ++ 0xa438, 0xaf87, 0xa438, 0x7aaf, 0xa438, 0x883a, 0xa438, 0xaf88, ++ 0xa438, 0x58af, 0xa438, 0x8b6c, 0xa438, 0xd48b, 0xa438, 0x7c02, ++ 0xa438, 0x8644, 0xa438, 0x2c00, 0xa438, 0x503c, 0xa438, 0xffd6, ++ 0xa438, 0xac27, 0xa438, 0x18e1, 0xa438, 0x82fe, 0xa438, 0xad28, ++ 0xa438, 0x0cd4, 0xa438, 0x8b84, 0xa438, 0x0286, 0xa438, 0x442c, ++ 0xa438, 0x003c, 0xa438, 0xac27, 0xa438, 0x06ee, 0xa438, 0x8299, ++ 0xa438, 0x01ae, 0xa438, 0x04ee, 0xa438, 0x8299, 0xa438, 0x00af, ++ 0xa438, 0x23dc, 0xa438, 0xf9fa, 0xa438, 0xcefa, 0xa438, 0xfbef, ++ 0xa438, 0x79fb, 0xa438, 0xc4bf, 0xa438, 0x8b76, 0xa438, 0x026c, ++ 0xa438, 0x6dac, 0xa438, 0x2804, 0xa438, 0xd203, 0xa438, 0xae02, ++ 0xa438, 0xd201, 0xa438, 0xbdd8, 0xa438, 0x19d9, 0xa438, 0xef94, ++ 0xa438, 0x026c, 0xa438, 0x6d78, 0xa438, 0x03ef, 0xa438, 0x648a, ++ 0xa438, 0x0002, 0xa438, 0xbdd8, 0xa438, 0x19d9, 0xa438, 0xef94, ++ 0xa438, 0x026c, 0xa438, 0x6d78, 0xa438, 0x03ef, 0xa438, 0x7402, ++ 0xa438, 0x72cd, 0xa438, 0xac50, 0xa438, 0x02ef, 0xa438, 0x643a, ++ 0xa438, 0x019f, 0xa438, 0xe4ef, 0xa438, 0x4678, 0xa438, 0x03ac, ++ 0xa438, 0x2002, 0xa438, 0xae02, 0xa438, 0xd0ff, 0xa438, 0xffef, ++ 0xa438, 0x97ff, 0xa438, 0xfec6, 0xa438, 0xfefd, 0xa438, 0x041f, ++ 0xa438, 0x771f, 0xa438, 0x221c, 0xa438, 0x450d, 0xa438, 0x481f, ++ 0xa438, 0x00ac, 0xa438, 0x7f04, 0xa438, 0x1a94, 0xa438, 0xae08, ++ 0xa438, 0x1a94, 0xa438, 0xac7f, 0xa438, 0x03d7, 0xa438, 0x0100, ++ 0xa438, 0xef46, 0xa438, 0x0d48, 0xa438, 0x1f00, 0xa438, 0x1c45, ++ 0xa438, 0xef69, 0xa438, 0xef57, 0xa438, 0xef74, 0xa438, 0x0272, ++ 0xa438, 0xe8a7, 0xa438, 0xffff, 0xa438, 0x0d1a, 0xa438, 0x941b, ++ 0xa438, 0x979e, 0xa438, 0x072d, 0xa438, 0x0100, 0xa438, 0x1a64, ++ 0xa438, 0xef76, 0xa438, 0xef97, 0xa438, 0x0d98, 0xa438, 0xd400, ++ 0xa438, 0xff1d, 0xa438, 0x941a, 0xa438, 0x89cf, 0xa438, 0x1a75, ++ 0xa438, 0xaf74, 0xa438, 0xf9bf, 0xa438, 0x8b79, 0xa438, 0x026c, ++ 0xa438, 0x6da1, 0xa438, 0x0005, 0xa438, 0xe180, 0xa438, 0xa0ae, ++ 0xa438, 0x03e1, 0xa438, 0x80a1, 0xa438, 0xaf26, 0xa438, 0x9aac, ++ 0xa438, 0x284d, 0xa438, 0xe08f, 0xa438, 0xffef, 0xa438, 0x10c0, ++ 0xa438, 0xe08f, 0xa438, 0xfe10, 0xa438, 0x1b08, 0xa438, 0xa000, ++ 0xa438, 0x04c8, 0xa438, 0xaf40, 0xa438, 0x67c8, 0xa438, 0xbf8b, ++ 0xa438, 0x8c02, 0xa438, 0x6c4e, 0xa438, 0xc4bf, 0xa438, 0x8b8f, ++ 0xa438, 0x026c, 0xa438, 0x6def, 0xa438, 0x74e0, 0xa438, 0x830c, ++ 0xa438, 0xad20, 0xa438, 0x0302, 0xa438, 0x74ac, 0xa438, 0xccef, ++ 0xa438, 0x971b, 0xa438, 0x76ad, 0xa438, 0x5f02, 0xa438, 0xae13, ++ 0xa438, 0xef69, 0xa438, 0xef30, 0xa438, 0x1b32, 0xa438, 0xc4ef, ++ 0xa438, 0x46e4, 0xa438, 0x8ffb, 0xa438, 0xe58f, 0xa438, 0xfce7, ++ 0xa438, 0x8ffd, 0xa438, 0xcc10, 0xa438, 0x11ae, 0xa438, 0xb8d1, ++ 0xa438, 0x00a1, 0xa438, 0x1f03, 0xa438, 0xaf40, 0xa438, 0x4fbf, ++ 0xa438, 0x8b8c, 0xa438, 0x026c, 0xa438, 0x4ec4, 0xa438, 0xbf8b, ++ 0xa438, 0x8f02, 0xa438, 0x6c6d, 0xa438, 0xef74, 0xa438, 0xe083, ++ 0xa438, 0x0cad, 0xa438, 0x2003, 0xa438, 0x0274, 0xa438, 0xaccc, ++ 0xa438, 0xef97, 0xa438, 0x1b76, 0xa438, 0xad5f, 0xa438, 0x02ae, ++ 0xa438, 0x04ef, 0xa438, 0x69ef, 0xa438, 0x3111, 0xa438, 0xaed1, ++ 0xa438, 0x0287, 0xa438, 0x80af, 0xa438, 0x2293, 0xa438, 0xf8f9, ++ 0xa438, 0xfafb, 0xa438, 0xef59, 0xa438, 0xe080, 0xa438, 0x13ad, ++ 0xa438, 0x252f, 0xa438, 0xbf88, 0xa438, 0x2802, 0xa438, 0x6c6d, ++ 0xa438, 0xef64, 0xa438, 0x1f44, 0xa438, 0xe18f, 
0xa438, 0xb91b, ++ 0xa438, 0x64ad, 0xa438, 0x4f1d, 0xa438, 0xd688, 0xa438, 0x2bd7, ++ 0xa438, 0x882e, 0xa438, 0x0274, 0xa438, 0x73ad, 0xa438, 0x5008, ++ 0xa438, 0xbf88, 0xa438, 0x3102, 0xa438, 0x737c, 0xa438, 0xae03, ++ 0xa438, 0x0287, 0xa438, 0xd0bf, 0xa438, 0x882b, 0xa438, 0x0273, ++ 0xa438, 0x73e0, 0xa438, 0x824c, 0xa438, 0xf621, 0xa438, 0xe482, ++ 0xa438, 0x4cbf, 0xa438, 0x8834, 0xa438, 0x0273, 0xa438, 0x7cef, ++ 0xa438, 0x95ff, 0xa438, 0xfefd, 0xa438, 0xfc04, 0xa438, 0xf8f9, ++ 0xa438, 0xfafb, 0xa438, 0xef79, 0xa438, 0xbf88, 0xa438, 0x1f02, ++ 0xa438, 0x737c, 0xa438, 0x1f22, 0xa438, 0xac32, 0xa438, 0x31ef, ++ 0xa438, 0x12bf, 0xa438, 0x8822, 0xa438, 0x026c, 0xa438, 0x4ed6, ++ 0xa438, 0x8fba, 0xa438, 0x1f33, 0xa438, 0xac3c, 0xa438, 0x1eef, ++ 0xa438, 0x13bf, 0xa438, 0x8837, 0xa438, 0x026c, 0xa438, 0x4eef, ++ 0xa438, 0x96d8, 0xa438, 0x19d9, 0xa438, 0xbf88, 0xa438, 0x2502, ++ 0xa438, 0x6c4e, 0xa438, 0xbf88, 0xa438, 0x2502, 0xa438, 0x6c4e, ++ 0xa438, 0x1616, 0xa438, 0x13ae, 0xa438, 0xdf12, 0xa438, 0xaecc, ++ 0xa438, 0xbf88, 0xa438, 0x1f02, 0xa438, 0x7373, 0xa438, 0xef97, ++ 0xa438, 0xfffe, 0xa438, 0xfdfc, 0xa438, 0x0466, 0xa438, 0xac88, ++ 0xa438, 0x54ac, 0xa438, 0x88f0, 0xa438, 0xac8a, 0xa438, 0x92ac, ++ 0xa438, 0xbadd, 0xa438, 0xac6c, 0xa438, 0xeeac, 0xa438, 0x6cff, ++ 0xa438, 0xad02, 0xa438, 0x99ac, 0xa438, 0x0030, 0xa438, 0xac88, ++ 0xa438, 0xd4c3, 0xa438, 0x5000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x00b4, 0xa438, 0xecee, ++ 0xa438, 0x8298, 0xa438, 0x00af, 0xa438, 0x1412, 0xa438, 0xf8bf, ++ 0xa438, 0x8b5d, 0xa438, 0x026c, 0xa438, 0x6d58, 0xa438, 0x03e1, ++ 0xa438, 0x8fb8, 0xa438, 0x2901, 0xa438, 0xe58f, 0xa438, 0xb8a0, ++ 0xa438, 0x0049, 0xa438, 0xef47, 0xa438, 0xe483, 0xa438, 0x02e5, ++ 0xa438, 0x8303, 0xa438, 0xbfc2, 0xa438, 0x5f1a, 0xa438, 0x95f7, ++ 0xa438, 0x05ee, 0xa438, 0xffd2, 0xa438, 0x00d8, 0xa438, 0xf605, ++ 0xa438, 0x1f11, 0xa438, 0xef60, 0xa438, 0xbf8b, 0xa438, 0x3002, ++ 0xa438, 0x6c4e, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c6d, ++ 0xa438, 0xf728, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c4e, ++ 0xa438, 0xf628, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c4e, ++ 0xa438, 0x0c64, 0xa438, 0xef46, 0xa438, 0xbf8b, 0xa438, 0x6002, ++ 0xa438, 0x6c4e, 0xa438, 0x0289, 0xa438, 0x9902, 0xa438, 0x3920, ++ 0xa438, 0xaf89, 0xa438, 0x96a0, 0xa438, 0x0149, 0xa438, 0xef47, ++ 0xa438, 0xe483, 0xa438, 0x04e5, 0xa438, 0x8305, 0xa438, 0xbfc2, ++ 0xa438, 0x5f1a, 0xa438, 0x95f7, 0xa438, 0x05ee, 0xa438, 0xffd2, ++ 0xa438, 0x00d8, 0xa438, 0xf605, 0xa438, 0x1f11, 0xa438, 0xef60, ++ 0xa438, 0xbf8b, 0xa438, 0x3002, 0xa438, 0x6c4e, 0xa438, 0xbf8b, ++ 0xa438, 0x3302, 0xa438, 0x6c6d, 0xa438, 0xf729, 0xa438, 0xbf8b, ++ 0xa438, 0x3302, 0xa438, 0x6c4e, 0xa438, 0xf629, 0xa438, 0xbf8b, ++ 0xa438, 0x3302, 0xa438, 0x6c4e, 0xa438, 0x0c64, 0xa438, 0xef46, ++ 0xa438, 0xbf8b, 0xa438, 0x6302, 0xa438, 0x6c4e, 0xa438, 0x0289, ++ 0xa438, 0x9902, 0xa438, 0x3920, 0xa438, 0xaf89, 0xa438, 0x96a0, ++ 0xa438, 0x0249, 0xa438, 0xef47, 0xa438, 0xe483, 0xa438, 0x06e5, ++ 0xa438, 0x8307, 0xa438, 0xbfc2, 0xa438, 0x5f1a, 0xa438, 0x95f7, ++ 0xa438, 0x05ee, 0xa438, 0xffd2, 0xa438, 0x00d8, 0xa438, 0xf605, ++ 0xa438, 0x1f11, 0xa438, 0xef60, 0xa438, 0xbf8b, 0xa438, 0x3002, ++ 0xa438, 0x6c4e, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c6d, ++ 0xa438, 0xf72a, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c4e, ++ 0xa438, 0xf62a, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c4e, ++ 0xa438, 0x0c64, 0xa438, 0xef46, 0xa438, 0xbf8b, 
0xa438, 0x6602, ++ 0xa438, 0x6c4e, 0xa438, 0x0289, 0xa438, 0x9902, 0xa438, 0x3920, ++ 0xa438, 0xaf89, 0xa438, 0x96ef, 0xa438, 0x47e4, 0xa438, 0x8308, ++ 0xa438, 0xe583, 0xa438, 0x09bf, 0xa438, 0xc25f, 0xa438, 0x1a95, ++ 0xa438, 0xf705, 0xa438, 0xeeff, 0xa438, 0xd200, 0xa438, 0xd8f6, ++ 0xa438, 0x051f, 0xa438, 0x11ef, 0xa438, 0x60bf, 0xa438, 0x8b30, ++ 0xa438, 0x026c, 0xa438, 0x4ebf, 0xa438, 0x8b33, 0xa438, 0x026c, ++ 0xa438, 0x6df7, 0xa438, 0x2bbf, 0xa438, 0x8b33, 0xa438, 0x026c, ++ 0xa438, 0x4ef6, 0xa438, 0x2bbf, 0xa438, 0x8b33, 0xa438, 0x026c, ++ 0xa438, 0x4e0c, 0xa438, 0x64ef, 0xa438, 0x46bf, 0xa438, 0x8b69, ++ 0xa438, 0x026c, 0xa438, 0x4e02, 0xa438, 0x8999, 0xa438, 0x0239, ++ 0xa438, 0x20af, 0xa438, 0x8996, 0xa438, 0xaf39, 0xa438, 0x1ef8, ++ 0xa438, 0xf9fa, 0xa438, 0xe08f, 0xa438, 0xb838, 0xa438, 0x02ad, ++ 0xa438, 0x2702, 0xa438, 0xae03, 0xa438, 0xaf8b, 0xa438, 0x201f, ++ 0xa438, 0x66ef, 0xa438, 0x65bf, 0xa438, 0xc21f, 0xa438, 0x1a96, ++ 0xa438, 0xf705, 0xa438, 0xeeff, 0xa438, 0xd200, 0xa438, 0xdaf6, ++ 0xa438, 0x05bf, 0xa438, 0xc22f, 0xa438, 0x1a96, 0xa438, 0xf705, ++ 0xa438, 0xeeff, 0xa438, 0xd200, 0xa438, 0xdbf6, 0xa438, 0x05ef, ++ 0xa438, 0x021f, 0xa438, 0x110d, 0xa438, 0x42bf, 0xa438, 0x8b3c, ++ 0xa438, 0x026c, 0xa438, 0x4eef, 0xa438, 0x021b, 0xa438, 0x031f, ++ 0xa438, 0x110d, 0xa438, 0x42bf, 0xa438, 0x8b36, 0xa438, 0x026c, ++ 0xa438, 0x4eef, 0xa438, 0x021a, 0xa438, 0x031f, 0xa438, 0x110d, ++ 0xa438, 0x42bf, 0xa438, 0x8b39, 0xa438, 0x026c, 0xa438, 0x4ebf, ++ 0xa438, 0xc23f, 0xa438, 0x1a96, 0xa438, 0xf705, 0xa438, 0xeeff, ++ 0xa438, 0xd200, 0xa438, 0xdaf6, 0xa438, 0x05bf, 0xa438, 0xc24f, ++ 0xa438, 0x1a96, 0xa438, 0xf705, 0xa438, 0xeeff, 0xa438, 0xd200, ++ 0xa438, 0xdbf6, 0xa438, 0x05ef, 0xa438, 0x021f, 0xa438, 0x110d, ++ 0xa438, 0x42bf, 0xa438, 0x8b45, 0xa438, 0x026c, 0xa438, 0x4eef, ++ 0xa438, 0x021b, 0xa438, 0x031f, 0xa438, 0x110d, 0xa438, 0x42bf, ++ 0xa438, 0x8b3f, 0xa438, 0x026c, 0xa438, 0x4eef, 0xa438, 0x021a, ++ 0xa438, 0x031f, 0xa438, 0x110d, 0xa438, 0x42bf, 0xa438, 0x8b42, ++ 0xa438, 0x026c, 0xa438, 0x4eef, 0xa438, 0x56d0, 0xa438, 0x201f, ++ 0xa438, 0x11bf, 0xa438, 0x8b4e, 0xa438, 0x026c, 0xa438, 0x4ebf, ++ 0xa438, 0x8b48, 0xa438, 0x026c, 0xa438, 0x4ebf, 0xa438, 0x8b4b, ++ 0xa438, 0x026c, 0xa438, 0x4ee1, 0xa438, 0x8578, 0xa438, 0xef03, ++ 0xa438, 0x480a, 0xa438, 0x2805, 0xa438, 0xef20, 0xa438, 0x1b01, ++ 0xa438, 0xad27, 0xa438, 0x3f1f, 0xa438, 0x44e0, 0xa438, 0x8560, ++ 0xa438, 0xe185, 0xa438, 0x61bf, 0xa438, 0x8b51, 0xa438, 0x026c, ++ 0xa438, 0x4ee0, 0xa438, 0x8566, 0xa438, 0xe185, 0xa438, 0x67bf, ++ 0xa438, 0x8b54, 0xa438, 0x026c, 0xa438, 0x4ee0, 0xa438, 0x856c, ++ 0xa438, 0xe185, 0xa438, 0x6dbf, 0xa438, 0x8b57, 0xa438, 0x026c, ++ 0xa438, 0x4ee0, 0xa438, 0x8572, 0xa438, 0xe185, 0xa438, 0x73bf, ++ 0xa438, 0x8b5a, 0xa438, 0x026c, 0xa438, 0x4ee1, 0xa438, 0x8fb8, ++ 0xa438, 0x5900, 0xa438, 0xf728, 0xa438, 0xe58f, 0xa438, 0xb8af, ++ 0xa438, 0x8b2c, 0xa438, 0xe185, 0xa438, 0x791b, 0xa438, 0x21ad, ++ 0xa438, 0x373e, 0xa438, 0x1f44, 0xa438, 0xe085, 0xa438, 0x62e1, ++ 0xa438, 0x8563, 0xa438, 0xbf8b, 0xa438, 0x5102, 0xa438, 0x6c4e, ++ 0xa438, 0xe085, 0xa438, 0x68e1, 0xa438, 0x8569, 0xa438, 0xbf8b, ++ 0xa438, 0x5402, 0xa438, 0x6c4e, 0xa438, 0xe085, 0xa438, 0x6ee1, ++ 0xa438, 0x856f, 0xa438, 0xbf8b, 0xa438, 0x5702, 0xa438, 0x6c4e, ++ 0xa438, 0xe085, 0xa438, 0x74e1, 0xa438, 0x8575, 0xa438, 0xbf8b, ++ 0xa438, 0x5a02, 0xa438, 0x6c4e, 0xa438, 0xe18f, 0xa438, 0xb859, ++ 0xa438, 0x00f7, 0xa438, 0x28e5, 0xa438, 0x8fb8, 0xa438, 0xae4a, ++ 0xa438, 0x1f44, 0xa438, 0xe085, 0xa438, 0x64e1, 
0xa438, 0x8565, ++ 0xa438, 0xbf8b, 0xa438, 0x5102, 0xa438, 0x6c4e, 0xa438, 0xe085, ++ 0xa438, 0x6ae1, 0xa438, 0x856b, 0xa438, 0xbf8b, 0xa438, 0x5402, ++ 0xa438, 0x6c4e, 0xa438, 0xe085, 0xa438, 0x70e1, 0xa438, 0x8571, ++ 0xa438, 0xbf8b, 0xa438, 0x5702, 0xa438, 0x6c4e, 0xa438, 0xe085, ++ 0xa438, 0x76e1, 0xa438, 0x8577, 0xa438, 0xbf8b, 0xa438, 0x5a02, ++ 0xa438, 0x6c4e, 0xa438, 0xe18f, 0xa438, 0xb859, 0xa438, 0x00f7, ++ 0xa438, 0x28e5, 0xa438, 0x8fb8, 0xa438, 0xae0c, 0xa438, 0xe18f, ++ 0xa438, 0xb839, 0xa438, 0x04ac, 0xa438, 0x2f04, 0xa438, 0xee8f, ++ 0xa438, 0xb800, 0xa438, 0xfefd, 0xa438, 0xfc04, 0xa438, 0xf0ac, ++ 0xa438, 0x8efc, 0xa438, 0xac8c, 0xa438, 0xf0ac, 0xa438, 0xfaf0, ++ 0xa438, 0xacf8, 0xa438, 0xf0ac, 0xa438, 0xf6f0, 0xa438, 0xad00, ++ 0xa438, 0xf0ac, 0xa438, 0xfef0, 0xa438, 0xacfc, 0xa438, 0xf0ac, ++ 0xa438, 0xf4f0, 0xa438, 0xacf2, 0xa438, 0xf0ac, 0xa438, 0xf0f0, ++ 0xa438, 0xacb0, 0xa438, 0xf0ac, 0xa438, 0xaef0, 0xa438, 0xacac, ++ 0xa438, 0xf0ac, 0xa438, 0xaaf0, 0xa438, 0xacee, 0xa438, 0xf0b0, ++ 0xa438, 0x24f0, 0xa438, 0xb0a4, 0xa438, 0xf0b1, 0xa438, 0x24f0, ++ 0xa438, 0xb1a4, 0xa438, 0xee8f, 0xa438, 0xb800, 0xa438, 0xd400, ++ 0xa438, 0x00af, 0xa438, 0x3976, 0xa438, 0x66ac, 0xa438, 0xeabb, ++ 0xa438, 0xa430, 0xa438, 0x6e50, 0xa438, 0x6e53, 0xa438, 0x6e56, ++ 0xa438, 0x6e59, 0xa438, 0x6e5c, 0xa438, 0x6e5f, 0xa438, 0x6e62, ++ 0xa438, 0x6e65, 0xa438, 0xd9ac, 0xa438, 0x70f0, 0xa438, 0xac6a, ++ 0xa436, 0xb85e, 0xa438, 0x23b7, 0xa436, 0xb860, 0xa438, 0x74db, ++ 0xa436, 0xb862, 0xa438, 0x268c, 0xa436, 0xb864, 0xa438, 0x3FE5, ++ 0xa436, 0xb886, 0xa438, 0x2250, 0xa436, 0xb888, 0xa438, 0x140e, ++ 0xa436, 0xb88a, 0xa438, 0x3696, 0xa436, 0xb88c, 0xa438, 0x3973, ++ 0xa436, 0xb838, 0xa438, 0x00ff, 0xb820, 0x0010, 0xa436, 0x8464, ++ 0xa438, 0xaf84, 0xa438, 0x7caf, 0xa438, 0x8485, 0xa438, 0xaf85, ++ 0xa438, 0x13af, 0xa438, 0x851e, 0xa438, 0xaf85, 0xa438, 0xb9af, ++ 0xa438, 0x8684, 0xa438, 0xaf87, 0xa438, 0x01af, 0xa438, 0x8701, ++ 0xa438, 0xac38, 0xa438, 0x03af, 0xa438, 0x38bb, 0xa438, 0xaf38, ++ 0xa438, 0xc302, 0xa438, 0x4618, 0xa438, 0xbf85, 0xa438, 0x0a02, ++ 0xa438, 0x54b7, 0xa438, 0xbf85, 0xa438, 0x1002, 0xa438, 0x54c0, ++ 0xa438, 0xd400, 0xa438, 0x0fbf, 0xa438, 0x8507, 0xa438, 0x024f, ++ 0xa438, 0x48bf, 0xa438, 0x8504, 0xa438, 0x024f, 0xa438, 0x6759, ++ 0xa438, 0xf0a1, 0xa438, 0x3008, 0xa438, 0xbf85, 0xa438, 0x0d02, ++ 0xa438, 0x54c0, 0xa438, 0xae06, 0xa438, 0xbf85, 0xa438, 0x0d02, ++ 0xa438, 0x54b7, 0xa438, 0xbf85, 0xa438, 0x0402, 0xa438, 0x4f67, ++ 0xa438, 0xa183, 0xa438, 0x02ae, 0xa438, 0x15a1, 0xa438, 0x8502, ++ 0xa438, 0xae10, 0xa438, 0x59f0, 0xa438, 0xa180, 0xa438, 0x16bf, ++ 0xa438, 0x8501, 0xa438, 0x024f, 0xa438, 0x67a1, 0xa438, 0x381b, ++ 0xa438, 0xae0b, 0xa438, 0xe18f, 0xa438, 0xffbf, 0xa438, 0x84fe, ++ 0xa438, 0x024f, 0xa438, 0x48ae, 0xa438, 0x17bf, 0xa438, 0x84fe, ++ 0xa438, 0x0254, 0xa438, 0xb7bf, 0xa438, 0x84fb, 0xa438, 0x0254, ++ 0xa438, 0xb7ae, 0xa438, 0x09a1, 0xa438, 0x5006, 0xa438, 0xbf84, ++ 0xa438, 0xfb02, 0xa438, 0x54c0, 0xa438, 0xaf04, 0xa438, 0x4700, ++ 0xa438, 0xad34, 0xa438, 0xfdad, 0xa438, 0x0670, 0xa438, 0xae14, ++ 0xa438, 0xf0a6, 0xa438, 0x00b8, 0xa438, 0xbd32, 0xa438, 0x30bd, ++ 0xa438, 0x30aa, 0xa438, 0xbd2c, 0xa438, 0xccbd, 0xa438, 0x2ca1, ++ 0xa438, 0x0705, 0xa438, 0xec80, 0xa438, 0xaf40, 0xa438, 0xf7af, ++ 0xa438, 0x40f5, 0xa438, 0xd101, 0xa438, 0xbf85, 0xa438, 0xa402, ++ 0xa438, 0x4f48, 0xa438, 0xbf85, 0xa438, 0xa702, 0xa438, 0x54c0, ++ 0xa438, 0xd10f, 0xa438, 0xbf85, 0xa438, 0xaa02, 0xa438, 0x4f48, ++ 0xa438, 0x024d, 0xa438, 0x6abf, 0xa438, 0x85ad, 
0xa438, 0x024f, ++ 0xa438, 0x67bf, 0xa438, 0x8ff7, 0xa438, 0xddbf, 0xa438, 0x85b0, ++ 0xa438, 0x024f, 0xa438, 0x67bf, 0xa438, 0x8ff8, 0xa438, 0xddbf, ++ 0xa438, 0x85b3, 0xa438, 0x024f, 0xa438, 0x67bf, 0xa438, 0x8ff9, ++ 0xa438, 0xddbf, 0xa438, 0x85b6, 0xa438, 0x024f, 0xa438, 0x67bf, ++ 0xa438, 0x8ffa, 0xa438, 0xddd1, 0xa438, 0x00bf, 0xa438, 0x85aa, ++ 0xa438, 0x024f, 0xa438, 0x4802, 0xa438, 0x4d6a, 0xa438, 0xbf85, ++ 0xa438, 0xad02, 0xa438, 0x4f67, 0xa438, 0xbf8f, 0xa438, 0xfbdd, ++ 0xa438, 0xbf85, 0xa438, 0xb002, 0xa438, 0x4f67, 0xa438, 0xbf8f, ++ 0xa438, 0xfcdd, 0xa438, 0xbf85, 0xa438, 0xb302, 0xa438, 0x4f67, ++ 0xa438, 0xbf8f, 0xa438, 0xfddd, 0xa438, 0xbf85, 0xa438, 0xb602, ++ 0xa438, 0x4f67, 0xa438, 0xbf8f, 0xa438, 0xfedd, 0xa438, 0xbf85, ++ 0xa438, 0xa702, 0xa438, 0x54b7, 0xa438, 0xbf85, 0xa438, 0xa102, ++ 0xa438, 0x54b7, 0xa438, 0xaf3c, 0xa438, 0x2066, 0xa438, 0xb800, ++ 0xa438, 0xb8bd, 0xa438, 0x30ee, 0xa438, 0xbd2c, 0xa438, 0xb8bd, ++ 0xa438, 0x7040, 0xa438, 0xbd86, 0xa438, 0xc8bd, 0xa438, 0x8640, ++ 0xa438, 0xbd88, 0xa438, 0xc8bd, 0xa438, 0x8802, 0xa438, 0x1929, ++ 0xa438, 0xa202, 0xa438, 0x02ae, 0xa438, 0x03a2, 0xa438, 0x032e, ++ 0xa438, 0xd10f, 0xa438, 0xbf85, 0xa438, 0xaa02, 0xa438, 0x4f48, ++ 0xa438, 0xe18f, 0xa438, 0xf7bf, 0xa438, 0x85ad, 0xa438, 0x024f, ++ 0xa438, 0x48e1, 0xa438, 0x8ff8, 0xa438, 0xbf85, 0xa438, 0xb002, ++ 0xa438, 0x4f48, 0xa438, 0xe18f, 0xa438, 0xf9bf, 0xa438, 0x85b3, ++ 0xa438, 0x024f, 0xa438, 0x48e1, 0xa438, 0x8ffa, 0xa438, 0xbf85, ++ 0xa438, 0xb602, 0xa438, 0x4f48, 0xa438, 0xae2c, 0xa438, 0xd100, ++ 0xa438, 0xbf85, 0xa438, 0xaa02, 0xa438, 0x4f48, 0xa438, 0xe18f, ++ 0xa438, 0xfbbf, 0xa438, 0x85ad, 0xa438, 0x024f, 0xa438, 0x48e1, ++ 0xa438, 0x8ffc, 0xa438, 0xbf85, 0xa438, 0xb002, 0xa438, 0x4f48, ++ 0xa438, 0xe18f, 0xa438, 0xfdbf, 0xa438, 0x85b3, 0xa438, 0x024f, ++ 0xa438, 0x48e1, 0xa438, 0x8ffe, 0xa438, 0xbf85, 0xa438, 0xb602, ++ 0xa438, 0x4f48, 0xa438, 0xbf86, 0xa438, 0x7e02, 0xa438, 0x4f67, ++ 0xa438, 0xa100, 0xa438, 0x02ae, 0xa438, 0x25a1, 0xa438, 0x041d, ++ 0xa438, 0xe18f, 0xa438, 0xf1bf, 0xa438, 0x8675, 0xa438, 0x024f, ++ 0xa438, 0x48e1, 0xa438, 0x8ff2, 0xa438, 0xbf86, 0xa438, 0x7802, ++ 0xa438, 0x4f48, 0xa438, 0xe18f, 0xa438, 0xf3bf, 0xa438, 0x867b, ++ 0xa438, 0x024f, 0xa438, 0x48ae, 0xa438, 0x29a1, 0xa438, 0x070b, ++ 0xa438, 0xae24, 0xa438, 0xbf86, 0xa438, 0x8102, 0xa438, 0x4f67, ++ 0xa438, 0xad28, 0xa438, 0x1be1, 0xa438, 0x8ff4, 0xa438, 0xbf86, ++ 0xa438, 0x7502, 0xa438, 0x4f48, 0xa438, 0xe18f, 0xa438, 0xf5bf, ++ 0xa438, 0x8678, 0xa438, 0x024f, 0xa438, 0x48e1, 0xa438, 0x8ff6, ++ 0xa438, 0xbf86, 0xa438, 0x7b02, 0xa438, 0x4f48, 0xa438, 0xaf09, ++ 0xa438, 0x8420, 0xa438, 0xbc32, 0xa438, 0x20bc, 0xa438, 0x3e76, ++ 0xa438, 0xbc08, 0xa438, 0xfda6, 0xa438, 0x1a00, 0xa438, 0xb64e, ++ 0xa438, 0xd101, 0xa438, 0xbf85, 0xa438, 0xa402, 0xa438, 0x4f48, ++ 0xa438, 0xbf85, 0xa438, 0xa702, 0xa438, 0x54c0, 0xa438, 0xd10f, ++ 0xa438, 0xbf85, 0xa438, 0xaa02, 0xa438, 0x4f48, 0xa438, 0x024d, ++ 0xa438, 0x6abf, 0xa438, 0x85ad, 0xa438, 0x024f, 0xa438, 0x67bf, ++ 0xa438, 0x8ff7, 0xa438, 0xddbf, 0xa438, 0x85b0, 0xa438, 0x024f, ++ 0xa438, 0x67bf, 0xa438, 0x8ff8, 0xa438, 0xddbf, 0xa438, 0x85b3, ++ 0xa438, 0x024f, 0xa438, 0x67bf, 0xa438, 0x8ff9, 0xa438, 0xddbf, ++ 0xa438, 0x85b6, 0xa438, 0x024f, 0xa438, 0x67bf, 0xa438, 0x8ffa, ++ 0xa438, 0xddd1, 0xa438, 0x00bf, 0xa438, 0x85aa, 0xa438, 0x024f, ++ 0xa438, 0x4802, 0xa438, 0x4d6a, 0xa438, 0xbf85, 0xa438, 0xad02, ++ 0xa438, 0x4f67, 0xa438, 0xbf8f, 0xa438, 0xfbdd, 0xa438, 0xbf85, ++ 0xa438, 0xb002, 0xa438, 0x4f67, 0xa438, 0xbf8f, 
0xa438, 0xfcdd, ++ 0xa438, 0xbf85, 0xa438, 0xb302, 0xa438, 0x4f67, 0xa438, 0xbf8f, ++ 0xa438, 0xfddd, 0xa438, 0xbf85, 0xa438, 0xb602, 0xa438, 0x4f67, ++ 0xa438, 0xbf8f, 0xa438, 0xfedd, 0xa438, 0xbf85, 0xa438, 0xa702, ++ 0xa438, 0x54b7, 0xa438, 0xaf00, 0xa438, 0x8800, 0xa436, 0xb818, ++ 0xa438, 0x38b8, 0xa436, 0xb81a, 0xa438, 0x0444, 0xa436, 0xb81c, ++ 0xa438, 0x40ee, 0xa436, 0xb81e, 0xa438, 0x3C1A, 0xa436, 0xb850, ++ 0xa438, 0x0981, 0xa436, 0xb852, 0xa438, 0x0085, 0xa436, 0xb878, ++ 0xa438, 0xffff, 0xa436, 0xb884, 0xa438, 0xffff, 0xa436, 0xb832, ++ 0xa438, 0x003f, 0xa436, 0x0000, 0xa438, 0x0000, 0xa436, 0xB82E, ++ 0xa438, 0x0000, 0xa436, 0x8024, 0xa438, 0x0000, 0xb820, 0x0000, ++ 0xa436, 0x801E, 0xa438, 0x0021, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125b_2[] = { ++ 0xa436, 0x8024, 0xa438, 0x3701, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x801a, 0xa438, 0x1800, 0xa438, 0x803f, ++ 0xa438, 0x1800, 0xa438, 0x8045, 0xa438, 0x1800, 0xa438, 0x8067, ++ 0xa438, 0x1800, 0xa438, 0x806d, 0xa438, 0x1800, 0xa438, 0x8071, ++ 0xa438, 0x1800, 0xa438, 0x80b1, 0xa438, 0xd093, 0xa438, 0xd1c4, ++ 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd704, 0xa438, 0x5fbc, ++ 0xa438, 0xd504, 0xa438, 0xc9f1, 0xa438, 0x1800, 0xa438, 0x0fc9, ++ 0xa438, 0xbb50, 0xa438, 0xd505, 0xa438, 0xa202, 0xa438, 0xd504, ++ 0xa438, 0x8c0f, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1519, ++ 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd75e, 0xa438, 0x5fae, ++ 0xa438, 0x9b50, 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd75e, ++ 0xa438, 0x7fae, 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd707, ++ 0xa438, 0x40a7, 0xa438, 0xd719, 0xa438, 0x4071, 0xa438, 0x1800, ++ 0xa438, 0x1557, 0xa438, 0xd719, 0xa438, 0x2f70, 0xa438, 0x803b, ++ 0xa438, 0x2f73, 0xa438, 0x156a, 0xa438, 0x5e70, 0xa438, 0x1800, ++ 0xa438, 0x155d, 0xa438, 0xd505, 0xa438, 0xa202, 0xa438, 0xd500, ++ 0xa438, 0xffed, 0xa438, 0xd709, 0xa438, 0x4054, 0xa438, 0xa788, ++ 0xa438, 0xd70b, 0xa438, 0x1800, 0xa438, 0x172a, 0xa438, 0xc0c1, ++ 0xa438, 0xc0c0, 0xa438, 0xd05a, 0xa438, 0xd1ba, 0xa438, 0xd701, ++ 0xa438, 0x2529, 0xa438, 0x022a, 0xa438, 0xd0a7, 0xa438, 0xd1b9, ++ 0xa438, 0xa208, 0xa438, 0x1000, 0xa438, 0x080e, 0xa438, 0xd701, ++ 0xa438, 0x408b, 0xa438, 0x1000, 0xa438, 0x0a65, 0xa438, 0xf003, ++ 0xa438, 0x1000, 0xa438, 0x0a6b, 0xa438, 0xd701, 0xa438, 0x1000, ++ 0xa438, 0x0920, 0xa438, 0x1000, 0xa438, 0x0915, 0xa438, 0x1000, ++ 0xa438, 0x0909, 0xa438, 0x228f, 0xa438, 0x804e, 0xa438, 0x9801, ++ 0xa438, 0xd71e, 0xa438, 0x5d61, 0xa438, 0xd701, 0xa438, 0x1800, ++ 0xa438, 0x022a, 0xa438, 0x2005, 0xa438, 0x091a, 0xa438, 0x3bd9, ++ 0xa438, 0x0919, 0xa438, 0x1800, 0xa438, 0x0916, 0xa438, 0xd090, ++ 0xa438, 0xd1c9, 0xa438, 0x1800, 0xa438, 0x1064, 0xa438, 0xd096, ++ 0xa438, 0xd1a9, 0xa438, 0xd503, 0xa438, 0xa104, 0xa438, 0x0c07, ++ 0xa438, 0x0902, 0xa438, 0xd500, 0xa438, 0xbc10, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa201, 0xa438, 0x8201, 0xa438, 0xce00, ++ 0xa438, 0xd500, 0xa438, 0xc484, 0xa438, 0xd503, 0xa438, 0xcc02, ++ 0xa438, 0xcd0d, 0xa438, 0xaf01, 0xa438, 0xd500, 0xa438, 0xd703, ++ 0xa438, 0x4371, 0xa438, 0xbd08, 0xa438, 0x1000, 0xa438, 0x135c, ++ 0xa438, 0xd75e, 0xa438, 0x5fb3, 0xa438, 0xd503, 0xa438, 0xd0f5, ++ 0xa438, 0xd1c6, 0xa438, 0x0cf0, 0xa438, 0x0e50, 0xa438, 0xd704, ++ 0xa438, 0x401c, 0xa438, 0xd0f5, 0xa438, 0xd1c6, 0xa438, 0x0cf0, ++ 0xa438, 0x0ea0, 0xa438, 0x401c, 0xa438, 0xd07b, 0xa438, 0xd1c5, ++ 0xa438, 
0x8ef0, 0xa438, 0x401c, 0xa438, 0x9d08, 0xa438, 0x1000, ++ 0xa438, 0x135c, 0xa438, 0xd75e, 0xa438, 0x7fb3, 0xa438, 0x1000, ++ 0xa438, 0x135c, 0xa438, 0xd75e, 0xa438, 0x5fad, 0xa438, 0x1000, ++ 0xa438, 0x14c5, 0xa438, 0xd703, 0xa438, 0x3181, 0xa438, 0x80af, ++ 0xa438, 0x60ad, 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd703, ++ 0xa438, 0x5fba, 0xa438, 0x1800, 0xa438, 0x0cc7, 0xa438, 0xa802, ++ 0xa438, 0xa301, 0xa438, 0xa801, 0xa438, 0xc004, 0xa438, 0xd710, ++ 0xa438, 0x4000, 0xa438, 0x1800, 0xa438, 0x1e79, 0xa436, 0xA026, ++ 0xa438, 0x1e78, 0xa436, 0xA024, 0xa438, 0x0c93, 0xa436, 0xA022, ++ 0xa438, 0x1062, 0xa436, 0xA020, 0xa438, 0x0915, 0xa436, 0xA006, ++ 0xa438, 0x020a, 0xa436, 0xA004, 0xa438, 0x1726, 0xa436, 0xA002, ++ 0xa438, 0x1542, 0xa436, 0xA000, 0xa438, 0x0fc7, 0xa436, 0xA008, ++ 0xa438, 0xff00, 0xa436, 0xA016, 0xa438, 0x0010, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x801d, 0xa438, 0x1800, 0xa438, 0x802c, ++ 0xa438, 0x1800, 0xa438, 0x802c, 0xa438, 0x1800, 0xa438, 0x802c, ++ 0xa438, 0x1800, 0xa438, 0x802c, 0xa438, 0x1800, 0xa438, 0x802c, ++ 0xa438, 0x1800, 0xa438, 0x802c, 0xa438, 0xd700, 0xa438, 0x6090, ++ 0xa438, 0x60d1, 0xa438, 0xc95c, 0xa438, 0xf007, 0xa438, 0x60b1, ++ 0xa438, 0xc95a, 0xa438, 0xf004, 0xa438, 0xc956, 0xa438, 0xf002, ++ 0xa438, 0xc94e, 0xa438, 0x1800, 0xa438, 0x00cd, 0xa438, 0xd700, ++ 0xa438, 0x6090, 0xa438, 0x60d1, 0xa438, 0xc95c, 0xa438, 0xf007, ++ 0xa438, 0x60b1, 0xa438, 0xc95a, 0xa438, 0xf004, 0xa438, 0xc956, ++ 0xa438, 0xf002, 0xa438, 0xc94e, 0xa438, 0x1000, 0xa438, 0x022a, ++ 0xa438, 0x1800, 0xa438, 0x0132, 0xa436, 0xA08E, 0xa438, 0xffff, ++ 0xa436, 0xA08C, 0xa438, 0xffff, 0xa436, 0xA08A, 0xa438, 0xffff, ++ 0xa436, 0xA088, 0xa438, 0xffff, 0xa436, 0xA086, 0xa438, 0xffff, ++ 0xa436, 0xA084, 0xa438, 0xffff, 0xa436, 0xA082, 0xa438, 0x012f, ++ 0xa436, 0xA080, 0xa438, 0x00cc, 0xa436, 0xA090, 0xa438, 0x0103, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x8020, 0xa438, 0x1800, 0xa438, 0x802a, 0xa438, 0x1800, ++ 0xa438, 0x8035, 0xa438, 0x1800, 0xa438, 0x803c, 0xa438, 0x1800, ++ 0xa438, 0x803c, 0xa438, 0x1800, 0xa438, 0x803c, 0xa438, 0x1800, ++ 0xa438, 0x803c, 0xa438, 0xd107, 0xa438, 0xd042, 0xa438, 0xa404, ++ 0xa438, 0x1000, 0xa438, 0x09df, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0x8280, 0xa438, 0xd700, 0xa438, 0x6065, 0xa438, 0xd125, ++ 0xa438, 0xf002, 0xa438, 0xd12b, 0xa438, 0xd040, 0xa438, 0x1800, ++ 0xa438, 0x077f, 0xa438, 0x0cf0, 0xa438, 0x0c50, 0xa438, 0xd104, ++ 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x0aa8, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0x1800, 0xa438, 0x0a2e, 0xa438, 0xcb9b, ++ 0xa438, 0xd110, 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x0b7b, ++ 0xa438, 0x1000, 0xa438, 0x09df, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0x1800, 0xa438, 0x081b, 0xa438, 0x1000, 0xa438, 0x09df, ++ 0xa438, 0xd704, 0xa438, 0x7fb8, 0xa438, 0xa718, 0xa438, 0x1800, ++ 0xa438, 0x074e, 0xa436, 0xA10E, 0xa438, 0xffff, 0xa436, 0xA10C, ++ 0xa438, 0xffff, 0xa436, 0xA10A, 0xa438, 0xffff, 0xa436, 0xA108, ++ 0xa438, 0xffff, 0xa436, 0xA106, 0xa438, 0x074d, 0xa436, 0xA104, ++ 0xa438, 0x0818, 0xa436, 0xA102, 0xa438, 0x0a2c, 0xa436, 0xA100, ++ 0xa438, 0x077e, 0xa436, 0xA110, 0xa438, 0x000f, 0xa436, 0xb87c, ++ 0xa438, 0x8625, 0xa436, 0xb87e, 0xa438, 0xaf86, 0xa438, 0x3daf, ++ 0xa438, 0x8689, 0xa438, 0xaf88, 0xa438, 0x69af, 0xa438, 0x8887, ++ 0xa438, 0xaf88, 0xa438, 0x9caf, 0xa438, 0x88be, 0xa438, 0xaf88, ++ 0xa438, 
0xbeaf, 0xa438, 0x88be, 0xa438, 0xbf86, 0xa438, 0x49d7, ++ 0xa438, 0x0040, 0xa438, 0x0277, 0xa438, 0x7daf, 0xa438, 0x2727, ++ 0xa438, 0x0000, 0xa438, 0x7205, 0xa438, 0x0000, 0xa438, 0x7208, ++ 0xa438, 0x0000, 0xa438, 0x71f3, 0xa438, 0x0000, 0xa438, 0x71f6, ++ 0xa438, 0x0000, 0xa438, 0x7229, 0xa438, 0x0000, 0xa438, 0x722c, ++ 0xa438, 0x0000, 0xa438, 0x7217, 0xa438, 0x0000, 0xa438, 0x721a, ++ 0xa438, 0x0000, 0xa438, 0x721d, 0xa438, 0x0000, 0xa438, 0x7211, ++ 0xa438, 0x0000, 0xa438, 0x7220, 0xa438, 0x0000, 0xa438, 0x7214, ++ 0xa438, 0x0000, 0xa438, 0x722f, 0xa438, 0x0000, 0xa438, 0x7223, ++ 0xa438, 0x0000, 0xa438, 0x7232, 0xa438, 0x0000, 0xa438, 0x7226, ++ 0xa438, 0xf8f9, 0xa438, 0xfae0, 0xa438, 0x85b3, 0xa438, 0x3802, ++ 0xa438, 0xad27, 0xa438, 0x02ae, 0xa438, 0x03af, 0xa438, 0x8830, ++ 0xa438, 0x1f66, 0xa438, 0xef65, 0xa438, 0xbfc2, 0xa438, 0x1f1a, ++ 0xa438, 0x96f7, 0xa438, 0x05ee, 0xa438, 0xffd2, 0xa438, 0x00da, ++ 0xa438, 0xf605, 0xa438, 0xbfc2, 0xa438, 0x2f1a, 0xa438, 0x96f7, ++ 0xa438, 0x05ee, 0xa438, 0xffd2, 0xa438, 0x00db, 0xa438, 0xf605, ++ 0xa438, 0xef02, 0xa438, 0x1f11, 0xa438, 0x0d42, 0xa438, 0xbf88, ++ 0xa438, 0x4202, 0xa438, 0x6e7d, 0xa438, 0xef02, 0xa438, 0x1b03, ++ 0xa438, 0x1f11, 0xa438, 0x0d42, 0xa438, 0xbf88, 0xa438, 0x4502, ++ 0xa438, 0x6e7d, 0xa438, 0xef02, 0xa438, 0x1a03, 0xa438, 0x1f11, ++ 0xa438, 0x0d42, 0xa438, 0xbf88, 0xa438, 0x4802, 0xa438, 0x6e7d, ++ 0xa438, 0xbfc2, 0xa438, 0x3f1a, 0xa438, 0x96f7, 0xa438, 0x05ee, ++ 0xa438, 0xffd2, 0xa438, 0x00da, 0xa438, 0xf605, 0xa438, 0xbfc2, ++ 0xa438, 0x4f1a, 0xa438, 0x96f7, 0xa438, 0x05ee, 0xa438, 0xffd2, ++ 0xa438, 0x00db, 0xa438, 0xf605, 0xa438, 0xef02, 0xa438, 0x1f11, ++ 0xa438, 0x0d42, 0xa438, 0xbf88, 0xa438, 0x4b02, 0xa438, 0x6e7d, ++ 0xa438, 0xef02, 0xa438, 0x1b03, 0xa438, 0x1f11, 0xa438, 0x0d42, ++ 0xa438, 0xbf88, 0xa438, 0x4e02, 0xa438, 0x6e7d, 0xa438, 0xef02, ++ 0xa438, 0x1a03, 0xa438, 0x1f11, 0xa438, 0x0d42, 0xa438, 0xbf88, ++ 0xa438, 0x5102, 0xa438, 0x6e7d, 0xa438, 0xef56, 0xa438, 0xd020, ++ 0xa438, 0x1f11, 0xa438, 0xbf88, 0xa438, 0x5402, 0xa438, 0x6e7d, ++ 0xa438, 0xbf88, 0xa438, 0x5702, 0xa438, 0x6e7d, 0xa438, 0xbf88, ++ 0xa438, 0x5a02, 0xa438, 0x6e7d, 0xa438, 0xe185, 0xa438, 0xa0ef, ++ 0xa438, 0x0348, 0xa438, 0x0a28, 0xa438, 0x05ef, 0xa438, 0x201b, ++ 0xa438, 0x01ad, 0xa438, 0x2735, 0xa438, 0x1f44, 0xa438, 0xe085, ++ 0xa438, 0x88e1, 0xa438, 0x8589, 0xa438, 0xbf88, 0xa438, 0x5d02, ++ 0xa438, 0x6e7d, 0xa438, 0xe085, 0xa438, 0x8ee1, 0xa438, 0x858f, ++ 0xa438, 0xbf88, 0xa438, 0x6002, 0xa438, 0x6e7d, 0xa438, 0xe085, ++ 0xa438, 0x94e1, 0xa438, 0x8595, 0xa438, 0xbf88, 0xa438, 0x6302, ++ 0xa438, 0x6e7d, 0xa438, 0xe085, 0xa438, 0x9ae1, 0xa438, 0x859b, ++ 0xa438, 0xbf88, 0xa438, 0x6602, 0xa438, 0x6e7d, 0xa438, 0xaf88, ++ 0xa438, 0x3cbf, 0xa438, 0x883f, 0xa438, 0x026e, 0xa438, 0x9cad, ++ 0xa438, 0x2835, 0xa438, 0x1f44, 0xa438, 0xe08f, 0xa438, 0xf8e1, ++ 0xa438, 0x8ff9, 0xa438, 0xbf88, 0xa438, 0x5d02, 0xa438, 0x6e7d, ++ 0xa438, 0xe08f, 0xa438, 0xfae1, 0xa438, 0x8ffb, 0xa438, 0xbf88, ++ 0xa438, 0x6002, 0xa438, 0x6e7d, 0xa438, 0xe08f, 0xa438, 0xfce1, ++ 0xa438, 0x8ffd, 0xa438, 0xbf88, 0xa438, 0x6302, 0xa438, 0x6e7d, ++ 0xa438, 0xe08f, 0xa438, 0xfee1, 0xa438, 0x8fff, 0xa438, 0xbf88, ++ 0xa438, 0x6602, 0xa438, 0x6e7d, 0xa438, 0xaf88, 0xa438, 0x3ce1, ++ 0xa438, 0x85a1, 0xa438, 0x1b21, 0xa438, 0xad37, 0xa438, 0x341f, ++ 0xa438, 0x44e0, 0xa438, 0x858a, 0xa438, 0xe185, 0xa438, 0x8bbf, ++ 0xa438, 0x885d, 0xa438, 0x026e, 0xa438, 0x7de0, 0xa438, 0x8590, ++ 0xa438, 0xe185, 0xa438, 0x91bf, 0xa438, 0x8860, 0xa438, 0x026e, ++ 0xa438, 
0x7de0, 0xa438, 0x8596, 0xa438, 0xe185, 0xa438, 0x97bf, ++ 0xa438, 0x8863, 0xa438, 0x026e, 0xa438, 0x7de0, 0xa438, 0x859c, ++ 0xa438, 0xe185, 0xa438, 0x9dbf, 0xa438, 0x8866, 0xa438, 0x026e, ++ 0xa438, 0x7dae, 0xa438, 0x401f, 0xa438, 0x44e0, 0xa438, 0x858c, ++ 0xa438, 0xe185, 0xa438, 0x8dbf, 0xa438, 0x885d, 0xa438, 0x026e, ++ 0xa438, 0x7de0, 0xa438, 0x8592, 0xa438, 0xe185, 0xa438, 0x93bf, ++ 0xa438, 0x8860, 0xa438, 0x026e, 0xa438, 0x7de0, 0xa438, 0x8598, ++ 0xa438, 0xe185, 0xa438, 0x99bf, 0xa438, 0x8863, 0xa438, 0x026e, ++ 0xa438, 0x7de0, 0xa438, 0x859e, 0xa438, 0xe185, 0xa438, 0x9fbf, ++ 0xa438, 0x8866, 0xa438, 0x026e, 0xa438, 0x7dae, 0xa438, 0x0ce1, ++ 0xa438, 0x85b3, 0xa438, 0x3904, 0xa438, 0xac2f, 0xa438, 0x04ee, ++ 0xa438, 0x85b3, 0xa438, 0x00af, 0xa438, 0x39d9, 0xa438, 0x22ac, ++ 0xa438, 0xeaf0, 0xa438, 0xacf6, 0xa438, 0xf0ac, 0xa438, 0xfaf0, ++ 0xa438, 0xacf8, 0xa438, 0xf0ac, 0xa438, 0xfcf0, 0xa438, 0xad00, ++ 0xa438, 0xf0ac, 0xa438, 0xfef0, 0xa438, 0xacf0, 0xa438, 0xf0ac, ++ 0xa438, 0xf4f0, 0xa438, 0xacf2, 0xa438, 0xf0ac, 0xa438, 0xb0f0, ++ 0xa438, 0xacae, 0xa438, 0xf0ac, 0xa438, 0xacf0, 0xa438, 0xacaa, ++ 0xa438, 0xa100, 0xa438, 0x0ce1, 0xa438, 0x8ff7, 0xa438, 0xbf88, ++ 0xa438, 0x8402, 0xa438, 0x6e7d, 0xa438, 0xaf26, 0xa438, 0xe9e1, ++ 0xa438, 0x8ff6, 0xa438, 0xbf88, 0xa438, 0x8402, 0xa438, 0x6e7d, ++ 0xa438, 0xaf26, 0xa438, 0xf520, 0xa438, 0xac86, 0xa438, 0xbf88, ++ 0xa438, 0x3f02, 0xa438, 0x6e9c, 0xa438, 0xad28, 0xa438, 0x03af, ++ 0xa438, 0x3324, 0xa438, 0xad38, 0xa438, 0x03af, 0xa438, 0x32e6, ++ 0xa438, 0xaf32, 0xa438, 0xfbee, 0xa438, 0x826a, 0xa438, 0x0002, ++ 0xa438, 0x88a6, 0xa438, 0xaf04, 0xa438, 0x78f8, 0xa438, 0xfaef, ++ 0xa438, 0x69e0, 0xa438, 0x8015, 0xa438, 0xad20, 0xa438, 0x06bf, ++ 0xa438, 0x88bb, 0xa438, 0x0275, 0xa438, 0xb1ef, 0xa438, 0x96fe, ++ 0xa438, 0xfc04, 0xa438, 0x00b8, 0xa438, 0x7a00, 0xa436, 0xb87c, ++ 0xa438, 0x8ff6, 0xa436, 0xb87e, 0xa438, 0x0705, 0xa436, 0xb87c, ++ 0xa438, 0x8ff8, 0xa436, 0xb87e, 0xa438, 0x19cc, 0xa436, 0xb87c, ++ 0xa438, 0x8ffa, 0xa436, 0xb87e, 0xa438, 0x28e3, 0xa436, 0xb87c, ++ 0xa438, 0x8ffc, 0xa436, 0xb87e, 0xa438, 0x1047, 0xa436, 0xb87c, ++ 0xa438, 0x8ffe, 0xa436, 0xb87e, 0xa438, 0x0a45, 0xa436, 0xb85e, ++ 0xa438, 0x271E, 0xa436, 0xb860, 0xa438, 0x3846, 0xa436, 0xb862, ++ 0xa438, 0x26E6, 0xa436, 0xb864, 0xa438, 0x32E3, 0xa436, 0xb886, ++ 0xa438, 0x0474, 0xa436, 0xb888, 0xa438, 0xffff, 0xa436, 0xb88a, ++ 0xa438, 0xffff, 0xa436, 0xb88c, 0xa438, 0xffff, 0xa436, 0xb838, ++ 0xa438, 0x001f, 0xb820, 0x0010, 0xa436, 0x846e, 0xa438, 0xaf84, ++ 0xa438, 0x86af, 0xa438, 0x8690, 0xa438, 0xaf86, 0xa438, 0xa4af, ++ 0xa438, 0x8934, 0xa438, 0xaf89, 0xa438, 0x60af, 0xa438, 0x897e, ++ 0xa438, 0xaf89, 0xa438, 0xa9af, 0xa438, 0x89a9, 0xa438, 0xee82, ++ 0xa438, 0x5f00, 0xa438, 0x0284, 0xa438, 0x90af, 0xa438, 0x0441, ++ 0xa438, 0xf8e0, 0xa438, 0x8ff3, 0xa438, 0xa000, 0xa438, 0x0502, ++ 0xa438, 0x84a4, 0xa438, 0xae06, 0xa438, 0xa001, 0xa438, 0x0302, ++ 0xa438, 0x84c8, 0xa438, 0xfc04, 0xa438, 0xf8f9, 0xa438, 0xef59, ++ 0xa438, 0xe080, 0xa438, 0x15ad, 0xa438, 0x2702, 0xa438, 0xae03, ++ 0xa438, 0xaf84, 0xa438, 0xc3bf, 0xa438, 0x53ca, 0xa438, 0x0252, ++ 0xa438, 0xc8ad, 0xa438, 0x2807, 0xa438, 0x0285, 0xa438, 0x2cee, ++ 0xa438, 0x8ff3, 0xa438, 0x01ef, 0xa438, 0x95fd, 0xa438, 0xfc04, ++ 0xa438, 0xf8f9, 0xa438, 0xfaef, 0xa438, 0x69bf, 0xa438, 0x53ca, ++ 0xa438, 0x0252, 0xa438, 0xc8ac, 0xa438, 0x2822, 0xa438, 0xd480, ++ 0xa438, 0x00bf, 0xa438, 0x8684, 0xa438, 0x0252, 0xa438, 0xa9bf, ++ 0xa438, 0x8687, 0xa438, 0x0252, 0xa438, 0xa9bf, 0xa438, 0x868a, ++ 0xa438, 
0x0252, 0xa438, 0xa9bf, 0xa438, 0x868d, 0xa438, 0x0252, ++ 0xa438, 0xa9ee, 0xa438, 0x8ff3, 0xa438, 0x00af, 0xa438, 0x8526, ++ 0xa438, 0xe08f, 0xa438, 0xf4e1, 0xa438, 0x8ff5, 0xa438, 0xe28f, ++ 0xa438, 0xf6e3, 0xa438, 0x8ff7, 0xa438, 0x1b45, 0xa438, 0xac27, ++ 0xa438, 0x0eee, 0xa438, 0x8ff4, 0xa438, 0x00ee, 0xa438, 0x8ff5, ++ 0xa438, 0x0002, 0xa438, 0x852c, 0xa438, 0xaf85, 0xa438, 0x26e0, ++ 0xa438, 0x8ff4, 0xa438, 0xe18f, 0xa438, 0xf52c, 0xa438, 0x0001, ++ 0xa438, 0xe48f, 0xa438, 0xf4e5, 0xa438, 0x8ff5, 0xa438, 0xef96, ++ 0xa438, 0xfefd, 0xa438, 0xfc04, 0xa438, 0xf8f9, 0xa438, 0xef59, ++ 0xa438, 0xbf53, 0xa438, 0x2202, 0xa438, 0x52c8, 0xa438, 0xa18b, ++ 0xa438, 0x02ae, 0xa438, 0x03af, 0xa438, 0x85da, 0xa438, 0xbf57, ++ 0xa438, 0x7202, 0xa438, 0x52c8, 0xa438, 0xe48f, 0xa438, 0xf8e5, ++ 0xa438, 0x8ff9, 0xa438, 0xbf57, 0xa438, 0x7502, 0xa438, 0x52c8, ++ 0xa438, 0xe48f, 0xa438, 0xfae5, 0xa438, 0x8ffb, 0xa438, 0xbf57, ++ 0xa438, 0x7802, 0xa438, 0x52c8, 0xa438, 0xe48f, 0xa438, 0xfce5, ++ 0xa438, 0x8ffd, 0xa438, 0xbf57, 0xa438, 0x7b02, 0xa438, 0x52c8, ++ 0xa438, 0xe48f, 0xa438, 0xfee5, 0xa438, 0x8fff, 0xa438, 0xbf57, ++ 0xa438, 0x6c02, 0xa438, 0x52c8, 0xa438, 0xa102, 0xa438, 0x13ee, ++ 0xa438, 0x8ffc, 0xa438, 0x80ee, 0xa438, 0x8ffd, 0xa438, 0x00ee, ++ 0xa438, 0x8ffe, 0xa438, 0x80ee, 0xa438, 0x8fff, 0xa438, 0x00af, ++ 0xa438, 0x8599, 0xa438, 0xa101, 0xa438, 0x0cbf, 0xa438, 0x534c, ++ 0xa438, 0x0252, 0xa438, 0xc8a1, 0xa438, 0x0303, 0xa438, 0xaf85, ++ 0xa438, 0x77bf, 0xa438, 0x5322, 0xa438, 0x0252, 0xa438, 0xc8a1, ++ 0xa438, 0x8b02, 0xa438, 0xae03, 0xa438, 0xaf86, 0xa438, 0x64e0, ++ 0xa438, 0x8ff8, 0xa438, 0xe18f, 0xa438, 0xf9bf, 0xa438, 0x8684, ++ 0xa438, 0x0252, 0xa438, 0xa9e0, 0xa438, 0x8ffa, 0xa438, 0xe18f, ++ 0xa438, 0xfbbf, 0xa438, 0x8687, 0xa438, 0x0252, 0xa438, 0xa9e0, ++ 0xa438, 0x8ffc, 0xa438, 0xe18f, 0xa438, 0xfdbf, 0xa438, 0x868a, ++ 0xa438, 0x0252, 0xa438, 0xa9e0, 0xa438, 0x8ffe, 0xa438, 0xe18f, ++ 0xa438, 0xffbf, 0xa438, 0x868d, 0xa438, 0x0252, 0xa438, 0xa9af, ++ 0xa438, 0x867f, 0xa438, 0xbf53, 0xa438, 0x2202, 0xa438, 0x52c8, ++ 0xa438, 0xa144, 0xa438, 0x3cbf, 0xa438, 0x547b, 0xa438, 0x0252, ++ 0xa438, 0xc8e4, 0xa438, 0x8ff8, 0xa438, 0xe58f, 0xa438, 0xf9bf, ++ 0xa438, 0x547e, 0xa438, 0x0252, 0xa438, 0xc8e4, 0xa438, 0x8ffa, ++ 0xa438, 0xe58f, 0xa438, 0xfbbf, 0xa438, 0x5481, 0xa438, 0x0252, ++ 0xa438, 0xc8e4, 0xa438, 0x8ffc, 0xa438, 0xe58f, 0xa438, 0xfdbf, ++ 0xa438, 0x5484, 0xa438, 0x0252, 0xa438, 0xc8e4, 0xa438, 0x8ffe, ++ 0xa438, 0xe58f, 0xa438, 0xffbf, 0xa438, 0x5322, 0xa438, 0x0252, ++ 0xa438, 0xc8a1, 0xa438, 0x4448, 0xa438, 0xaf85, 0xa438, 0xa7bf, ++ 0xa438, 0x5322, 0xa438, 0x0252, 0xa438, 0xc8a1, 0xa438, 0x313c, ++ 0xa438, 0xbf54, 0xa438, 0x7b02, 0xa438, 0x52c8, 0xa438, 0xe48f, ++ 0xa438, 0xf8e5, 0xa438, 0x8ff9, 0xa438, 0xbf54, 0xa438, 0x7e02, ++ 0xa438, 0x52c8, 0xa438, 0xe48f, 0xa438, 0xfae5, 0xa438, 0x8ffb, ++ 0xa438, 0xbf54, 0xa438, 0x8102, 0xa438, 0x52c8, 0xa438, 0xe48f, ++ 0xa438, 0xfce5, 0xa438, 0x8ffd, 0xa438, 0xbf54, 0xa438, 0x8402, ++ 0xa438, 0x52c8, 0xa438, 0xe48f, 0xa438, 0xfee5, 0xa438, 0x8fff, ++ 0xa438, 0xbf53, 0xa438, 0x2202, 0xa438, 0x52c8, 0xa438, 0xa131, ++ 0xa438, 0x03af, 0xa438, 0x85a7, 0xa438, 0xd480, 0xa438, 0x00bf, ++ 0xa438, 0x8684, 0xa438, 0x0252, 0xa438, 0xa9bf, 0xa438, 0x8687, ++ 0xa438, 0x0252, 0xa438, 0xa9bf, 0xa438, 0x868a, 0xa438, 0x0252, ++ 0xa438, 0xa9bf, 0xa438, 0x868d, 0xa438, 0x0252, 0xa438, 0xa9ef, ++ 0xa438, 0x95fd, 0xa438, 0xfc04, 0xa438, 0xf0d1, 0xa438, 0x2af0, ++ 0xa438, 0xd12c, 0xa438, 0xf0d1, 0xa438, 0x44f0, 0xa438, 0xd146, ++ 0xa438, 
0xbf86, 0xa438, 0xa102, 0xa438, 0x52c8, 0xa438, 0xbf86, ++ 0xa438, 0xa102, 0xa438, 0x52c8, 0xa438, 0xd101, 0xa438, 0xaf06, ++ 0xa438, 0xa570, 0xa438, 0xce42, 0xa438, 0xee83, 0xa438, 0xc800, ++ 0xa438, 0x0286, 0xa438, 0xba02, 0xa438, 0x8728, 0xa438, 0x0287, ++ 0xa438, 0xbe02, 0xa438, 0x87f9, 0xa438, 0x0288, 0xa438, 0xc3af, ++ 0xa438, 0x4771, 0xa438, 0xf8f9, 0xa438, 0xfafb, 0xa438, 0xef69, ++ 0xa438, 0xfae0, 0xa438, 0x8015, 0xa438, 0xad25, 0xa438, 0x45d2, ++ 0xa438, 0x0002, 0xa438, 0x8714, 0xa438, 0xac4f, 0xa438, 0x02ae, ++ 0xa438, 0x0bef, 0xa438, 0x46f6, 0xa438, 0x273c, 0xa438, 0x0400, ++ 0xa438, 0xab26, 0xa438, 0xae30, 0xa438, 0xe08f, 0xa438, 0xe9e1, ++ 0xa438, 0x8fea, 0xa438, 0x1b46, 0xa438, 0xab26, 0xa438, 0xef32, ++ 0xa438, 0x0c31, 0xa438, 0xbf8f, 0xa438, 0xe91a, 0xa438, 0x93d8, ++ 0xa438, 0x19d9, 0xa438, 0x1b46, 0xa438, 0xab0a, 0xa438, 0x19d8, ++ 0xa438, 0x19d9, 0xa438, 0x1b46, 0xa438, 0xaa02, 0xa438, 0xae0c, ++ 0xa438, 0xbf57, 0xa438, 0x1202, 0xa438, 0x58b1, 0xa438, 0xbf57, ++ 0xa438, 0x1202, 0xa438, 0x58a8, 0xa438, 0xfeef, 0xa438, 0x96ff, ++ 0xa438, 0xfefd, 0xa438, 0xfc04, 0xa438, 0xf8fb, 0xa438, 0xef79, ++ 0xa438, 0xa200, 0xa438, 0x08bf, 0xa438, 0x892e, 0xa438, 0x0252, ++ 0xa438, 0xc8ef, 0xa438, 0x64ef, 0xa438, 0x97ff, 0xa438, 0xfc04, ++ 0xa438, 0xf8f9, 0xa438, 0xfafb, 0xa438, 0xef69, 0xa438, 0xfae0, ++ 0xa438, 0x8015, 0xa438, 0xad25, 0xa438, 0x50d2, 0xa438, 0x0002, ++ 0xa438, 0x878d, 0xa438, 0xac4f, 0xa438, 0x02ae, 0xa438, 0x0bef, ++ 0xa438, 0x46f6, 0xa438, 0x273c, 0xa438, 0x1000, 0xa438, 0xab31, ++ 0xa438, 0xae29, 0xa438, 0xe08f, 0xa438, 0xede1, 0xa438, 0x8fee, ++ 0xa438, 0x1b46, 0xa438, 0xab1f, 0xa438, 0xa200, 0xa438, 0x04ef, ++ 0xa438, 0x32ae, 0xa438, 0x02d3, 0xa438, 0x010c, 0xa438, 0x31bf, ++ 0xa438, 0x8fed, 0xa438, 0x1a93, 0xa438, 0xd819, 0xa438, 0xd91b, ++ 0xa438, 0x46ab, 0xa438, 0x0e19, 0xa438, 0xd819, 0xa438, 0xd91b, ++ 0xa438, 0x46aa, 0xa438, 0x0612, 0xa438, 0xa205, 0xa438, 0xc0ae, ++ 0xa438, 0x0cbf, 0xa438, 0x5712, 0xa438, 0x0258, 0xa438, 0xb1bf, ++ 0xa438, 0x5712, 0xa438, 0x0258, 0xa438, 0xa8fe, 0xa438, 0xef96, ++ 0xa438, 0xfffe, 0xa438, 0xfdfc, 0xa438, 0x04f8, 0xa438, 0xfbef, ++ 0xa438, 0x79a2, 0xa438, 0x0005, 0xa438, 0xbf89, 0xa438, 0x1fae, ++ 0xa438, 0x1ba2, 0xa438, 0x0105, 0xa438, 0xbf89, 0xa438, 0x22ae, ++ 0xa438, 0x13a2, 0xa438, 0x0205, 0xa438, 0xbf89, 0xa438, 0x25ae, ++ 0xa438, 0x0ba2, 0xa438, 0x0305, 0xa438, 0xbf89, 0xa438, 0x28ae, ++ 0xa438, 0x03bf, 0xa438, 0x892b, 0xa438, 0x0252, 0xa438, 0xc8ef, ++ 0xa438, 0x64ef, 0xa438, 0x97ff, 0xa438, 0xfc04, 0xa438, 0xf8f9, ++ 0xa438, 0xfaef, 0xa438, 0x69fa, 0xa438, 0xe080, 0xa438, 0x15ad, ++ 0xa438, 0x2628, 0xa438, 0xe081, 0xa438, 0xabe1, 0xa438, 0x81ac, ++ 0xa438, 0xef64, 0xa438, 0xbf57, 0xa438, 0x1802, 0xa438, 0x52c8, ++ 0xa438, 0x1b46, 0xa438, 0xaa0a, 0xa438, 0xbf57, 0xa438, 0x1b02, ++ 0xa438, 0x52c8, 0xa438, 0x1b46, 0xa438, 0xab0c, 0xa438, 0xbf57, ++ 0xa438, 0x1502, 0xa438, 0x58b1, 0xa438, 0xbf57, 0xa438, 0x1502, ++ 0xa438, 0x58a8, 0xa438, 0xfeef, 0xa438, 0x96fe, 0xa438, 0xfdfc, ++ 0xa438, 0x04f8, 0xa438, 0xf9ef, 0xa438, 0x59f9, 0xa438, 0xe080, ++ 0xa438, 0x15ad, 0xa438, 0x2622, 0xa438, 0xbf53, 0xa438, 0x2202, ++ 0xa438, 0x52c8, 0xa438, 0x3972, 0xa438, 0x9e10, 0xa438, 0xe083, ++ 0xa438, 0xc9ac, 0xa438, 0x2605, 0xa438, 0x0288, 0xa438, 0x2cae, ++ 0xa438, 0x0d02, 0xa438, 0x8870, 0xa438, 0xae08, 0xa438, 0xe283, ++ 0xa438, 0xc9f6, 0xa438, 0x36e6, 0xa438, 0x83c9, 0xa438, 0xfdef, ++ 0xa438, 0x95fd, 0xa438, 0xfc04, 0xa438, 0xf8f9, 0xa438, 0xfafb, ++ 0xa438, 0xef79, 0xa438, 0xfbbf, 0xa438, 0x5718, 0xa438, 0x0252, ++ 0xa438, 
0xc8ef, 0xa438, 0x64e2, 0xa438, 0x8fe5, 0xa438, 0xe38f, ++ 0xa438, 0xe61b, 0xa438, 0x659e, 0xa438, 0x10e4, 0xa438, 0x8fe5, ++ 0xa438, 0xe58f, 0xa438, 0xe6e2, 0xa438, 0x83c9, 0xa438, 0xf636, ++ 0xa438, 0xe683, 0xa438, 0xc9ae, 0xa438, 0x13e2, 0xa438, 0x83c9, ++ 0xa438, 0xf736, 0xa438, 0xe683, 0xa438, 0xc902, 0xa438, 0x5820, ++ 0xa438, 0xef57, 0xa438, 0xe68f, 0xa438, 0xe7e7, 0xa438, 0x8fe8, ++ 0xa438, 0xffef, 0xa438, 0x97ff, 0xa438, 0xfefd, 0xa438, 0xfc04, ++ 0xa438, 0xf8f9, 0xa438, 0xfafb, 0xa438, 0xef79, 0xa438, 0xfbe2, ++ 0xa438, 0x8fe7, 0xa438, 0xe38f, 0xa438, 0xe8ef, 0xa438, 0x65e2, ++ 0xa438, 0x81b8, 0xa438, 0xe381, 0xa438, 0xb9ef, 0xa438, 0x7502, ++ 0xa438, 0x583b, 0xa438, 0xac50, 0xa438, 0x1abf, 0xa438, 0x5718, ++ 0xa438, 0x0252, 0xa438, 0xc8ef, 0xa438, 0x64e2, 0xa438, 0x8fe5, ++ 0xa438, 0xe38f, 0xa438, 0xe61b, 0xa438, 0x659e, 0xa438, 0x1ce4, ++ 0xa438, 0x8fe5, 0xa438, 0xe58f, 0xa438, 0xe6ae, 0xa438, 0x0cbf, ++ 0xa438, 0x5715, 0xa438, 0x0258, 0xa438, 0xb1bf, 0xa438, 0x5715, ++ 0xa438, 0x0258, 0xa438, 0xa8e2, 0xa438, 0x83c9, 0xa438, 0xf636, ++ 0xa438, 0xe683, 0xa438, 0xc9ff, 0xa438, 0xef97, 0xa438, 0xfffe, ++ 0xa438, 0xfdfc, 0xa438, 0x04f8, 0xa438, 0xf9fa, 0xa438, 0xef69, ++ 0xa438, 0xe080, 0xa438, 0x15ad, 0xa438, 0x264b, 0xa438, 0xbf53, ++ 0xa438, 0xca02, 0xa438, 0x52c8, 0xa438, 0xad28, 0xa438, 0x42bf, ++ 0xa438, 0x8931, 0xa438, 0x0252, 0xa438, 0xc8ef, 0xa438, 0x54bf, ++ 0xa438, 0x576c, 0xa438, 0x0252, 0xa438, 0xc8a1, 0xa438, 0x001b, ++ 0xa438, 0xbf53, 0xa438, 0x4c02, 0xa438, 0x52c8, 0xa438, 0xac29, ++ 0xa438, 0x0dac, 0xa438, 0x2805, 0xa438, 0xa302, 0xa438, 0x16ae, ++ 0xa438, 0x20a3, 0xa438, 0x0311, 0xa438, 0xae1b, 0xa438, 0xa304, ++ 0xa438, 0x0cae, 0xa438, 0x16a3, 0xa438, 0x0802, 0xa438, 0xae11, ++ 0xa438, 0xa309, 0xa438, 0x02ae, 0xa438, 0x0cbf, 0xa438, 0x5715, ++ 0xa438, 0x0258, 0xa438, 0xb1bf, 0xa438, 0x5715, 0xa438, 0x0258, ++ 0xa438, 0xa8ef, 0xa438, 0x96fe, 0xa438, 0xfdfc, 0xa438, 0x04f0, ++ 0xa438, 0xa300, 0xa438, 0xf0a3, 0xa438, 0x02f0, 0xa438, 0xa304, ++ 0xa438, 0xf0a3, 0xa438, 0x06f0, 0xa438, 0xa308, 0xa438, 0xf0a2, ++ 0xa438, 0x8074, 0xa438, 0xa600, 0xa438, 0xac4f, 0xa438, 0x02ae, ++ 0xa438, 0x0bef, 0xa438, 0x46f6, 0xa438, 0x273c, 0xa438, 0x1000, ++ 0xa438, 0xab1b, 0xa438, 0xae16, 0xa438, 0xe081, 0xa438, 0xabe1, ++ 0xa438, 0x81ac, 0xa438, 0x1b46, 0xa438, 0xab0c, 0xa438, 0xac32, ++ 0xa438, 0x04ef, 0xa438, 0x32ae, 0xa438, 0x02d3, 0xa438, 0x04af, ++ 0xa438, 0x486c, 0xa438, 0xaf48, 0xa438, 0x82af, 0xa438, 0x4888, ++ 0xa438, 0xe081, 0xa438, 0x9be1, 0xa438, 0x819c, 0xa438, 0xe28f, ++ 0xa438, 0xe3ad, 0xa438, 0x3009, 0xa438, 0x1f55, 0xa438, 0xe38f, ++ 0xa438, 0xe20c, 0xa438, 0x581a, 0xa438, 0x45e4, 0xa438, 0x83a6, ++ 0xa438, 0xe583, 0xa438, 0xa7af, 0xa438, 0x2a75, 0xa438, 0xe08f, ++ 0xa438, 0xe3ad, 0xa438, 0x201c, 0xa438, 0x1f44, 0xa438, 0xe18f, ++ 0xa438, 0xe10c, 0xa438, 0x44ef, 0xa438, 0x64e0, 0xa438, 0x8232, ++ 0xa438, 0xe182, 0xa438, 0x331b, 0xa438, 0x649f, 0xa438, 0x091f, ++ 0xa438, 0x44e1, 0xa438, 0x8fe2, 0xa438, 0x0c48, 0xa438, 0x1b54, ++ 0xa438, 0xe683, 0xa438, 0xa6e7, 0xa438, 0x83a7, 0xa438, 0xaf2b, ++ 0xa438, 0xd900, 0xa436, 0xb818, 0xa438, 0x043d, 0xa436, 0xb81a, ++ 0xa438, 0x06a3, 0xa436, 0xb81c, 0xa438, 0x476d, 0xa436, 0xb81e, ++ 0xa438, 0x4852, 0xa436, 0xb850, 0xa438, 0x2A69, 0xa436, 0xb852, ++ 0xa438, 0x2BD3, 0xa436, 0xb878, 0xa438, 0xffff, 0xa436, 0xb884, ++ 0xa438, 0xffff, 0xa436, 0xb832, 0xa438, 0x003f, 0xb844, 0xffff, ++ 0xa436, 0x8fe9, 0xa438, 0x0000, 0xa436, 0x8feb, 0xa438, 0x02fe, ++ 0xa436, 0x8fed, 0xa438, 0x0019, 0xa436, 0x8fef, 0xa438, 0x0bdb, ++ 0xa436, 
0x8ff1, 0xa438, 0x0ca4, 0xa436, 0x0000, 0xa438, 0x0000, ++ 0xa436, 0xB82E, 0xa438, 0x0000, 0xa436, 0x8024, 0xa438, 0x0000, ++ 0xa436, 0x801E, 0xa438, 0x0024, 0xb820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_1_1[] = { ++ 0xa436, 0x8023, 0xa438, 0x3800, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x8018, 0xa438, 0x1800, 0xa438, 0x8021, ++ 0xa438, 0x1800, 0xa438, 0x8029, 0xa438, 0x1800, 0xa438, 0x8031, ++ 0xa438, 0x1800, 0xa438, 0x8035, 0xa438, 0x1800, 0xa438, 0x819c, ++ 0xa438, 0x1800, 0xa438, 0x81e9, 0xa438, 0xd711, 0xa438, 0x6081, ++ 0xa438, 0x8904, 0xa438, 0x1800, 0xa438, 0x2021, 0xa438, 0xa904, ++ 0xa438, 0x1800, 0xa438, 0x2021, 0xa438, 0xd75f, 0xa438, 0x4083, ++ 0xa438, 0xd503, 0xa438, 0xa908, 0xa438, 0x87f0, 0xa438, 0x1000, ++ 0xa438, 0x17e0, 0xa438, 0x1800, 0xa438, 0x13c3, 0xa438, 0xd707, ++ 0xa438, 0x2005, 0xa438, 0x8027, 0xa438, 0xd75e, 0xa438, 0x1800, ++ 0xa438, 0x1434, 0xa438, 0x1800, 0xa438, 0x14a5, 0xa438, 0xc504, ++ 0xa438, 0xce20, 0xa438, 0xcf01, 0xa438, 0xd70a, 0xa438, 0x4005, ++ 0xa438, 0xcf02, 0xa438, 0x1800, 0xa438, 0x1c50, 0xa438, 0xa980, ++ 0xa438, 0xd500, 0xa438, 0x1800, 0xa438, 0x14f3, 0xa438, 0xd75e, ++ 0xa438, 0x67b1, 0xa438, 0xd504, 0xa438, 0xd71e, 0xa438, 0x65bb, ++ 0xa438, 0x63da, 0xa438, 0x61f9, 0xa438, 0x0cf0, 0xa438, 0x0c10, ++ 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0808, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0x0cf0, 0xa438, 0x0470, 0xa438, 0x0cf0, ++ 0xa438, 0x0430, 0xa438, 0x0cf0, 0xa438, 0x0410, 0xa438, 0xf02a, ++ 0xa438, 0x0cf0, 0xa438, 0x0c20, 0xa438, 0xd505, 0xa438, 0x0c0f, ++ 0xa438, 0x0804, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x0cf0, ++ 0xa438, 0x0470, 0xa438, 0x0cf0, 0xa438, 0x0430, 0xa438, 0x0cf0, ++ 0xa438, 0x0420, 0xa438, 0xf01c, 0xa438, 0x0cf0, 0xa438, 0x0c40, ++ 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0802, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0x0cf0, 0xa438, 0x0470, 0xa438, 0x0cf0, ++ 0xa438, 0x0450, 0xa438, 0x0cf0, 0xa438, 0x0440, 0xa438, 0xf00e, ++ 0xa438, 0x0cf0, 0xa438, 0x0c80, 0xa438, 0xd505, 0xa438, 0x0c0f, ++ 0xa438, 0x0801, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x0cf0, ++ 0xa438, 0x04b0, 0xa438, 0x0cf0, 0xa438, 0x0490, 0xa438, 0x0cf0, ++ 0xa438, 0x0480, 0xa438, 0xd501, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0xc48e, 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, ++ 0xa438, 0x5faf, 0xa438, 0xd504, 0xa438, 0x8e01, 0xa438, 0x8c0f, ++ 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x17e0, 0xa438, 0xd504, ++ 0xa438, 0xd718, 0xa438, 0x4074, 0xa438, 0x6195, 0xa438, 0xf005, ++ 0xa438, 0x60f5, 0xa438, 0x0c03, 0xa438, 0x0d00, 0xa438, 0xf009, ++ 0xa438, 0x0c03, 0xa438, 0x0d01, 0xa438, 0xf006, 0xa438, 0x0c03, ++ 0xa438, 0x0d02, 0xa438, 0xf003, 0xa438, 0x0c03, 0xa438, 0x0d03, ++ 0xa438, 0xd500, 0xa438, 0xd706, 0xa438, 0x2529, 0xa438, 0x809c, ++ 0xa438, 0xd718, 0xa438, 0x607b, 0xa438, 0x40da, 0xa438, 0xf00f, ++ 0xa438, 0x431a, 0xa438, 0xf021, 0xa438, 0xd718, 0xa438, 0x617b, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0x1000, 0xa438, 0x1ad1, ++ 0xa438, 0xd718, 0xa438, 0x608e, 0xa438, 0xd73e, 0xa438, 0x5f34, ++ 0xa438, 0xf020, 0xa438, 0xf053, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0x1000, 0xa438, 0x1ad1, 0xa438, 0xd718, 0xa438, 0x608e, ++ 0xa438, 0xd73e, 0xa438, 0x5f34, 0xa438, 0xf023, 0xa438, 0xf067, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0x1000, 0xa438, 0x1ad1, ++ 0xa438, 0xd718, 0xa438, 0x608e, 0xa438, 0xd73e, 0xa438, 0x5f34, ++ 0xa438, 0xf026, 
0xa438, 0xf07b, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0x1000, 0xa438, 0x1ad1, 0xa438, 0xd718, 0xa438, 0x608e, ++ 0xa438, 0xd73e, 0xa438, 0x5f34, 0xa438, 0xf029, 0xa438, 0xf08f, ++ 0xa438, 0x1000, 0xa438, 0x8173, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0xd73e, 0xa438, 0x7fb4, 0xa438, 0x1000, 0xa438, 0x8188, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, 0xa438, 0x5fae, ++ 0xa438, 0xf028, 0xa438, 0x1000, 0xa438, 0x8173, 0xa438, 0x1000, ++ 0xa438, 0x1a41, 0xa438, 0xd73e, 0xa438, 0x7fb4, 0xa438, 0x1000, ++ 0xa438, 0x8188, 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, ++ 0xa438, 0x5fae, 0xa438, 0xf039, 0xa438, 0x1000, 0xa438, 0x8173, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd73e, 0xa438, 0x7fb4, ++ 0xa438, 0x1000, 0xa438, 0x8188, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0xd718, 0xa438, 0x5fae, 0xa438, 0xf04a, 0xa438, 0x1000, ++ 0xa438, 0x8173, 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd73e, ++ 0xa438, 0x7fb4, 0xa438, 0x1000, 0xa438, 0x8188, 0xa438, 0x1000, ++ 0xa438, 0x1a41, 0xa438, 0xd718, 0xa438, 0x5fae, 0xa438, 0xf05b, ++ 0xa438, 0xd719, 0xa438, 0x4119, 0xa438, 0xd504, 0xa438, 0xac01, ++ 0xa438, 0xae01, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a2f, ++ 0xa438, 0xf00a, 0xa438, 0xd719, 0xa438, 0x4118, 0xa438, 0xd504, ++ 0xa438, 0xac11, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xa410, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0xd718, 0xa438, 0x5fb0, 0xa438, 0xd505, 0xa438, 0xd719, ++ 0xa438, 0x4079, 0xa438, 0xa80f, 0xa438, 0xf05d, 0xa438, 0x4b98, ++ 0xa438, 0xa808, 0xa438, 0xf05a, 0xa438, 0xd719, 0xa438, 0x4119, ++ 0xa438, 0xd504, 0xa438, 0xac02, 0xa438, 0xae01, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a2f, 0xa438, 0xf00a, 0xa438, 0xd719, ++ 0xa438, 0x4118, 0xa438, 0xd504, 0xa438, 0xac22, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa420, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, 0xa438, 0x5fb0, ++ 0xa438, 0xd505, 0xa438, 0xd719, 0xa438, 0x4079, 0xa438, 0xa80f, ++ 0xa438, 0xf03f, 0xa438, 0x47d8, 0xa438, 0xa804, 0xa438, 0xf03c, ++ 0xa438, 0xd719, 0xa438, 0x4119, 0xa438, 0xd504, 0xa438, 0xac04, ++ 0xa438, 0xae01, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a2f, ++ 0xa438, 0xf00a, 0xa438, 0xd719, 0xa438, 0x4118, 0xa438, 0xd504, ++ 0xa438, 0xac44, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xa440, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0xd718, 0xa438, 0x5fb0, 0xa438, 0xd505, 0xa438, 0xd719, ++ 0xa438, 0x4079, 0xa438, 0xa80f, 0xa438, 0xf021, 0xa438, 0x4418, ++ 0xa438, 0xa802, 0xa438, 0xf01e, 0xa438, 0xd719, 0xa438, 0x4119, ++ 0xa438, 0xd504, 0xa438, 0xac08, 0xa438, 0xae01, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a2f, 0xa438, 0xf00a, 0xa438, 0xd719, ++ 0xa438, 0x4118, 0xa438, 0xd504, 0xa438, 0xac88, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa480, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, 0xa438, 0x5fb0, ++ 0xa438, 0xd505, 0xa438, 0xd719, 0xa438, 0x4079, 0xa438, 0xa80f, ++ 0xa438, 0xf003, 0xa438, 0x4058, 0xa438, 0xa801, 0xa438, 0x1800, ++ 0xa438, 0x16ed, 0xa438, 0xd73e, 0xa438, 0xd505, 0xa438, 0x3088, ++ 0xa438, 0x817a, 0xa438, 0x6193, 0xa438, 0x6132, 0xa438, 0x60d1, ++ 0xa438, 0x3298, 0xa438, 0x8185, 0xa438, 0xf00a, 0xa438, 0xa808, ++ 0xa438, 0xf008, 0xa438, 0xa804, 0xa438, 0xf006, 0xa438, 0xa802, ++ 0xa438, 0xf004, 0xa438, 0xa801, 0xa438, 0xf002, 0xa438, 0xa80f, ++ 0xa438, 0xd500, 0xa438, 0x0800, 0xa438, 0xd505, 0xa438, 0xd75e, ++ 0xa438, 0x6211, 0xa438, 0xd71e, 0xa438, 0x619b, 0xa438, 0x611a, ++ 0xa438, 0x6099, 
0xa438, 0x0c0f, 0xa438, 0x0808, 0xa438, 0xf009, ++ 0xa438, 0x0c0f, 0xa438, 0x0804, 0xa438, 0xf006, 0xa438, 0x0c0f, ++ 0xa438, 0x0802, 0xa438, 0xf003, 0xa438, 0x0c0f, 0xa438, 0x0801, ++ 0xa438, 0xd500, 0xa438, 0x0800, 0xa438, 0xd500, 0xa438, 0xc48d, ++ 0xa438, 0xd504, 0xa438, 0x8d03, 0xa438, 0xd701, 0xa438, 0x4045, ++ 0xa438, 0xad02, 0xa438, 0xd504, 0xa438, 0xd706, 0xa438, 0x2529, ++ 0xa438, 0x81ad, 0xa438, 0xd718, 0xa438, 0x607b, 0xa438, 0x40da, ++ 0xa438, 0xf013, 0xa438, 0x441a, 0xa438, 0xf02d, 0xa438, 0xd718, ++ 0xa438, 0x61fb, 0xa438, 0xbb01, 0xa438, 0xd75e, 0xa438, 0x6171, ++ 0xa438, 0x0cf0, 0xa438, 0x0c10, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0x0cf0, 0xa438, 0x0410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0808, 0xa438, 0xf02a, 0xa438, 0xbb02, ++ 0xa438, 0xd75e, 0xa438, 0x6171, 0xa438, 0x0cf0, 0xa438, 0x0c20, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x0cf0, 0xa438, 0x0420, ++ 0xa438, 0xce00, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0804, ++ 0xa438, 0xf01c, 0xa438, 0xbb04, 0xa438, 0xd75e, 0xa438, 0x6171, ++ 0xa438, 0x0cf0, 0xa438, 0x0c40, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0x0cf0, 0xa438, 0x0440, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0802, 0xa438, 0xf00e, 0xa438, 0xbb08, ++ 0xa438, 0xd75e, 0xa438, 0x6171, 0xa438, 0x0cf0, 0xa438, 0x0c80, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x0cf0, 0xa438, 0x0480, ++ 0xa438, 0xce00, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0801, ++ 0xa438, 0xd500, 0xa438, 0x1800, 0xa438, 0x1616, 0xa436, 0xA026, ++ 0xa438, 0xffff, 0xa436, 0xA024, 0xa438, 0x15d8, 0xa436, 0xA022, ++ 0xa438, 0x161f, 0xa436, 0xA020, 0xa438, 0x14f2, 0xa436, 0xA006, ++ 0xa438, 0x1c4f, 0xa436, 0xA004, 0xa438, 0x1433, 0xa436, 0xA002, ++ 0xa438, 0x13c1, 0xa436, 0xA000, 0xa438, 0x2020, 0xa436, 0xA008, ++ 0xa438, 0x7f00, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x07f8, 0xa436, 0xA014, 0xa438, 0xd04d, 0xa438, 0x8904, ++ 0xa438, 0x813C, 0xa438, 0xA13D, 0xa438, 0xcc01, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa436, 0xA152, 0xa438, 0x1384, ++ 0xa436, 0xA154, 0xa438, 0x1fa8, 0xa436, 0xA156, 0xa438, 0x218B, ++ 0xa436, 0xA158, 0xa438, 0x21B8, 0xa436, 0xA15A, 0xa438, 0x021c, ++ 0xa436, 0xA15C, 0xa438, 0x3fff, 0xa436, 0xA15E, 0xa438, 0x3fff, ++ 0xa436, 0xA160, 0xa438, 0x3fff, 0xa436, 0xA150, 0xa438, 0x001f, ++ 0xa436, 0xA016, 0xa438, 0x0010, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x8013, 0xa438, 0x1800, 0xa438, 0x803a, 0xa438, 0x1800, ++ 0xa438, 0x8045, 0xa438, 0x1800, 0xa438, 0x8049, 0xa438, 0x1800, ++ 0xa438, 0x804d, 0xa438, 0x1800, 0xa438, 0x8059, 0xa438, 0x1800, ++ 0xa438, 0x805d, 0xa438, 0xc2ff, 0xa438, 0x1800, 0xa438, 0x0042, ++ 0xa438, 0x1000, 0xa438, 0x02e5, 0xa438, 0x1000, 0xa438, 0x02b4, ++ 0xa438, 0xd701, 0xa438, 0x40e3, 0xa438, 0xd700, 0xa438, 0x5f6c, ++ 0xa438, 0x1000, 0xa438, 0x8021, 0xa438, 0x1800, 0xa438, 0x0073, ++ 0xa438, 0x1800, 0xa438, 0x0084, 0xa438, 0xd701, 0xa438, 0x4061, ++ 0xa438, 0xba0f, 0xa438, 0xf004, 0xa438, 0x4060, 0xa438, 0x1000, ++ 0xa438, 0x802a, 0xa438, 0xba10, 0xa438, 0x0800, 0xa438, 0xd700, ++ 0xa438, 0x60bb, 0xa438, 0x611c, 0xa438, 0x0c0f, 0xa438, 0x1a01, ++ 0xa438, 0xf00a, 0xa438, 0x60fc, 0xa438, 0x0c0f, 0xa438, 0x1a02, ++ 0xa438, 0xf006, 0xa438, 0x0c0f, 0xa438, 0x1a04, 0xa438, 0xf003, ++ 0xa438, 0x0c0f, 0xa438, 0x1a08, 0xa438, 0x0800, 0xa438, 0x0c0f, ++ 0xa438, 0x0504, 0xa438, 0xad02, 0xa438, 0x1000, 0xa438, 0x02c0, ++ 0xa438, 0xd700, 0xa438, 0x5fac, 0xa438, 0x1000, 0xa438, 0x8021, ++ 0xa438, 0x1800, 
0xa438, 0x0139, 0xa438, 0x9a1f, 0xa438, 0x8bf0, ++ 0xa438, 0x1800, 0xa438, 0x02df, 0xa438, 0x9a1f, 0xa438, 0x9910, ++ 0xa438, 0x1800, 0xa438, 0x02d7, 0xa438, 0xad02, 0xa438, 0x8d01, ++ 0xa438, 0x9a1f, 0xa438, 0x9910, 0xa438, 0x9860, 0xa438, 0xcb00, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x85f0, 0xa438, 0xd500, ++ 0xa438, 0x1800, 0xa438, 0x015c, 0xa438, 0x8580, 0xa438, 0x8d02, ++ 0xa438, 0x1800, 0xa438, 0x018f, 0xa438, 0x0c0f, 0xa438, 0x0503, ++ 0xa438, 0xad02, 0xa438, 0x1800, 0xa438, 0x00dd, 0xa436, 0xA08E, ++ 0xa438, 0x00db, 0xa436, 0xA08C, 0xa438, 0x018e, 0xa436, 0xA08A, ++ 0xa438, 0x015a, 0xa436, 0xA088, 0xa438, 0x02d6, 0xa436, 0xA086, ++ 0xa438, 0x02de, 0xa436, 0xA084, 0xa438, 0x0137, 0xa436, 0xA082, ++ 0xa438, 0x0071, 0xa436, 0xA080, 0xa438, 0x0041, 0xa436, 0xA090, ++ 0xa438, 0x00ff, 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, ++ 0xa438, 0x1ff8, 0xa436, 0xA014, 0xa438, 0x001c, 0xa438, 0xce15, ++ 0xa438, 0xd105, 0xa438, 0xa410, 0xa438, 0x8320, 0xa438, 0xFFD7, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa436, 0xA164, 0xa438, 0x0260, ++ 0xa436, 0xA166, 0xa438, 0x0add, 0xa436, 0xA168, 0xa438, 0x05CC, ++ 0xa436, 0xA16A, 0xa438, 0x05C5, 0xa436, 0xA16C, 0xa438, 0x0429, ++ 0xa436, 0xA16E, 0xa438, 0x07B6, 0xa436, 0xA170, 0xa438, 0x0259, ++ 0xa436, 0xA172, 0xa438, 0x3fff, 0xa436, 0xA162, 0xa438, 0x003f, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x8023, 0xa438, 0x1800, 0xa438, 0x814c, 0xa438, 0x1800, ++ 0xa438, 0x8156, 0xa438, 0x1800, 0xa438, 0x815e, 0xa438, 0x1800, ++ 0xa438, 0x8210, 0xa438, 0x1800, 0xa438, 0x8221, 0xa438, 0x1800, ++ 0xa438, 0x822f, 0xa438, 0xa801, 0xa438, 0x9308, 0xa438, 0xb201, ++ 0xa438, 0xb301, 0xa438, 0xd701, 0xa438, 0x4000, 0xa438, 0xd2ff, ++ 0xa438, 0xb302, 0xa438, 0xd200, 0xa438, 0xb201, 0xa438, 0xb309, ++ 0xa438, 0xd701, 0xa438, 0x4000, 0xa438, 0xd2ff, 0xa438, 0xb302, ++ 0xa438, 0xd200, 0xa438, 0xa800, 0xa438, 0x1800, 0xa438, 0x0031, ++ 0xa438, 0xd700, 0xa438, 0x4543, 0xa438, 0xd71f, 0xa438, 0x40fe, ++ 0xa438, 0xd1b7, 0xa438, 0xd049, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd700, 0xa438, 0x5fbb, 0xa438, 0xa220, 0xa438, 0x8501, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x0c70, 0xa438, 0x0b00, ++ 0xa438, 0x0c07, 0xa438, 0x0604, 0xa438, 0x9503, 0xa438, 0xa510, ++ 0xa438, 0xce49, 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x8520, ++ 0xa438, 0xa520, 0xa438, 0xa501, 0xa438, 0xd105, 0xa438, 0xd047, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd707, 0xa438, 0x6087, ++ 0xa438, 0xd700, 0xa438, 0x5f7b, 0xa438, 0xffe9, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x8501, 0xa438, 0xd707, 0xa438, 0x5e08, ++ 0xa438, 0x8530, 0xa438, 0xba20, 0xa438, 0xf00c, 0xa438, 0xd700, ++ 0xa438, 0x4098, 0xa438, 0xd1ef, 0xa438, 0xd047, 0xa438, 0xf003, ++ 0xa438, 0xd1db, 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd700, 0xa438, 0x5fbb, 0xa438, 0x8980, 0xa438, 0xd702, ++ 0xa438, 0x6126, 0xa438, 0xd704, 0xa438, 0x4063, 0xa438, 0xd702, ++ 0xa438, 0x6060, 0xa438, 0xd702, 0xa438, 0x6077, 0xa438, 0x8410, ++ 0xa438, 0xf002, 0xa438, 0xa410, 0xa438, 0xce02, 0xa438, 0x1000, ++ 0xa438, 0x10be, 0xa438, 0xcd81, 0xa438, 0xd412, 0xa438, 0x1000, ++ 0xa438, 0x1069, 0xa438, 0xcd82, 0xa438, 0xd40e, 0xa438, 0x1000, ++ 0xa438, 0x1069, 0xa438, 0xcd83, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd71f, 0xa438, 0x5fb4, 0xa438, 0xd702, 0xa438, 0x6c26, ++ 0xa438, 0xd704, 0xa438, 0x4063, 0xa438, 0xd702, 0xa438, 0x6060, ++ 0xa438, 0xd702, 0xa438, 0x6b77, 0xa438, 0xa340, 0xa438, 0x0c06, ++ 0xa438, 0x0102, 
0xa438, 0xce01, 0xa438, 0x1000, 0xa438, 0x10be, ++ 0xa438, 0xa240, 0xa438, 0xa902, 0xa438, 0xa204, 0xa438, 0xa280, ++ 0xa438, 0xa364, 0xa438, 0xab02, 0xa438, 0x8380, 0xa438, 0xa00a, ++ 0xa438, 0xcd8d, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, ++ 0xa438, 0x5fb5, 0xa438, 0xb920, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd71f, 0xa438, 0x7fb4, 0xa438, 0x9920, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xd71f, 0xa438, 0x6065, 0xa438, 0x7c74, ++ 0xa438, 0xfffb, 0xa438, 0xb820, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd71f, 0xa438, 0x7fa5, 0xa438, 0x9820, 0xa438, 0xa410, ++ 0xa438, 0x8902, 0xa438, 0xa120, 0xa438, 0xa380, 0xa438, 0xce02, ++ 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x8280, 0xa438, 0xa324, ++ 0xa438, 0xab02, 0xa438, 0xa00a, 0xa438, 0x8118, 0xa438, 0x863f, ++ 0xa438, 0x87fb, 0xa438, 0xcd8e, 0xa438, 0xd193, 0xa438, 0xd047, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x10a3, ++ 0xa438, 0xd700, 0xa438, 0x5f7b, 0xa438, 0xa280, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x10a3, 0xa438, 0xd706, ++ 0xa438, 0x5f78, 0xa438, 0xa210, 0xa438, 0xd700, 0xa438, 0x6083, ++ 0xa438, 0xd101, 0xa438, 0xd047, 0xa438, 0xf003, 0xa438, 0xd160, ++ 0xa438, 0xd04b, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, ++ 0xa438, 0x10a3, 0xa438, 0xd700, 0xa438, 0x5f7b, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x10a3, 0xa438, 0xd706, ++ 0xa438, 0x5f79, 0xa438, 0x8120, 0xa438, 0xbb20, 0xa438, 0xf04c, ++ 0xa438, 0xa00a, 0xa438, 0xa340, 0xa438, 0x0c06, 0xa438, 0x0102, ++ 0xa438, 0xa240, 0xa438, 0xa290, 0xa438, 0xa324, 0xa438, 0xab02, ++ 0xa438, 0xd13e, 0xa438, 0xd05a, 0xa438, 0xd13e, 0xa438, 0xd06b, ++ 0xa438, 0xcd84, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, ++ 0xa438, 0x6079, 0xa438, 0xd700, 0xa438, 0x5f5c, 0xa438, 0xcd8a, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, 0xa438, 0x6079, ++ 0xa438, 0xd700, 0xa438, 0x5f5d, 0xa438, 0xcd8b, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xcd8c, 0xa438, 0xd700, 0xa438, 0x6050, ++ 0xa438, 0xab04, 0xa438, 0xd700, 0xa438, 0x4083, 0xa438, 0xd160, ++ 0xa438, 0xd04b, 0xa438, 0xf003, 0xa438, 0xd193, 0xa438, 0xd047, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd700, 0xa438, 0x5fbb, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x8410, 0xa438, 0xd71f, ++ 0xa438, 0x5f94, 0xa438, 0xb920, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd71f, 0xa438, 0x7fb4, 0xa438, 0x9920, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xd71f, 0xa438, 0x6105, 0xa438, 0x6054, ++ 0xa438, 0xfffb, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, ++ 0xa438, 0x5fb9, 0xa438, 0xfff0, 0xa438, 0xa410, 0xa438, 0xb820, ++ 0xa438, 0xcd85, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd71f, ++ 0xa438, 0x7fa5, 0xa438, 0x9820, 0xa438, 0xbb20, 0xa438, 0xd105, ++ 0xa438, 0xd042, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, ++ 0xa438, 0x5fbb, 0xa438, 0x5f85, 0xa438, 0xd700, 0xa438, 0x5f5b, ++ 0xa438, 0xd700, 0xa438, 0x6090, 0xa438, 0xd700, 0xa438, 0x4043, ++ 0xa438, 0xaa20, 0xa438, 0xcd86, 0xa438, 0xd700, 0xa438, 0x6083, ++ 0xa438, 0xd1c7, 0xa438, 0xd045, 0xa438, 0xf003, 0xa438, 0xd17a, ++ 0xa438, 0xd04b, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd700, ++ 0xa438, 0x5fbb, 0xa438, 0x0c18, 0xa438, 0x0108, 0xa438, 0x0c3f, ++ 0xa438, 0x0609, 0xa438, 0x0cfb, 0xa438, 0x0729, 0xa438, 0xa308, ++ 0xa438, 0x8320, 0xa438, 0xd105, 0xa438, 0xd042, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xd700, 0xa438, 0x5fbb, 0xa438, 0x1800, ++ 0xa438, 0x08f7, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, ++ 0xa438, 0x10a3, 0xa438, 0xd700, 0xa438, 0x607b, 0xa438, 0xd700, ++ 0xa438, 0x5f2b, 
0xa438, 0x1800, 0xa438, 0x0a81, 0xa438, 0xd700, ++ 0xa438, 0x40bd, 0xa438, 0xd707, 0xa438, 0x4065, 0xa438, 0x1800, ++ 0xa438, 0x1121, 0xa438, 0x1800, 0xa438, 0x1124, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8f80, 0xa438, 0x9503, 0xa438, 0xd705, ++ 0xa438, 0x641d, 0xa438, 0xd704, 0xa438, 0x62b2, 0xa438, 0xd702, ++ 0xa438, 0x4116, 0xa438, 0xce15, 0xa438, 0x1000, 0xa438, 0x10be, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8f40, 0xa438, 0x9503, ++ 0xa438, 0xa00a, 0xa438, 0xd704, 0xa438, 0x4247, 0xa438, 0xd700, ++ 0xa438, 0x3691, 0xa438, 0x8183, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa570, 0xa438, 0x9503, 0xa438, 0xf00a, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xaf40, 0xa438, 0x9503, 0xa438, 0x800a, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8570, 0xa438, 0x9503, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x1108, ++ 0xa438, 0xcd64, 0xa438, 0xd704, 0xa438, 0x3398, 0xa438, 0x8203, ++ 0xa438, 0xd71f, 0xa438, 0x620e, 0xa438, 0xd704, 0xa438, 0x6096, ++ 0xa438, 0xd705, 0xa438, 0x6051, 0xa438, 0xf004, 0xa438, 0xd705, ++ 0xa438, 0x605d, 0xa438, 0xf008, 0xa438, 0xd706, 0xa438, 0x609d, ++ 0xa438, 0xd705, 0xa438, 0x405f, 0xa438, 0xf003, 0xa438, 0xd700, ++ 0xa438, 0x58fb, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xc7aa, ++ 0xa438, 0x9503, 0xa438, 0xd71f, 0xa438, 0x6d2e, 0xa438, 0xd704, ++ 0xa438, 0x6096, 0xa438, 0xd705, 0xa438, 0x6051, 0xa438, 0xf005, ++ 0xa438, 0xd705, 0xa438, 0x607d, 0xa438, 0x1800, 0xa438, 0x0cc7, ++ 0xa438, 0xd706, 0xa438, 0x60bd, 0xa438, 0xd705, 0xa438, 0x407f, ++ 0xa438, 0x1800, 0xa438, 0x0e42, 0xa438, 0xd702, 0xa438, 0x40a4, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8e20, 0xa438, 0x9503, ++ 0xa438, 0xd702, 0xa438, 0x40a5, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8e40, 0xa438, 0x9503, 0xa438, 0xd705, 0xa438, 0x659d, ++ 0xa438, 0xd704, 0xa438, 0x62b2, 0xa438, 0xd702, 0xa438, 0x4116, ++ 0xa438, 0xce15, 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8f40, 0xa438, 0x9503, 0xa438, 0xa00a, ++ 0xa438, 0xd704, 0xa438, 0x4247, 0xa438, 0xd700, 0xa438, 0x3691, ++ 0xa438, 0x81de, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa570, ++ 0xa438, 0x9503, 0xa438, 0xf00a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xaf40, 0xa438, 0x9503, 0xa438, 0x800a, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0xd706, ++ 0xa438, 0x60e4, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x0cf0, ++ 0xa438, 0x07a0, 0xa438, 0x9503, 0xa438, 0xf005, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x87f0, 0xa438, 0x9503, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x1108, 0xa438, 0xcd61, ++ 0xa438, 0xd704, 0xa438, 0x3398, 0xa438, 0x8203, 0xa438, 0xd704, ++ 0xa438, 0x6096, 0xa438, 0xd705, 0xa438, 0x6051, 0xa438, 0xf005, ++ 0xa438, 0xd705, 0xa438, 0x607d, 0xa438, 0x1800, 0xa438, 0x0cc7, ++ 0xa438, 0xd71f, 0xa438, 0x61ce, 0xa438, 0xd706, 0xa438, 0x767d, ++ 0xa438, 0xd705, 0xa438, 0x563f, 0xa438, 0x1800, 0xa438, 0x0e42, ++ 0xa438, 0x800a, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xae40, ++ 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0c47, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xaf80, 0xa438, 0x9503, 0xa438, 0x1800, ++ 0xa438, 0x0b5f, 0xa438, 0x607c, 0xa438, 0x1800, 0xa438, 0x027a, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xae01, 0xa438, 0x9503, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd702, 0xa438, 0x5fa3, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8e01, 0xa438, 0x9503, ++ 0xa438, 0x1800, 0xa438, 0x027d, 0xa438, 0x1000, 0xa438, 0x10be, ++ 0xa438, 0xd702, 0xa438, 0x40a5, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8e40, 
0xa438, 0x9503, 0xa438, 0xd73e, 0xa438, 0x6065, ++ 0xa438, 0x1800, 0xa438, 0x0cea, 0xa438, 0x1800, 0xa438, 0x0cf4, ++ 0xa438, 0xd701, 0xa438, 0x6fd1, 0xa438, 0xd71f, 0xa438, 0x6eee, ++ 0xa438, 0xd707, 0xa438, 0x4d0f, 0xa438, 0xd73e, 0xa438, 0x4cc5, ++ 0xa438, 0xd705, 0xa438, 0x4c99, 0xa438, 0xd704, 0xa438, 0x6c57, ++ 0xa438, 0xd702, 0xa438, 0x6c11, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8c20, 0xa438, 0xa608, 0xa438, 0x9503, 0xa438, 0xa201, ++ 0xa438, 0xa804, 0xa438, 0xd704, 0xa438, 0x40a7, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xa620, 0xa438, 0x9503, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xac40, 0xa438, 0x9503, 0xa438, 0x800a, ++ 0xa438, 0x8290, 0xa438, 0x8306, 0xa438, 0x8b02, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0xce00, ++ 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0xcd99, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x10cc, 0xa438, 0xd701, ++ 0xa438, 0x69f1, 0xa438, 0xd71f, 0xa438, 0x690e, 0xa438, 0xd73e, ++ 0xa438, 0x5ee6, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x87f0, ++ 0xa438, 0x9503, 0xa438, 0xce46, 0xa438, 0x1000, 0xa438, 0x10be, ++ 0xa438, 0xa00a, 0xa438, 0xd704, 0xa438, 0x40a7, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xa570, 0xa438, 0x9503, 0xa438, 0xcd9a, ++ 0xa438, 0xd700, 0xa438, 0x6078, 0xa438, 0xd700, 0xa438, 0x609a, ++ 0xa438, 0xd109, 0xa438, 0xd074, 0xa438, 0xf003, 0xa438, 0xd109, ++ 0xa438, 0xd075, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, ++ 0xa438, 0x10cc, 0xa438, 0xd701, 0xa438, 0x65b1, 0xa438, 0xd71f, ++ 0xa438, 0x64ce, 0xa438, 0xd700, 0xa438, 0x5efe, 0xa438, 0xce00, ++ 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8608, 0xa438, 0x8c40, 0xa438, 0x9503, 0xa438, 0x8201, ++ 0xa438, 0x800a, 0xa438, 0x8290, 0xa438, 0x8306, 0xa438, 0x8b02, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xc7aa, 0xa438, 0x8570, ++ 0xa438, 0x8d08, 0xa438, 0x9503, 0xa438, 0xcd9b, 0xa438, 0x1800, ++ 0xa438, 0x0c8b, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd705, ++ 0xa438, 0x61d9, 0xa438, 0xd704, 0xa438, 0x4193, 0xa438, 0x800a, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xae40, 0xa438, 0x9503, ++ 0xa438, 0x1800, 0xa438, 0x0c47, 0xa438, 0x1800, 0xa438, 0x0df8, ++ 0xa438, 0x1800, 0xa438, 0x8339, 0xa438, 0x0800, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8d08, 0xa438, 0x8f02, 0xa438, 0x8c40, ++ 0xa438, 0x9503, 0xa438, 0x8201, 0xa438, 0xa804, 0xa438, 0xd704, ++ 0xa438, 0x40a7, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa620, ++ 0xa438, 0x9503, 0xa438, 0x800a, 0xa438, 0x8290, 0xa438, 0x8306, ++ 0xa438, 0x8b02, 0xa438, 0x8010, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0xaa03, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xac20, 0xa438, 0xa608, 0xa438, 0x9503, ++ 0xa438, 0xce00, 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0xcd95, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd701, 0xa438, 0x7b91, ++ 0xa438, 0xd71f, 0xa438, 0x7aae, 0xa438, 0xd701, 0xa438, 0x7ab0, ++ 0xa438, 0xd704, 0xa438, 0x7ef3, 0xa438, 0xd701, 0xa438, 0x5eb3, ++ 0xa438, 0x84b0, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa608, ++ 0xa438, 0xc700, 0xa438, 0x9503, 0xa438, 0xce54, 0xa438, 0x1000, ++ 0xa438, 0x10be, 0xa438, 0xa290, 0xa438, 0xa304, 0xa438, 0xab02, ++ 0xa438, 0xd700, 0xa438, 0x6050, 0xa438, 0xab04, 0xa438, 0x0c38, ++ 0xa438, 0x0608, 0xa438, 0xaa0b, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8d01, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xae40, ++ 0xa438, 0x9503, 0xa438, 0xd702, 0xa438, 0x40a4, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8e20, 0xa438, 0x9503, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 
0xa438, 0x8c20, 0xa438, 0x9503, 0xa438, 0xd700, ++ 0xa438, 0x6078, 0xa438, 0xd700, 0xa438, 0x609a, 0xa438, 0xd109, ++ 0xa438, 0xd074, 0xa438, 0xf003, 0xa438, 0xd109, 0xa438, 0xd075, ++ 0xa438, 0xd704, 0xa438, 0x62b2, 0xa438, 0xd702, 0xa438, 0x4116, ++ 0xa438, 0xce54, 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8f40, 0xa438, 0x9503, 0xa438, 0xa00a, ++ 0xa438, 0xd704, 0xa438, 0x4247, 0xa438, 0xd700, 0xa438, 0x3691, ++ 0xa438, 0x8326, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa570, ++ 0xa438, 0x9503, 0xa438, 0xf00a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xaf40, 0xa438, 0x9503, 0xa438, 0x800a, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xd704, 0xa438, 0x60f3, 0xa438, 0xd71f, ++ 0xa438, 0x618e, 0xa438, 0xd700, 0xa438, 0x5b5e, 0xa438, 0x1800, ++ 0xa438, 0x0deb, 0xa438, 0x800a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xae40, 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0c47, ++ 0xa438, 0x1800, 0xa438, 0x0df8, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8608, 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0e2b, ++ 0xa436, 0xA10E, 0xa438, 0x0d14, 0xa436, 0xA10C, 0xa438, 0x0ce8, ++ 0xa436, 0xA10A, 0xa438, 0x0279, 0xa436, 0xA108, 0xa438, 0x0b19, ++ 0xa436, 0xA106, 0xa438, 0x111f, 0xa436, 0xA104, 0xa438, 0x0a7b, ++ 0xa436, 0xA102, 0xa438, 0x0ba3, 0xa436, 0xA100, 0xa438, 0x0022, ++ 0xa436, 0xA110, 0xa438, 0x00ff, 0xa436, 0xb87c, 0xa438, 0x859b, ++ 0xa436, 0xb87e, 0xa438, 0xaf85, 0xa438, 0xb3af, 0xa438, 0x863b, ++ 0xa438, 0xaf86, 0xa438, 0x4caf, 0xa438, 0x8688, 0xa438, 0xaf86, ++ 0xa438, 0xceaf, 0xa438, 0x8744, 0xa438, 0xaf87, 0xa438, 0x68af, ++ 0xa438, 0x8781, 0xa438, 0xbf5e, 0xa438, 0x7202, 0xa438, 0x5f7e, ++ 0xa438, 0xac28, 0xa438, 0x68e1, 0xa438, 0x84e6, 0xa438, 0xad28, ++ 0xa438, 0x09bf, 0xa438, 0x5e75, 0xa438, 0x025f, 0xa438, 0x7eac, ++ 0xa438, 0x2d59, 0xa438, 0xe18f, 0xa438, 0xebad, 0xa438, 0x2809, ++ 0xa438, 0xbf5e, 0xa438, 0x7502, 0xa438, 0x5f7e, 0xa438, 0xac2e, ++ 0xa438, 0x50e1, 0xa438, 0x84e6, 0xa438, 0xac28, 0xa438, 0x08bf, ++ 0xa438, 0x873e, 0xa438, 0x025f, 0xa438, 0x3cae, 0xa438, 0x06bf, ++ 0xa438, 0x873e, 0xa438, 0x025f, 0xa438, 0x33bf, 0xa438, 0x8741, ++ 0xa438, 0x025f, 0xa438, 0x33ee, 0xa438, 0x8fea, 0xa438, 0x02e1, ++ 0xa438, 0x84e4, 0xa438, 0xad28, 0xa438, 0x14e1, 0xa438, 0x8fe8, ++ 0xa438, 0xad28, 0xa438, 0x17e1, 0xa438, 0x84e5, 0xa438, 0x11e5, ++ 0xa438, 0x84e5, 0xa438, 0xa10c, 0xa438, 0x04ee, 0xa438, 0x84e5, ++ 0xa438, 0x0002, 0xa438, 0x4977, 0xa438, 0xee84, 0xa438, 0xdc03, ++ 0xa438, 0xae1d, 0xa438, 0xe18f, 0xa438, 0xe811, 0xa438, 0xe58f, ++ 0xa438, 0xe8ae, 0xa438, 0x14bf, 0xa438, 0x873e, 0xa438, 0x025f, ++ 0xa438, 0x3cbf, 0xa438, 0x8741, 0xa438, 0x025f, 0xa438, 0x3cee, ++ 0xa438, 0x8fea, 0xa438, 0x01ee, 0xa438, 0x84e4, 0xa438, 0x00af, ++ 0xa438, 0x50c1, 0xa438, 0x1f00, 0xa438, 0xbf5a, 0xa438, 0x6102, ++ 0xa438, 0x5f5f, 0xa438, 0xbf5a, 0xa438, 0x5e02, 0xa438, 0x5f3c, ++ 0xa438, 0xaf45, 0xa438, 0x7be0, 0xa438, 0x8012, 0xa438, 0xad23, ++ 0xa438, 0x141f, 0xa438, 0x001f, 0xa438, 0x22d1, 0xa438, 0x00bf, ++ 0xa438, 0x3fcf, 0xa438, 0x0261, 0xa438, 0x3412, 0xa438, 0xa204, ++ 0xa438, 0xf6ee, 0xa438, 0x8317, 0xa438, 0x00e0, 0xa438, 0x8012, ++ 0xa438, 0xad24, 0xa438, 0x141f, 0xa438, 0x001f, 0xa438, 0x22d1, ++ 0xa438, 0x00bf, 0xa438, 0x3fd7, 0xa438, 0x0261, 0xa438, 0x3412, ++ 0xa438, 0xa204, 0xa438, 0xf6ee, 0xa438, 0x8317, 0xa438, 0x00ef, ++ 0xa438, 0x96fe, 0xa438, 0xfdfc, 0xa438, 0xaf42, 0xa438, 0x9802, ++ 0xa438, 0x56ec, 0xa438, 0xf70b, 0xa438, 0xac13, 0xa438, 0x0fbf, ++ 0xa438, 0x5e75, 
0xa438, 0x025f, 0xa438, 0x7eac, 0xa438, 0x280c, ++ 0xa438, 0xe2ff, 0xa438, 0xcfad, 0xa438, 0x32ee, 0xa438, 0x0257, ++ 0xa438, 0x05af, 0xa438, 0x00a4, 0xa438, 0x0286, 0xa438, 0xaaae, ++ 0xa438, 0xeff8, 0xa438, 0xf9ef, 0xa438, 0x5902, 0xa438, 0x1fe1, ++ 0xa438, 0xbf59, 0xa438, 0x4d02, 0xa438, 0x5f3c, 0xa438, 0xac13, ++ 0xa438, 0x09bf, 0xa438, 0x5e75, 0xa438, 0x025f, 0xa438, 0x7ea1, ++ 0xa438, 0x00f4, 0xa438, 0xbf59, 0xa438, 0x4d02, 0xa438, 0x5f33, ++ 0xa438, 0xef95, 0xa438, 0xfdfc, 0xa438, 0x04bf, 0xa438, 0x5e72, ++ 0xa438, 0x025f, 0xa438, 0x7eac, 0xa438, 0x284a, 0xa438, 0xe184, ++ 0xa438, 0xe6ad, 0xa438, 0x2809, 0xa438, 0xbf5e, 0xa438, 0x7502, ++ 0xa438, 0x5f7e, 0xa438, 0xac2d, 0xa438, 0x3be1, 0xa438, 0x8feb, ++ 0xa438, 0xad28, 0xa438, 0x09bf, 0xa438, 0x5e75, 0xa438, 0x025f, ++ 0xa438, 0x7eac, 0xa438, 0x2e32, 0xa438, 0xe184, 0xa438, 0xe6ac, ++ 0xa438, 0x2808, 0xa438, 0xbf87, 0xa438, 0x3e02, 0xa438, 0x5f3c, ++ 0xa438, 0xae06, 0xa438, 0xbf87, 0xa438, 0x3e02, 0xa438, 0x5f33, ++ 0xa438, 0xbf87, 0xa438, 0x4102, 0xa438, 0x5f33, 0xa438, 0xee8f, ++ 0xa438, 0xea04, 0xa438, 0xbf5e, 0xa438, 0x4e02, 0xa438, 0x5f7e, ++ 0xa438, 0xad28, 0xa438, 0x1f02, 0xa438, 0x4b12, 0xa438, 0xae1a, ++ 0xa438, 0xbf87, 0xa438, 0x3e02, 0xa438, 0x5f3c, 0xa438, 0xbf87, ++ 0xa438, 0x4102, 0xa438, 0x5f3c, 0xa438, 0xee8f, 0xa438, 0xea03, ++ 0xa438, 0xbf5e, 0xa438, 0x2a02, 0xa438, 0x5f33, 0xa438, 0xee84, ++ 0xa438, 0xe701, 0xa438, 0xaf4a, 0xa438, 0x7444, 0xa438, 0xac0e, ++ 0xa438, 0x55ac, 0xa438, 0x0ebf, 0xa438, 0x5e75, 0xa438, 0x025f, ++ 0xa438, 0x7ead, 0xa438, 0x2d0b, 0xa438, 0xbf5e, 0xa438, 0x36e1, ++ 0xa438, 0x8fe9, 0xa438, 0x025f, 0xa438, 0x5fae, 0xa438, 0x09bf, ++ 0xa438, 0x5e36, 0xa438, 0xe184, 0xa438, 0xe102, 0xa438, 0x5f5f, ++ 0xa438, 0xee8f, 0xa438, 0xe800, 0xa438, 0xaf49, 0xa438, 0xcdbf, ++ 0xa438, 0x595c, 0xa438, 0x025f, 0xa438, 0x7ea1, 0xa438, 0x0203, ++ 0xa438, 0xaf87, 0xa438, 0x79d1, 0xa438, 0x00af, 0xa438, 0x877c, ++ 0xa438, 0xe181, 0xa438, 0x941f, 0xa438, 0x00af, 0xa438, 0x3ff7, ++ 0xa438, 0xac4e, 0xa438, 0x06ac, 0xa438, 0x4003, 0xa438, 0xaf24, ++ 0xa438, 0x97af, 0xa438, 0x2467, 0xa436, 0xb85e, 0xa438, 0x5082, ++ 0xa436, 0xb860, 0xa438, 0x4575, 0xa436, 0xb862, 0xa438, 0x425F, ++ 0xa436, 0xb864, 0xa438, 0x0096, 0xa436, 0xb886, 0xa438, 0x4A44, ++ 0xa436, 0xb888, 0xa438, 0x49c4, 0xa436, 0xb88a, 0xa438, 0x3FF2, ++ 0xa436, 0xb88c, 0xa438, 0x245C, 0xa436, 0xb838, 0xa438, 0x00ff, ++ 0xb820, 0x0010, 0xa436, 0x843d, 0xa438, 0xaf84, 0xa438, 0xa6af, ++ 0xa438, 0x8540, 0xa438, 0xaf85, 0xa438, 0xaeaf, 0xa438, 0x85b5, ++ 0xa438, 0xaf87, 0xa438, 0x7daf, 0xa438, 0x8784, 0xa438, 0xaf87, ++ 0xa438, 0x87af, 0xa438, 0x87e5, 0xa438, 0x0066, 0xa438, 0x0a03, ++ 0xa438, 0x6607, 0xa438, 0x2666, 0xa438, 0x1c00, 0xa438, 0x660d, ++ 0xa438, 0x0166, 0xa438, 0x1004, 0xa438, 0x6616, 0xa438, 0x0566, ++ 0xa438, 0x1f06, 0xa438, 0x6a5d, 0xa438, 0x2766, 0xa438, 0x1900, ++ 0xa438, 0x6625, 0xa438, 0x2466, 0xa438, 0x2820, 0xa438, 0x662b, ++ 0xa438, 0x2466, 0xa438, 0x4600, 0xa438, 0x664c, 0xa438, 0x0166, ++ 0xa438, 0x4902, 0xa438, 0x8861, 0xa438, 0x0388, 0xa438, 0x5e05, ++ 0xa438, 0x886d, 0xa438, 0x0588, 0xa438, 0x7005, 0xa438, 0x8873, ++ 0xa438, 0x0588, 0xa438, 0x7605, 0xa438, 0x8879, 0xa438, 0x0588, ++ 0xa438, 0x7c05, 0xa438, 0x887f, 0xa438, 0x0588, 0xa438, 0x8205, ++ 0xa438, 0x8885, 0xa438, 0x0588, 0xa438, 0x881e, 0xa438, 0x13ad, ++ 0xa438, 0x2841, 0xa438, 0xbf64, 0xa438, 0xf102, 0xa438, 0x6b9d, ++ 0xa438, 0xad28, 0xa438, 0x03af, 0xa438, 0x15fc, 0xa438, 0xbf65, ++ 0xa438, 0xcb02, 0xa438, 0x6b9d, 0xa438, 0x0d11, 0xa438, 0xf62f, ++ 0xa438, 0xef31, 
0xa438, 0xd202, 0xa438, 0xbf88, 0xa438, 0x6402, ++ 0xa438, 0x6b52, 0xa438, 0xe082, 0xa438, 0x020d, 0xa438, 0x01f6, ++ 0xa438, 0x271b, 0xa438, 0x03aa, 0xa438, 0x0182, 0xa438, 0xe082, ++ 0xa438, 0x010d, 0xa438, 0x01f6, 0xa438, 0x271b, 0xa438, 0x03aa, ++ 0xa438, 0x0782, 0xa438, 0xbf88, 0xa438, 0x6402, 0xa438, 0x6b5b, ++ 0xa438, 0xaf15, 0xa438, 0xf9bf, 0xa438, 0x65cb, 0xa438, 0x026b, ++ 0xa438, 0x9d0d, 0xa438, 0x11f6, 0xa438, 0x2fef, 0xa438, 0x31e0, ++ 0xa438, 0x8ff7, 0xa438, 0x0d01, 0xa438, 0xf627, 0xa438, 0x1b03, ++ 0xa438, 0xaa20, 0xa438, 0xe18f, 0xa438, 0xf4d0, 0xa438, 0x00bf, ++ 0xa438, 0x6587, 0xa438, 0x026b, 0xa438, 0x7ee1, 0xa438, 0x8ff5, ++ 0xa438, 0xbf65, 0xa438, 0x8a02, 0xa438, 0x6b7e, 0xa438, 0xe18f, ++ 0xa438, 0xf6bf, 0xa438, 0x6584, 0xa438, 0x026b, 0xa438, 0x7eaf, ++ 0xa438, 0x15fc, 0xa438, 0xe18f, 0xa438, 0xf1d0, 0xa438, 0x00bf, ++ 0xa438, 0x6587, 0xa438, 0x026b, 0xa438, 0x7ee1, 0xa438, 0x8ff2, ++ 0xa438, 0xbf65, 0xa438, 0x8a02, 0xa438, 0x6b7e, 0xa438, 0xe18f, ++ 0xa438, 0xf3bf, 0xa438, 0x6584, 0xa438, 0xaf15, 0xa438, 0xfcd1, ++ 0xa438, 0x07bf, 0xa438, 0x65ce, 0xa438, 0x026b, 0xa438, 0x7ed1, ++ 0xa438, 0x0cbf, 0xa438, 0x65d1, 0xa438, 0x026b, 0xa438, 0x7ed1, ++ 0xa438, 0x03bf, 0xa438, 0x885e, 0xa438, 0x026b, 0xa438, 0x7ed1, ++ 0xa438, 0x05bf, 0xa438, 0x8867, 0xa438, 0x026b, 0xa438, 0x7ed1, ++ 0xa438, 0x07bf, 0xa438, 0x886a, 0xa438, 0x026b, 0xa438, 0x7ebf, ++ 0xa438, 0x6a6c, 0xa438, 0x026b, 0xa438, 0x5b02, 0xa438, 0x62b5, ++ 0xa438, 0xbf6a, 0xa438, 0x0002, 0xa438, 0x6b5b, 0xa438, 0xbf64, ++ 0xa438, 0x4e02, 0xa438, 0x6b9d, 0xa438, 0xac28, 0xa438, 0x0bbf, ++ 0xa438, 0x6412, 0xa438, 0x026b, 0xa438, 0x9da1, 0xa438, 0x0502, ++ 0xa438, 0xaeec, 0xa438, 0xd104, 0xa438, 0xbf65, 0xa438, 0xce02, ++ 0xa438, 0x6b7e, 0xa438, 0xd104, 0xa438, 0xbf65, 0xa438, 0xd102, ++ 0xa438, 0x6b7e, 0xa438, 0xd102, 0xa438, 0xbf88, 0xa438, 0x6702, ++ 0xa438, 0x6b7e, 0xa438, 0xd104, 0xa438, 0xbf88, 0xa438, 0x6a02, ++ 0xa438, 0x6b7e, 0xa438, 0xaf62, 0xa438, 0x72f6, 0xa438, 0x0af6, ++ 0xa438, 0x09af, 0xa438, 0x34e3, 0xa438, 0x0285, 0xa438, 0xbe02, ++ 0xa438, 0x106c, 0xa438, 0xaf10, 0xa438, 0x6bf8, 0xa438, 0xfaef, ++ 0xa438, 0x69e0, 0xa438, 0x804c, 0xa438, 0xac25, 0xa438, 0x17e0, ++ 0xa438, 0x8040, 0xa438, 0xad25, 0xa438, 0x1a02, 0xa438, 0x85ed, ++ 0xa438, 0xe080, 0xa438, 0x40ac, 0xa438, 0x2511, 0xa438, 0xbf87, ++ 0xa438, 0x6502, 0xa438, 0x6b5b, 0xa438, 0xae09, 0xa438, 0x0287, ++ 0xa438, 0x2402, 0xa438, 0x875a, 0xa438, 0x0287, 0xa438, 0x4fef, ++ 0xa438, 0x96fe, 0xa438, 0xfc04, 0xa438, 0xf8e0, 0xa438, 0x8019, ++ 0xa438, 0xad20, 0xa438, 0x11e0, 0xa438, 0x8fe3, 0xa438, 0xac20, ++ 0xa438, 0x0502, 0xa438, 0x860a, 0xa438, 0xae03, 0xa438, 0x0286, ++ 0xa438, 0x7802, 0xa438, 0x86c1, 0xa438, 0x0287, 0xa438, 0x4ffc, ++ 0xa438, 0x04f8, 0xa438, 0xf9ef, 0xa438, 0x79fb, 0xa438, 0xbf87, ++ 0xa438, 0x6802, 0xa438, 0x6b9d, 0xa438, 0x5c20, 0xa438, 0x000d, ++ 0xa438, 0x4da1, 0xa438, 0x0151, 0xa438, 0xbf87, 0xa438, 0x6802, ++ 0xa438, 0x6b9d, 0xa438, 0x5c07, 0xa438, 0xffe3, 0xa438, 0x8fe4, ++ 0xa438, 0x1b31, 0xa438, 0x9f41, 0xa438, 0x0d48, 0xa438, 0xe38f, ++ 0xa438, 0xe51b, 0xa438, 0x319f, 0xa438, 0x38bf, 0xa438, 0x876b, ++ 0xa438, 0x026b, 0xa438, 0x9d5c, 0xa438, 0x07ff, 0xa438, 0xe38f, ++ 0xa438, 0xe61b, 0xa438, 0x319f, 0xa438, 0x280d, 0xa438, 0x48e3, ++ 0xa438, 0x8fe7, 0xa438, 0x1b31, 0xa438, 0x9f1f, 0xa438, 0xbf87, ++ 0xa438, 0x6e02, 0xa438, 0x6b9d, 0xa438, 0x5c07, 0xa438, 0xffe3, ++ 0xa438, 0x8fe8, 0xa438, 0x1b31, 0xa438, 0x9f0f, 0xa438, 0x0d48, ++ 0xa438, 0xe38f, 0xa438, 0xe91b, 0xa438, 0x319f, 0xa438, 0x06ee, ++ 0xa438, 0x8fe3, 
0xa438, 0x01ae, 0xa438, 0x04ee, 0xa438, 0x8fe3, ++ 0xa438, 0x00ff, 0xa438, 0xef97, 0xa438, 0xfdfc, 0xa438, 0x04f8, ++ 0xa438, 0xf9ef, 0xa438, 0x79fb, 0xa438, 0xbf87, 0xa438, 0x6802, ++ 0xa438, 0x6b9d, 0xa438, 0x5c20, 0xa438, 0x000d, 0xa438, 0x4da1, ++ 0xa438, 0x0020, 0xa438, 0xbf87, 0xa438, 0x6802, 0xa438, 0x6b9d, ++ 0xa438, 0x5c06, 0xa438, 0x000d, 0xa438, 0x49e3, 0xa438, 0x8fea, ++ 0xa438, 0x1b31, 0xa438, 0x9f0e, 0xa438, 0xbf87, 0xa438, 0x7102, ++ 0xa438, 0x6b5b, 0xa438, 0xbf87, 0xa438, 0x7702, 0xa438, 0x6b5b, ++ 0xa438, 0xae0c, 0xa438, 0xbf87, 0xa438, 0x7102, 0xa438, 0x6b52, ++ 0xa438, 0xbf87, 0xa438, 0x7702, 0xa438, 0x6b52, 0xa438, 0xee8f, ++ 0xa438, 0xe300, 0xa438, 0xffef, 0xa438, 0x97fd, 0xa438, 0xfc04, ++ 0xa438, 0xf8f9, 0xa438, 0xef79, 0xa438, 0xfbbf, 0xa438, 0x8768, ++ 0xa438, 0x026b, 0xa438, 0x9d5c, 0xa438, 0x2000, 0xa438, 0x0d4d, ++ 0xa438, 0xa101, 0xa438, 0x4abf, 0xa438, 0x8768, 0xa438, 0x026b, ++ 0xa438, 0x9d5c, 0xa438, 0x07ff, 0xa438, 0xe38f, 0xa438, 0xeb1b, ++ 0xa438, 0x319f, 0xa438, 0x3a0d, 0xa438, 0x48e3, 0xa438, 0x8fec, ++ 0xa438, 0x1b31, 0xa438, 0x9f31, 0xa438, 0xbf87, 0xa438, 0x6b02, ++ 0xa438, 0x6b9d, 0xa438, 0xe38f, 0xa438, 0xed1b, 0xa438, 0x319f, ++ 0xa438, 0x240d, 0xa438, 0x48e3, 0xa438, 0x8fee, 0xa438, 0x1b31, ++ 0xa438, 0x9f1b, 0xa438, 0xbf87, 0xa438, 0x6e02, 0xa438, 0x6b9d, ++ 0xa438, 0xe38f, 0xa438, 0xef1b, 0xa438, 0x319f, 0xa438, 0x0ebf, ++ 0xa438, 0x8774, 0xa438, 0x026b, 0xa438, 0x5bbf, 0xa438, 0x877a, ++ 0xa438, 0x026b, 0xa438, 0x5bae, 0xa438, 0x00ff, 0xa438, 0xef97, ++ 0xa438, 0xfdfc, 0xa438, 0x04f8, 0xa438, 0xef79, 0xa438, 0xfbe0, ++ 0xa438, 0x8019, 0xa438, 0xad20, 0xa438, 0x1cee, 0xa438, 0x8fe3, ++ 0xa438, 0x00bf, 0xa438, 0x8771, 0xa438, 0x026b, 0xa438, 0x52bf, ++ 0xa438, 0x8777, 0xa438, 0x026b, 0xa438, 0x52bf, 0xa438, 0x8774, ++ 0xa438, 0x026b, 0xa438, 0x52bf, 0xa438, 0x877a, 0xa438, 0x026b, ++ 0xa438, 0x52ff, 0xa438, 0xef97, 0xa438, 0xfc04, 0xa438, 0xf8e0, ++ 0xa438, 0x8040, 0xa438, 0xf625, 0xa438, 0xe480, 0xa438, 0x40fc, ++ 0xa438, 0x04f8, 0xa438, 0xe080, 0xa438, 0x4cf6, 0xa438, 0x25e4, ++ 0xa438, 0x804c, 0xa438, 0xfc04, 0xa438, 0x55a4, 0xa438, 0xbaf0, ++ 0xa438, 0xa64a, 0xa438, 0xf0a6, 0xa438, 0x4cf0, 0xa438, 0xa64e, ++ 0xa438, 0x66a4, 0xa438, 0xb655, 0xa438, 0xa4b6, 0xa438, 0x00ac, ++ 0xa438, 0x0e66, 0xa438, 0xac0e, 0xa438, 0xee80, 0xa438, 0x4c3a, ++ 0xa438, 0xaf07, 0xa438, 0xd0af, 0xa438, 0x26d0, 0xa438, 0xa201, ++ 0xa438, 0x0ebf, 0xa438, 0x663d, 0xa438, 0x026b, 0xa438, 0x52bf, ++ 0xa438, 0x6643, 0xa438, 0x026b, 0xa438, 0x52ae, 0xa438, 0x11bf, ++ 0xa438, 0x6643, 0xa438, 0x026b, 0xa438, 0x5bd4, 0xa438, 0x0054, ++ 0xa438, 0xb4fe, 0xa438, 0xbf66, 0xa438, 0x3d02, 0xa438, 0x6b5b, ++ 0xa438, 0xd300, 0xa438, 0x020d, 0xa438, 0xf6a2, 0xa438, 0x0405, ++ 0xa438, 0xe081, 0xa438, 0x47ae, 0xa438, 0x03e0, 0xa438, 0x8148, ++ 0xa438, 0xac23, 0xa438, 0x02ae, 0xa438, 0x0268, 0xa438, 0xf01a, ++ 0xa438, 0x10ad, 0xa438, 0x2f04, 0xa438, 0xd100, 0xa438, 0xae05, ++ 0xa438, 0xad2c, 0xa438, 0x02d1, 0xa438, 0x0f1f, 0xa438, 0x00a2, ++ 0xa438, 0x0407, 0xa438, 0x3908, 0xa438, 0xad2f, 0xa438, 0x02d1, ++ 0xa438, 0x0002, 0xa438, 0x0e1c, 0xa438, 0x2b01, 0xa438, 0xad3a, ++ 0xa438, 0xc9af, 0xa438, 0x0dee, 0xa438, 0xa000, 0xa438, 0x2702, ++ 0xa438, 0x1beb, 0xa438, 0xe18f, 0xa438, 0xe1ac, 0xa438, 0x2819, ++ 0xa438, 0xee8f, 0xa438, 0xe101, 0xa438, 0x1f44, 0xa438, 0xbf65, ++ 0xa438, 0x9302, 0xa438, 0x6b9d, 0xa438, 0xe58f, 0xa438, 0xe21f, ++ 0xa438, 0x44d1, 0xa438, 0x02bf, 0xa438, 0x6593, 0xa438, 0x026b, ++ 0xa438, 0x7ee0, 0xa438, 0x82b1, 0xa438, 0xae49, 0xa438, 0xa001, ++ 0xa438, 0x0502, 
0xa438, 0x1c4d, 0xa438, 0xae41, 0xa438, 0xa002, ++ 0xa438, 0x0502, 0xa438, 0x1c90, 0xa438, 0xae39, 0xa438, 0xa003, ++ 0xa438, 0x0502, 0xa438, 0x1c9d, 0xa438, 0xae31, 0xa438, 0xa004, ++ 0xa438, 0x0502, 0xa438, 0x1cbc, 0xa438, 0xae29, 0xa438, 0xa005, ++ 0xa438, 0x1e02, 0xa438, 0x1cc9, 0xa438, 0xe080, 0xa438, 0xdfac, ++ 0xa438, 0x2013, 0xa438, 0xac21, 0xa438, 0x10ac, 0xa438, 0x220d, ++ 0xa438, 0xe18f, 0xa438, 0xe2bf, 0xa438, 0x6593, 0xa438, 0x026b, ++ 0xa438, 0x7eee, 0xa438, 0x8fe1, 0xa438, 0x00ae, 0xa438, 0x08a0, ++ 0xa438, 0x0605, 0xa438, 0x021d, 0xa438, 0x07ae, 0xa438, 0x00e0, ++ 0xa438, 0x82b1, 0xa438, 0xaf1b, 0xa438, 0xe910, 0xa438, 0xbf4a, ++ 0xa438, 0x99bf, 0xa438, 0x4a00, 0xa438, 0xa86a, 0xa438, 0xfdad, ++ 0xa438, 0x5eca, 0xa438, 0xad5e, 0xa438, 0x88bd, 0xa438, 0x2c99, ++ 0xa438, 0xbd2c, 0xa438, 0x33bd, 0xa438, 0x3222, 0xa438, 0xbd32, ++ 0xa438, 0x11bd, 0xa438, 0x3200, 0xa438, 0xbd32, 0xa438, 0x77bd, ++ 0xa438, 0x3266, 0xa438, 0xbd32, 0xa438, 0x55bd, 0xa438, 0x3244, ++ 0xa438, 0xbd32, 0xa436, 0xb818, 0xa438, 0x15c5, 0xa436, 0xb81a, ++ 0xa438, 0x6255, 0xa436, 0xb81c, 0xa438, 0x34e1, 0xa436, 0xb81e, ++ 0xa438, 0x1068, 0xa436, 0xb850, 0xa438, 0x07cc, 0xa436, 0xb852, ++ 0xa438, 0x26ca, 0xa436, 0xb878, 0xa438, 0x0dbf, 0xa436, 0xb884, ++ 0xa438, 0x1BB1, 0xa436, 0xb832, 0xa438, 0x00ff, 0xa436, 0x0000, ++ 0xa438, 0x0000, 0xB82E, 0x0000, 0xa436, 0x8023, 0xa438, 0x0000, ++ 0xa436, 0x801E, 0xa438, 0x0031, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_1_2[] = { ++ 0xb892, 0x0000, 0xB88E, 0xC28F, 0xB890, 0x252D, 0xB88E, 0xC290, ++ 0xB890, 0xC924, 0xB88E, 0xC291, 0xB890, 0xC92E, 0xB88E, 0xC292, ++ 0xB890, 0xF626, 0xB88E, 0xC293, 0xB890, 0xF630, 0xB88E, 0xC294, ++ 0xB890, 0xA328, 0xB88E, 0xC295, 0xB890, 0xA332, 0xB88E, 0xC296, ++ 0xB890, 0xD72B, 0xB88E, 0xC297, 0xB890, 0xD735, 0xB88E, 0xC298, ++ 0xB890, 0x8A2E, 0xB88E, 0xC299, 0xB890, 0x8A38, 0xB88E, 0xC29A, ++ 0xB890, 0xBE32, 0xB88E, 0xC29B, 0xB890, 0xBE3C, 0xB88E, 0xC29C, ++ 0xB890, 0x7436, 0xB88E, 0xC29D, 0xB890, 0x7440, 0xB88E, 0xC29E, ++ 0xB890, 0xAD3B, 0xB88E, 0xC29F, 0xB890, 0xAD45, 0xB88E, 0xC2A0, ++ 0xB890, 0x6640, 0xB88E, 0xC2A1, 0xB890, 0x664A, 0xB88E, 0xC2A2, ++ 0xB890, 0xA646, 0xB88E, 0xC2A3, 0xB890, 0xA650, 0xB88E, 0xC2A4, ++ 0xB890, 0x624C, 0xB88E, 0xC2A5, 0xB890, 0x6256, 0xB88E, 0xC2A6, ++ 0xB890, 0xA453, 0xB88E, 0xC2A7, 0xB890, 0xA45D, 0xB88E, 0xC2A8, ++ 0xB890, 0x665A, 0xB88E, 0xC2A9, 0xB890, 0x6664, 0xB88E, 0xC2AA, ++ 0xB890, 0xAC62, 0xB88E, 0xC2AB, 0xB890, 0xAC6C, 0xB88E, 0xC2AC, ++ 0xB890, 0x746A, 0xB88E, 0xC2AD, 0xB890, 0x7474, 0xB88E, 0xC2AE, ++ 0xB890, 0xBCFA, 0xB88E, 0xC2AF, 0xB890, 0xBCFD, 0xB88E, 0xC2B0, ++ 0xB890, 0x79FF, 0xB88E, 0xC2B1, 0xB890, 0x7901, 0xB88E, 0xC2B2, ++ 0xB890, 0xF703, 0xB88E, 0xC2B3, 0xB890, 0xF706, 0xB88E, 0xC2B4, ++ 0xB890, 0x7408, 0xB88E, 0xC2B5, 0xB890, 0x740A, 0xB88E, 0xC2B6, ++ 0xB890, 0xF10C, 0xB88E, 0xC2B7, 0xB890, 0xF10F, 0xB88E, 0xC2B8, ++ 0xB890, 0x6F10, 0xB88E, 0xC2B9, 0xB890, 0x6F13, 0xB88E, 0xC2BA, ++ 0xB890, 0xEC15, 0xB88E, 0xC2BB, 0xB890, 0xEC18, 0xB88E, 0xC2BC, ++ 0xB890, 0x6A1A, 0xB88E, 0xC2BD, 0xB890, 0x6A1C, 0xB88E, 0xC2BE, ++ 0xB890, 0xE71E, 0xB88E, 0xC2BF, 0xB890, 0xE721, 0xB88E, 0xC2C0, ++ 0xB890, 0x6424, 0xB88E, 0xC2C1, 0xB890, 0x6425, 0xB88E, 0xC2C2, ++ 0xB890, 0xE228, 0xB88E, 0xC2C3, 0xB890, 0xE22A, 0xB88E, 0xC2C4, ++ 0xB890, 0x5F2B, 0xB88E, 0xC2C5, 0xB890, 0x5F2E, 0xB88E, 0xC2C6, ++ 0xB890, 0xDC31, 0xB88E, 0xC2C7, 0xB890, 0xDC33, 0xB88E, 0xC2C8, ++ 0xB890, 0x2035, 0xB88E, 0xC2C9, 0xB890, 0x2036, 0xB88E, 0xC2CA, ++ 0xB890, 0x9F3A, 0xB88E, 
0xC2CB, 0xB890, 0x9F3A, 0xB88E, 0xC2CC, ++ 0xB890, 0x4430, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_1_3[] = { ++ 0xa436, 0xacca, 0xa438, 0x0104, 0xa436, 0xaccc, 0xa438, 0x8000, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xfd47, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xe56f, 0xa436, 0xacd0, 0xa438, 0x01c0, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xed97, 0xa436, 0xacd0, 0xa438, 0x01c8, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xf5bf, 0xa436, 0xacd0, 0xa438, 0x01d0, ++ 0xa436, 0xacce, 0xa438, 0xfb07, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb0f, 0xa436, 0xacd0, 0xa438, 0x01d8, ++ 0xa436, 0xacce, 0xa438, 0xa087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0xa00f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0xa807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0xa88f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0xb027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0xb02f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0xb847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0xb84f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0xfb17, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb1f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xa017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0xa01f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0xa837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0xa83f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0xb097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0xb05f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0xb857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0xb89f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0xfb27, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb2f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x8087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x800f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x8807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x888f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x9027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x902f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x9847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x984f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0xa0a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0xa8af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0xa067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0xa86f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb37, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb3f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x8017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x801f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x8837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x883f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x9097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x905f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 
0xa436, 0xacce, 0xa438, 0x9857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x989f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0xb0b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0xb8bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0xb077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0xb87f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb47, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb4f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x600f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x6807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x688f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x7027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x702f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x7847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x784f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0x80a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x88af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x8067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x886f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb57, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb5f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x601f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x6837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x683f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x7097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x705f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x7857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x789f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x90b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x98bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x9077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x987f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb67, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb6f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x4087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x400f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x4807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x488f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x5027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x502f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x5847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x584f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0x60a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x68af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x6067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x686f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb77, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb7f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x4017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x401f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x4837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 
0xa436, 0xacce, 0xa438, 0x483f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x5097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x505f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x5857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x589f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x70b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x78bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x7077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x787f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb87, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb8f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x40a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x48af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x4067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x486f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb97, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb9f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x50b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x58bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x5077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x587f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfba7, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfbaf, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x2067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x286f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfbb7, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfbbf, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x3077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x387f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfff9, 0xa436, 0xacd0, 0xa438, 0x17ff, ++ 0xa436, 0xacce, 0xa438, 0xfff9, 0xa436, 0xacd0, 0xa438, 0x17ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xfff8, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb47, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb4f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x600f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x6807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x688f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x7027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x702f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x7847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x784f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 
0xa436, 0xacce, 0xa438, 0x80a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x88af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x8067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x886f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb57, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb5f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x601f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x6837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x683f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x7097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x705f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x7857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x789f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x90b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x98bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x9077, 0xa436, 0xacd0, 0xa438, 0x1171, ++ 0xa436, 0xacce, 0xa438, 0x987f, 0xa436, 0xacd0, 0xa438, 0x1179, ++ 0xa436, 0xacca, 0xa438, 0x0004, 0xa436, 0xacc6, 0xa438, 0x0008, ++ 0xa436, 0xacc8, 0xa438, 0xc000, 0xa436, 0xacc6, 0xa438, 0x0015, ++ 0xa436, 0xacc8, 0xa438, 0xc043, 0xa436, 0xacc8, 0xa438, 0x0000, ++ 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_1_efuse[] = { ++ 0xB87C, 0x8014, 0xB87E, 0x90C0, 0xa436, 0x8023, 0xa438, 0x3800, ++ 0xa436, 0xB82E, 0xa438, 0x0001, 0xb820, 0x0010, 0xa436, 0x843d, ++ 0xa438, 0xaf84, 0xa438, 0x55af, 0xa438, 0x8458, 0xa438, 0xaf84, ++ 0xa438, 0x58af, 0xa438, 0x8458, 0xa438, 0xaf84, 0xa438, 0x58af, ++ 0xa438, 0x8458, 0xa438, 0xaf84, 0xa438, 0x58af, 0xa438, 0x8458, ++ 0xa438, 0xaf26, 0xa438, 0xd000, 0xa436, 0xb818, 0xa438, 0x26ca, ++ 0xa436, 0xb81a, 0xa438, 0xffff, 0xa436, 0xb81c, 0xa438, 0xffff, ++ 0xa436, 0xb81e, 0xa438, 0xffff, 0xa436, 0xb850, 0xa438, 0xffff, ++ 0xa436, 0xb852, 0xa438, 0xffff, 0xa436, 0xb878, 0xa438, 0xffff, ++ 0xa436, 0xb884, 0xa438, 0xffff, 0xa436, 0xb832, 0xa438, 0x0001, ++ 0xa436, 0x0000, 0xa438, 0x0000, 0xB82E, 0x0000, 0xa436, 0x8023, ++ 0xa438, 0x0000, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_2_1[] = { ++ 0xa436, 0x8023, 0xa438, 0x3801, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x808e, 0xa438, 0x1800, 0xa438, 0x80d6, ++ 0xa438, 0x1800, 0xa438, 0x81e2, 0xa438, 0x1800, 0xa438, 0x81e2, ++ 0xa438, 0x1800, 0xa438, 0x81e2, 0xa438, 0x1800, 0xa438, 0x81e2, ++ 0xa438, 0x1800, 0xa438, 0x81e2, 0xa438, 0xd500, 0xa438, 0xc48d, ++ 0xa438, 0xd504, 0xa438, 0x8d03, 0xa438, 0xd701, 0xa438, 0x4045, ++ 0xa438, 0xad02, 0xa438, 0xd504, 0xa438, 0xd706, 0xa438, 0x2529, ++ 0xa438, 0x8021, 0xa438, 0xd718, 0xa438, 0x607b, 0xa438, 0x40da, ++ 0xa438, 0xf019, 0xa438, 0x459a, 0xa438, 0xf03f, 0xa438, 0xd718, ++ 0xa438, 0x62bb, 0xa438, 0xbb01, 0xa438, 0xd75e, 0xa438, 0x6231, ++ 0xa438, 0x0cf0, 0xa438, 0x0c10, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0xd70c, 0xa438, 0x6147, 0xa438, 0x8480, 0xa438, 0x8440, ++ 0xa438, 0x8420, 0xa438, 0xa410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0808, 0xa438, 0xf002, 0xa438, 0xa4f0, ++ 0xa438, 0xf03c, 0xa438, 0xbb02, 0xa438, 0xd75e, 0xa438, 0x6231, ++ 0xa438, 0x0cf0, 0xa438, 0x0c20, 0xa438, 0xd501, 0xa438, 0xce01, 
++ 0xa438, 0xd70c, 0xa438, 0x6147, 0xa438, 0x8480, 0xa438, 0x8440, ++ 0xa438, 0xa420, 0xa438, 0x8410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0804, 0xa438, 0xf002, 0xa438, 0xa4f0, ++ 0xa438, 0xf028, 0xa438, 0xbb04, 0xa438, 0xd75e, 0xa438, 0x6231, ++ 0xa438, 0x0cf0, 0xa438, 0x0c40, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0xd70c, 0xa438, 0x6147, 0xa438, 0x8480, 0xa438, 0xa440, ++ 0xa438, 0x8420, 0xa438, 0x8410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0802, 0xa438, 0xf002, 0xa438, 0xa4f0, ++ 0xa438, 0xf014, 0xa438, 0xbb08, 0xa438, 0xd75e, 0xa438, 0x6231, ++ 0xa438, 0x0cf0, 0xa438, 0x0c80, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0xd70c, 0xa438, 0x6147, 0xa438, 0xa480, 0xa438, 0x8440, ++ 0xa438, 0x8420, 0xa438, 0x8410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0801, 0xa438, 0xf002, 0xa438, 0xa4f0, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0x1000, 0xa438, 0x1829, 0xa438, 0xd73e, 0xa438, 0x6074, ++ 0xa438, 0xd718, 0xa438, 0x5f2d, 0xa438, 0x1000, 0xa438, 0x81b7, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0x1000, 0xa438, 0x1829, ++ 0xa438, 0xd73e, 0xa438, 0x7f74, 0xa438, 0x1000, 0xa438, 0x81ce, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0x1000, 0xa438, 0x1829, ++ 0xa438, 0xd718, 0xa438, 0x5f6d, 0xa438, 0x1800, 0xa438, 0x1660, ++ 0xa438, 0xd75e, 0xa438, 0x68b1, 0xa438, 0xd504, 0xa438, 0xd71e, ++ 0xa438, 0x667b, 0xa438, 0x645a, 0xa438, 0x6239, 0xa438, 0x0cf0, ++ 0xa438, 0x0c10, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0808, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xd70c, 0xa438, 0x60c7, ++ 0xa438, 0x8480, 0xa438, 0x8440, 0xa438, 0x8420, 0xa438, 0xa410, ++ 0xa438, 0xf032, 0xa438, 0xa4f0, 0xa438, 0xf030, 0xa438, 0x0cf0, ++ 0xa438, 0x0c20, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0804, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xd70c, 0xa438, 0x60c7, ++ 0xa438, 0x8480, 0xa438, 0x8440, 0xa438, 0xa420, 0xa438, 0x8410, ++ 0xa438, 0xf022, 0xa438, 0xa4f0, 0xa438, 0xf020, 0xa438, 0x0cf0, ++ 0xa438, 0x0c40, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0802, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xd70c, 0xa438, 0x60c7, ++ 0xa438, 0x8480, 0xa438, 0xa440, 0xa438, 0x8420, 0xa438, 0x8410, ++ 0xa438, 0xf012, 0xa438, 0xa4f0, 0xa438, 0xf010, 0xa438, 0x0cf0, ++ 0xa438, 0x0c80, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0801, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xd70c, 0xa438, 0x60c7, ++ 0xa438, 0xa480, 0xa438, 0x8440, 0xa438, 0x8420, 0xa438, 0x8410, ++ 0xa438, 0xf002, 0xa438, 0xa4f0, 0xa438, 0x1800, 0xa438, 0x168c, ++ 0xa438, 0xd500, 0xa438, 0xd706, 0xa438, 0x2529, 0xa438, 0x80e0, ++ 0xa438, 0xd718, 0xa438, 0x607b, 0xa438, 0x40da, 0xa438, 0xf00f, ++ 0xa438, 0x431a, 0xa438, 0xf021, 0xa438, 0xd718, 0xa438, 0x617b, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0x1000, 0xa438, 0x1b1a, ++ 0xa438, 0xd718, 0xa438, 0x608e, 0xa438, 0xd73e, 0xa438, 0x5f34, ++ 0xa438, 0xf020, 0xa438, 0xf053, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0x1000, 0xa438, 0x1b1a, 0xa438, 0xd718, 0xa438, 0x608e, ++ 0xa438, 0xd73e, 0xa438, 0x5f34, 0xa438, 0xf023, 0xa438, 0xf067, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0x1000, 0xa438, 0x1b1a, ++ 0xa438, 0xd718, 0xa438, 0x608e, 0xa438, 0xd73e, 0xa438, 0x5f34, ++ 0xa438, 0xf026, 0xa438, 0xf07b, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0x1000, 0xa438, 0x1b1a, 0xa438, 0xd718, 0xa438, 0x608e, ++ 0xa438, 0xd73e, 0xa438, 0x5f34, 0xa438, 0xf029, 0xa438, 0xf08f, ++ 0xa438, 0x1000, 0xa438, 0x81b7, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0xd73e, 0xa438, 0x7fb4, 0xa438, 0x1000, 0xa438, 0x81ce, ++ 
0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd718, 0xa438, 0x5fae, ++ 0xa438, 0xf028, 0xa438, 0x1000, 0xa438, 0x81b7, 0xa438, 0x1000, ++ 0xa438, 0x1a8a, 0xa438, 0xd73e, 0xa438, 0x7fb4, 0xa438, 0x1000, ++ 0xa438, 0x81ce, 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd718, ++ 0xa438, 0x5fae, 0xa438, 0xf039, 0xa438, 0x1000, 0xa438, 0x81b7, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd73e, 0xa438, 0x7fb4, ++ 0xa438, 0x1000, 0xa438, 0x81ce, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0xd718, 0xa438, 0x5fae, 0xa438, 0xf04a, 0xa438, 0x1000, ++ 0xa438, 0x81b7, 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd73e, ++ 0xa438, 0x7fb4, 0xa438, 0x1000, 0xa438, 0x81ce, 0xa438, 0x1000, ++ 0xa438, 0x1a8a, 0xa438, 0xd718, 0xa438, 0x5fae, 0xa438, 0xf05b, ++ 0xa438, 0xd719, 0xa438, 0x4119, 0xa438, 0xd504, 0xa438, 0xac01, ++ 0xa438, 0xae01, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a78, ++ 0xa438, 0xf00a, 0xa438, 0xd719, 0xa438, 0x4118, 0xa438, 0xd504, ++ 0xa438, 0xac11, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xa410, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0xd718, 0xa438, 0x5fb0, 0xa438, 0xd505, 0xa438, 0xd719, ++ 0xa438, 0x4079, 0xa438, 0xa80f, 0xa438, 0xf05d, 0xa438, 0x4b98, ++ 0xa438, 0xa808, 0xa438, 0xf05a, 0xa438, 0xd719, 0xa438, 0x4119, ++ 0xa438, 0xd504, 0xa438, 0xac02, 0xa438, 0xae01, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a78, 0xa438, 0xf00a, 0xa438, 0xd719, ++ 0xa438, 0x4118, 0xa438, 0xd504, 0xa438, 0xac22, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa420, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd718, 0xa438, 0x5fb0, ++ 0xa438, 0xd505, 0xa438, 0xd719, 0xa438, 0x4079, 0xa438, 0xa80f, ++ 0xa438, 0xf03f, 0xa438, 0x47d8, 0xa438, 0xa804, 0xa438, 0xf03c, ++ 0xa438, 0xd719, 0xa438, 0x4119, 0xa438, 0xd504, 0xa438, 0xac04, ++ 0xa438, 0xae01, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a78, ++ 0xa438, 0xf00a, 0xa438, 0xd719, 0xa438, 0x4118, 0xa438, 0xd504, ++ 0xa438, 0xac44, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xa440, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0xd718, 0xa438, 0x5fb0, 0xa438, 0xd505, 0xa438, 0xd719, ++ 0xa438, 0x4079, 0xa438, 0xa80f, 0xa438, 0xf021, 0xa438, 0x4418, ++ 0xa438, 0xa802, 0xa438, 0xf01e, 0xa438, 0xd719, 0xa438, 0x4119, ++ 0xa438, 0xd504, 0xa438, 0xac08, 0xa438, 0xae01, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a78, 0xa438, 0xf00a, 0xa438, 0xd719, ++ 0xa438, 0x4118, 0xa438, 0xd504, 0xa438, 0xac88, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa480, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd718, 0xa438, 0x5fb0, ++ 0xa438, 0xd505, 0xa438, 0xd719, 0xa438, 0x4079, 0xa438, 0xa80f, ++ 0xa438, 0xf003, 0xa438, 0x4058, 0xa438, 0xa801, 0xa438, 0x1800, ++ 0xa438, 0x1736, 0xa438, 0xd73e, 0xa438, 0xd505, 0xa438, 0x3088, ++ 0xa438, 0x81c0, 0xa438, 0x61d3, 0xa438, 0x6172, 0xa438, 0x6111, ++ 0xa438, 0x60b0, 0xa438, 0xf00d, 0xa438, 0x3298, 0xa438, 0x81cb, ++ 0xa438, 0xf00a, 0xa438, 0xa808, 0xa438, 0xf008, 0xa438, 0xa804, ++ 0xa438, 0xf006, 0xa438, 0xa802, 0xa438, 0xf004, 0xa438, 0xa801, ++ 0xa438, 0xf002, 0xa438, 0xa80f, 0xa438, 0xd500, 0xa438, 0x0800, ++ 0xa438, 0xd505, 0xa438, 0xd75e, 0xa438, 0x6211, 0xa438, 0xd71e, ++ 0xa438, 0x619b, 0xa438, 0x611a, 0xa438, 0x6099, 0xa438, 0x0c0f, ++ 0xa438, 0x0808, 0xa438, 0xf009, 0xa438, 0x0c0f, 0xa438, 0x0804, ++ 0xa438, 0xf006, 0xa438, 0x0c0f, 0xa438, 0x0802, 0xa438, 0xf003, ++ 0xa438, 0x0c0f, 0xa438, 0x0801, 0xa438, 0xd500, 0xa438, 0x0800, ++ 0xa436, 0xA026, 0xa438, 0xffff, 0xa436, 0xA024, 0xa438, 0xffff, ++ 
0xa436, 0xA022, 0xa438, 0xffff, 0xa436, 0xA020, 0xa438, 0xffff, ++ 0xa436, 0xA006, 0xa438, 0xffff, 0xa436, 0xA004, 0xa438, 0x16ab, ++ 0xa436, 0xA002, 0xa438, 0x1663, 0xa436, 0xA000, 0xa438, 0x1608, ++ 0xa436, 0xA008, 0xa438, 0x0700, 0xa436, 0xA016, 0xa438, 0x0000, ++ 0xa436, 0xA012, 0xa438, 0x07f8, 0xa436, 0xA014, 0xa438, 0xcc01, ++ 0xa438, 0x20f6, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa436, 0xA152, ++ 0xa438, 0x021c, 0xa436, 0xA154, 0xa438, 0x2100, 0xa436, 0xA156, ++ 0xa438, 0x3fff, 0xa436, 0xA158, 0xa438, 0x3fff, 0xa436, 0xA15A, ++ 0xa438, 0x3fff, 0xa436, 0xA15C, 0xa438, 0x3fff, 0xa436, 0xA15E, ++ 0xa438, 0x3fff, 0xa436, 0xA160, 0xa438, 0x3fff, 0xa436, 0xA150, ++ 0xa438, 0x0003, 0xa436, 0xA016, 0xa438, 0x0010, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x8014, 0xa438, 0x1800, 0xa438, 0x803d, ++ 0xa438, 0x1800, 0xa438, 0x804a, 0xa438, 0x1800, 0xa438, 0x804e, ++ 0xa438, 0x1800, 0xa438, 0x8052, 0xa438, 0x1800, 0xa438, 0x8092, ++ 0xa438, 0x1800, 0xa438, 0x80a0, 0xa438, 0xc2ff, 0xa438, 0x9a40, ++ 0xa438, 0x1800, 0xa438, 0x0042, 0xa438, 0x1000, 0xa438, 0x02e5, ++ 0xa438, 0xba20, 0xa438, 0x1000, 0xa438, 0x02b4, 0xa438, 0xd701, ++ 0xa438, 0x4103, 0xa438, 0xd700, 0xa438, 0x5f6c, 0xa438, 0x1000, ++ 0xa438, 0x8024, 0xa438, 0x9a20, 0xa438, 0x1800, 0xa438, 0x0073, ++ 0xa438, 0x1800, 0xa438, 0x0084, 0xa438, 0xd701, 0xa438, 0x4061, ++ 0xa438, 0xba0f, 0xa438, 0xf004, 0xa438, 0x4060, 0xa438, 0x1000, ++ 0xa438, 0x802d, 0xa438, 0xba10, 0xa438, 0x0800, 0xa438, 0xd700, ++ 0xa438, 0x60bb, 0xa438, 0x611c, 0xa438, 0x0c0f, 0xa438, 0x1a01, ++ 0xa438, 0xf00a, 0xa438, 0x60fc, 0xa438, 0x0c0f, 0xa438, 0x1a02, ++ 0xa438, 0xf006, 0xa438, 0x0c0f, 0xa438, 0x1a04, 0xa438, 0xf003, ++ 0xa438, 0x0c0f, 0xa438, 0x1a08, 0xa438, 0x0800, 0xa438, 0x0c0f, ++ 0xa438, 0x0504, 0xa438, 0xad02, 0xa438, 0xd73e, 0xa438, 0x40f6, ++ 0xa438, 0x1000, 0xa438, 0x02c0, 0xa438, 0xd700, 0xa438, 0x5fac, ++ 0xa438, 0x1000, 0xa438, 0x8024, 0xa438, 0x1800, 0xa438, 0x0139, ++ 0xa438, 0x9a3f, 0xa438, 0x8bf0, 0xa438, 0x1800, 0xa438, 0x02df, ++ 0xa438, 0x9a3f, 0xa438, 0x9910, 0xa438, 0x1800, 0xa438, 0x02d7, ++ 0xa438, 0xad02, 0xa438, 0x8d01, 0xa438, 0x9a7f, 0xa438, 0x9910, ++ 0xa438, 0x9860, 0xa438, 0xcb00, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0x85f0, 0xa438, 0xd500, 0xa438, 0x0c0f, 0xa438, 0x0505, ++ 0xa438, 0xb820, 0xa438, 0xc000, 0xa438, 0xc100, 0xa438, 0xc628, ++ 0xa438, 0xc700, 0xa438, 0xc801, 0xa438, 0xc91e, 0xa438, 0xc001, ++ 0xa438, 0x4019, 0xa438, 0xc6f8, 0xa438, 0xc702, 0xa438, 0xc809, ++ 0xa438, 0xc940, 0xa438, 0xc002, 0xa438, 0x4019, 0xa438, 0x1000, ++ 0xa438, 0x02cc, 0xa438, 0xd700, 0xa438, 0x5fa7, 0xa438, 0xc010, ++ 0xa438, 0x1000, 0xa438, 0x02cc, 0xa438, 0xd700, 0xa438, 0x5fa0, ++ 0xa438, 0xc020, 0xa438, 0x1000, 0xa438, 0x02cc, 0xa438, 0xd700, ++ 0xa438, 0x5fa1, 0xa438, 0x0c0f, 0xa438, 0x0506, 0xa438, 0xb840, ++ 0xa438, 0xc6ca, 0xa438, 0xc701, 0xa438, 0xc809, 0xa438, 0xc900, ++ 0xa438, 0xc001, 0xa438, 0x4019, 0xa438, 0xc6b8, 0xa438, 0xc700, ++ 0xa438, 0xc800, 0xa438, 0xc900, 0xa438, 0xc008, 0xa438, 0x4019, ++ 0xa438, 0x1000, 0xa438, 0x02cc, 0xa438, 0xd700, 0xa438, 0x5fa5, ++ 0xa438, 0x8580, 0xa438, 0x8d02, 0xa438, 0x1800, 0xa438, 0x018f, ++ 0xa438, 0x1000, 0xa438, 0x02cc, 0xa438, 0xd700, 0xa438, 0x6124, ++ 0xa438, 0xd73e, 0xa438, 0x5f75, 0xa438, 0xd700, 0xa438, 0x5f2c, ++ 0xa438, 0x1000, 0xa438, 0x8024, 0xa438, 0x9a20, 0xa438, 0xfff5, ++ 0xa438, 0x1800, 0xa438, 0x00b8, 0xa438, 0x0c0f, 0xa438, 0x0503, ++ 
0xa438, 0xad02, 0xa438, 0x68c8, 0xa438, 0x1000, 0xa438, 0x02c0, ++ 0xa438, 0xd700, 0xa438, 0x6848, 0xa438, 0x604d, 0xa438, 0xfffb, ++ 0xa438, 0xd73e, 0xa438, 0x6082, 0xa438, 0x1000, 0xa438, 0x02a1, ++ 0xa438, 0x8a0f, 0xa438, 0x1000, 0xa438, 0x02c0, 0xa438, 0xd700, ++ 0xa438, 0x5fae, 0xa438, 0x1000, 0xa438, 0x02de, 0xa438, 0x1000, ++ 0xa438, 0x02c0, 0xa438, 0xd700, 0xa438, 0x5faf, 0xa438, 0x8d01, ++ 0xa438, 0x8b0f, 0xa438, 0x1000, 0xa438, 0x02c0, 0xa438, 0xd700, ++ 0xa438, 0x2a58, 0xa438, 0x80c5, 0xa438, 0x2a5b, 0xa438, 0x80cd, ++ 0xa438, 0x2b53, 0xa438, 0x80d9, 0xa438, 0xfff7, 0xa438, 0x1000, ++ 0xa438, 0x022a, 0xa438, 0x1000, 0xa438, 0x02e5, 0xa438, 0xba40, ++ 0xa438, 0x1000, 0xa438, 0x02fd, 0xa438, 0xf018, 0xa438, 0x1000, ++ 0xa438, 0x022a, 0xa438, 0x1000, 0xa438, 0x02e5, 0xa438, 0xba40, ++ 0xa438, 0x1000, 0xa438, 0x02c0, 0xa438, 0xd700, 0xa438, 0x5faa, ++ 0xa438, 0x1000, 0xa438, 0x02fd, 0xa438, 0xf00c, 0xa438, 0x1000, ++ 0xa438, 0x022a, 0xa438, 0x1000, 0xa438, 0x02fd, 0xa438, 0x1000, ++ 0xa438, 0x02c0, 0xa438, 0xd700, 0xa438, 0x5fab, 0xa438, 0x1000, ++ 0xa438, 0x02e5, 0xa438, 0xba40, 0xa438, 0x1000, 0xa438, 0x02c0, ++ 0xa438, 0xd700, 0xa438, 0x6088, 0xa438, 0xfffc, 0xa438, 0x1800, ++ 0xa438, 0x0120, 0xa438, 0x1800, 0xa438, 0x0122, 0xa436, 0xA08E, ++ 0xa438, 0x00db, 0xa436, 0xA08C, 0xa438, 0x00b4, 0xa436, 0xA08A, ++ 0xa438, 0x015a, 0xa436, 0xA088, 0xa438, 0x02d6, 0xa436, 0xA086, ++ 0xa438, 0x02de, 0xa436, 0xA084, 0xa438, 0x0137, 0xa436, 0xA082, ++ 0xa438, 0x0071, 0xa436, 0xA080, 0xa438, 0x0041, 0xa436, 0xA090, ++ 0xa438, 0x00ff, 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x801d, 0xa438, 0x1800, 0xa438, 0x808a, ++ 0xa438, 0x1800, 0xa438, 0x80a5, 0xa438, 0x1800, 0xa438, 0x80b8, ++ 0xa438, 0x1800, 0xa438, 0x8108, 0xa438, 0x1800, 0xa438, 0x810f, ++ 0xa438, 0x1800, 0xa438, 0x811b, 0xa438, 0x8980, 0xa438, 0xd702, ++ 0xa438, 0x6126, 0xa438, 0xd704, 0xa438, 0x4063, 0xa438, 0xd702, ++ 0xa438, 0x6060, 0xa438, 0xd702, 0xa438, 0x6077, 0xa438, 0x1800, ++ 0xa438, 0x0c29, 0xa438, 0x1800, 0xa438, 0x0c2b, 0xa438, 0x1000, ++ 0xa438, 0x115a, 0xa438, 0xd71f, 0xa438, 0x5fb4, 0xa438, 0xd702, ++ 0xa438, 0x6c46, 0xa438, 0xd704, 0xa438, 0x4063, 0xa438, 0xd702, ++ 0xa438, 0x6060, 0xa438, 0xd702, 0xa438, 0x6b97, 0xa438, 0xa340, ++ 0xa438, 0x0c06, 0xa438, 0x0102, 0xa438, 0xce01, 0xa438, 0x1000, ++ 0xa438, 0x117a, 0xa438, 0xa240, 0xa438, 0xa902, 0xa438, 0xa204, ++ 0xa438, 0xa280, 0xa438, 0xa364, 0xa438, 0xab02, 0xa438, 0x8380, ++ 0xa438, 0xa00a, 0xa438, 0xcd8d, 0xa438, 0x1000, 0xa438, 0x115a, ++ 0xa438, 0xd706, 0xa438, 0x5fb5, 0xa438, 0xb920, 0xa438, 0x1000, ++ 0xa438, 0x115a, 0xa438, 0xd71f, 0xa438, 0x7fb4, 0xa438, 0x9920, ++ 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0xd71f, 0xa438, 0x6065, ++ 0xa438, 0x7c74, 0xa438, 0xfffb, 0xa438, 0xb820, 0xa438, 0x1000, ++ 0xa438, 0x115a, 0xa438, 0xd71f, 0xa438, 0x7fa5, 0xa438, 0x9820, ++ 0xa438, 0xa410, 0xa438, 0x8902, 0xa438, 0xa120, 0xa438, 0xa380, ++ 0xa438, 0xce02, 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0x8280, ++ 0xa438, 0xa324, 0xa438, 0xab02, 0xa438, 0xa00a, 0xa438, 0x8118, ++ 0xa438, 0x863f, 0xa438, 0x87fb, 0xa438, 0xcd8e, 0xa438, 0xd193, ++ 0xa438, 0xd047, 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0x1000, ++ 0xa438, 0x115f, 0xa438, 0xd700, 0xa438, 0x5f7b, 0xa438, 0xa280, ++ 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0x1000, 0xa438, 0x115f, ++ 0xa438, 0xd706, 0xa438, 0x5f78, 0xa438, 0xa210, 0xa438, 0xd700, ++ 0xa438, 0x6083, 0xa438, 0xd101, 0xa438, 0xd047, 0xa438, 0xf003, ++ 
0xa438, 0xd160, 0xa438, 0xd04b, 0xa438, 0x1000, 0xa438, 0x115a, ++ 0xa438, 0x1000, 0xa438, 0x115f, 0xa438, 0xd700, 0xa438, 0x5f7b, ++ 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0x1000, 0xa438, 0x115f, ++ 0xa438, 0xd706, 0xa438, 0x5f79, 0xa438, 0x8120, 0xa438, 0xbb20, ++ 0xa438, 0x1800, 0xa438, 0x0c8b, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8f80, 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0c3c, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa608, 0xa438, 0x9503, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8f80, 0xa438, 0x9503, ++ 0xa438, 0xd704, 0xa438, 0x6192, 0xa438, 0xd702, 0xa438, 0x4116, ++ 0xa438, 0xce04, 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8f40, 0xa438, 0x9503, 0xa438, 0x1800, ++ 0xa438, 0x0b3d, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xaf40, ++ 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0b48, 0xa438, 0xd704, ++ 0xa438, 0x6192, 0xa438, 0xd702, 0xa438, 0x4116, 0xa438, 0xce04, ++ 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8f40, 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x1269, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xaf40, 0xa438, 0x9503, ++ 0xa438, 0x1800, 0xa438, 0x1274, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa608, 0xa438, 0xc700, 0xa438, 0x9503, 0xa438, 0xce54, ++ 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0xa290, 0xa438, 0xa304, ++ 0xa438, 0xab02, 0xa438, 0xd700, 0xa438, 0x6050, 0xa438, 0xab04, ++ 0xa438, 0x0c38, 0xa438, 0x0608, 0xa438, 0xaa0b, 0xa438, 0xd702, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8d01, 0xa438, 0xae40, ++ 0xa438, 0x4044, 0xa438, 0x8e20, 0xa438, 0x9503, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8c20, 0xa438, 0x9503, 0xa438, 0xd700, ++ 0xa438, 0x6078, 0xa438, 0xd700, 0xa438, 0x609a, 0xa438, 0xd109, ++ 0xa438, 0xd074, 0xa438, 0xf003, 0xa438, 0xd109, 0xa438, 0xd075, ++ 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0xd704, 0xa438, 0x6252, ++ 0xa438, 0xd702, 0xa438, 0x4116, 0xa438, 0xce54, 0xa438, 0x1000, ++ 0xa438, 0x117a, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8f40, ++ 0xa438, 0x9503, 0xa438, 0xa00a, 0xa438, 0xd704, 0xa438, 0x41e7, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa570, 0xa438, 0x9503, ++ 0xa438, 0xf00a, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xaf40, ++ 0xa438, 0x9503, 0xa438, 0x800a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0xd704, 0xa438, 0x60f3, ++ 0xa438, 0xd71f, 0xa438, 0x60ee, 0xa438, 0xd700, 0xa438, 0x5bbe, ++ 0xa438, 0x1800, 0xa438, 0x0e71, 0xa438, 0x1800, 0xa438, 0x0e7c, ++ 0xa438, 0x1800, 0xa438, 0x0e7e, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xaf80, 0xa438, 0x9503, 0xa438, 0xcd62, 0xa438, 0x1800, ++ 0xa438, 0x0bd2, 0xa438, 0x800a, 0xa438, 0x8530, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8d10, 0xa438, 0x9503, 0xa438, 0xd700, ++ 0xa438, 0x6050, 0xa438, 0xaa20, 0xa438, 0x8306, 0xa438, 0x1800, ++ 0xa438, 0x0cb6, 0xa438, 0xd105, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0d8f, 0xa438, 0xd700, 0xa438, 0x5fbb, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8608, 0xa438, 0x9503, 0xa438, 0x1000, ++ 0xa438, 0x0d8f, 0xa438, 0xd704, 0xa438, 0x7fb6, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x87f0, 0xa438, 0x9503, 0xa438, 0xce88, ++ 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa608, 0xa438, 0x9503, 0xa438, 0xd73e, 0xa438, 0x60a5, ++ 0xa438, 0xd705, 0xa438, 0x4071, 0xa438, 0x1800, 0xa438, 0x0d65, ++ 0xa438, 0x1800, 0xa438, 0x0d6f, 0xa436, 0xA10E, 0xa438, 0x0d58, ++ 0xa436, 0xA10C, 0xa438, 0x0cb5, 0xa436, 0xA10A, 0xa438, 0x0bd1, ++ 0xa436, 0xA108, 0xa438, 0x0e37, 0xa436, 0xA106, 0xa438, 0x1267, ++ 
0xa436, 0xA104, 0xa438, 0x0b3b, 0xa436, 0xA102, 0xa438, 0x0c38, ++ 0xa436, 0xA100, 0xa438, 0x0c24, 0xa436, 0xA110, 0xa438, 0x00ff, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x1ff8, ++ 0xa436, 0xA014, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa436, 0xA164, 0xa438, 0x0ceb, 0xa436, 0xA166, ++ 0xa438, 0x0e73, 0xa436, 0xA168, 0xa438, 0x0deb, 0xa436, 0xA16A, ++ 0xa438, 0x3fff, 0xa436, 0xA16C, 0xa438, 0x3fff, 0xa436, 0xA16E, ++ 0xa438, 0x3fff, 0xa436, 0xA170, 0xa438, 0x3fff, 0xa436, 0xA172, ++ 0xa438, 0x3fff, 0xa436, 0xA162, 0xa438, 0x0007, 0xa436, 0xb87c, ++ 0xa438, 0x85bf, 0xa436, 0xb87e, 0xa438, 0xaf85, 0xa438, 0xd7af, ++ 0xa438, 0x85fb, 0xa438, 0xaf86, 0xa438, 0x10af, 0xa438, 0x8638, ++ 0xa438, 0xaf86, 0xa438, 0x47af, 0xa438, 0x8647, 0xa438, 0xaf86, ++ 0xa438, 0x47af, 0xa438, 0x8647, 0xa438, 0xbf85, 0xa438, 0xf802, ++ 0xa438, 0x627f, 0xa438, 0xbf61, 0xa438, 0xc702, 0xa438, 0x627f, ++ 0xa438, 0xae0c, 0xa438, 0xbf85, 0xa438, 0xf802, 0xa438, 0x6276, ++ 0xa438, 0xbf61, 0xa438, 0xc702, 0xa438, 0x6276, 0xa438, 0xee85, ++ 0xa438, 0x4200, 0xa438, 0xaf1b, 0xa438, 0x2333, 0xa438, 0xa484, ++ 0xa438, 0xbf86, 0xa438, 0x0a02, 0xa438, 0x627f, 0xa438, 0xbf86, ++ 0xa438, 0x0d02, 0xa438, 0x627f, 0xa438, 0xaf1b, 0xa438, 0x8422, ++ 0xa438, 0xa484, 0xa438, 0x66ac, 0xa438, 0x0ef8, 0xa438, 0xfbef, ++ 0xa438, 0x79fb, 0xa438, 0xe080, 0xa438, 0x16ad, 0xa438, 0x230f, ++ 0xa438, 0xee85, 0xa438, 0x4200, 0xa438, 0x1f44, 0xa438, 0xbf86, ++ 0xa438, 0x30d7, 0xa438, 0x0008, 0xa438, 0x0264, 0xa438, 0xa3ff, ++ 0xa438, 0xef97, 0xa438, 0xfffc, 0xa438, 0x0485, 0xa438, 0xf861, ++ 0xa438, 0xc786, 0xa438, 0x0a86, 0xa438, 0x0de1, 0xa438, 0x8feb, ++ 0xa438, 0xe583, 0xa438, 0x20e1, 0xa438, 0x8fea, 0xa438, 0xe583, ++ 0xa438, 0x21af, 0xa438, 0x41a7, 0xa436, 0xb85e, 0xa438, 0x1b05, ++ 0xa436, 0xb860, 0xa438, 0x1b78, 0xa436, 0xb862, 0xa438, 0x1a08, ++ 0xa436, 0xb864, 0xa438, 0x419F, 0xa436, 0xb886, 0xa438, 0xffff, ++ 0xa436, 0xb888, 0xa438, 0xffff, 0xa436, 0xb88a, 0xa438, 0xffff, ++ 0xa436, 0xb88c, 0xa438, 0xffff, 0xa436, 0xb838, 0xa438, 0x000f, ++ 0xb820, 0x0010, 0xa436, 0x0000, 0xa438, 0x0000, 0xB82E, 0x0000, ++ 0xa436, 0x8023, 0xa438, 0x0000, 0xa436, 0x801E, 0xa438, 0x0013, ++ 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_2_2[] = { ++ 0xa436, 0xacca, 0xa438, 0x0104, 0xa436, 0xaccc, 0xa438, 0x8000, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xfd47, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xe56f, 0xa436, 0xacd0, 0xa438, 0x01c0, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xed97, 0xa436, 0xacd0, 0xa438, 0x01c8, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xf5bf, 0xa436, 0xacd0, 0xa438, 0x01d0, ++ 0xa436, 0xacce, 0xa438, 0xfb07, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb0f, 0xa436, 0xacd0, 0xa438, 0x01d8, ++ 0xa436, 0xacce, 0xa438, 0xa087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0xa00f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0xa807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0xa88f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0xb027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0xb02f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0xb847, 0xa436, 
0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0xb84f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0xfb17, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb1f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xa017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0xa01f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0xa837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0xa83f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0xb097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0xb05f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0xb857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0xb89f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0xfb27, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb2f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x8087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x800f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x8807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x888f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x9027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x902f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x9847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x984f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0xa0a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0xa8af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0xa067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0xa86f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb37, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb3f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x8017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x801f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x8837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x883f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x9097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x905f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x9857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x989f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0xb0b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0xb8bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0xb077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0xb87f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb47, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb4f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x600f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x6807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x688f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x7027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x702f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x7847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x784f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0x80a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x88af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x8067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x886f, 0xa436, 
0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb57, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb5f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x601f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x6837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x683f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x7097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x705f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x7857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x789f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x90b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x98bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x9077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x987f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb67, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb6f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x4087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x400f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x4807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x488f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x5027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x502f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x5847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x584f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0x60a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x68af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x6067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x686f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb77, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb7f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x4017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x401f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x4837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x483f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x5097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x505f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x5857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x589f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x70b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x78bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x7077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x787f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb87, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb8f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x40a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x48af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x4067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x486f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb97, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb9f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x50b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x58bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x5077, 0xa436, 
0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x587f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfba7, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfbaf, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x2067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x286f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfbb7, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfbbf, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x3077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x387f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfff9, 0xa436, 0xacd0, 0xa438, 0x17ff, ++ 0xa436, 0xacce, 0xa438, 0xfff9, 0xa436, 0xacd0, 0xa438, 0x17ff, ++ 0xa436, 0xacca, 0xa438, 0x0004, 0xa436, 0xacc6, 0xa438, 0x0008, ++ 0xa436, 0xacc8, 0xa438, 0xc000, 0xa436, 0xacc8, 0xa438, 0x0000, ++ 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125bp_1_1[] = { ++ 0xa436, 0x8024, 0xa438, 0x3600, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x8014, 0xa438, 0x1800, 0xa438, 0x8018, ++ 0xa438, 0x1800, 0xa438, 0x801c, 0xa438, 0x1800, 0xa438, 0x8020, ++ 0xa438, 0x1800, 0xa438, 0x8024, 0xa438, 0x1800, 0xa438, 0x8028, ++ 0xa438, 0x1800, 0xa438, 0x8028, 0xa438, 0xdb20, 0xa438, 0xd501, ++ 0xa438, 0x1800, 0xa438, 0x034c, 0xa438, 0xdb10, 0xa438, 0xd501, ++ 0xa438, 0x1800, 0xa438, 0x032c, 0xa438, 0x8620, 0xa438, 0xa480, ++ 0xa438, 0x1800, 0xa438, 0x1cfe, 0xa438, 0xbf40, 0xa438, 0xd703, ++ 0xa438, 0x1800, 0xa438, 0x0ce9, 0xa438, 0x9c10, 0xa438, 0x9f40, ++ 0xa438, 0x1800, 0xa438, 0x137a, 0xa438, 0x9f20, 0xa438, 0x9f40, ++ 0xa438, 0x1800, 0xa438, 0x16c4, 0xa436, 0xA026, 0xa438, 0xffff, ++ 0xa436, 0xA024, 0xa438, 0xffff, 0xa436, 0xA022, 0xa438, 0x16c3, ++ 0xa436, 0xA020, 0xa438, 0x1379, 0xa436, 0xA006, 0xa438, 0x0ce8, ++ 0xa436, 0xA004, 0xa438, 0x1cfd, 0xa436, 0xA002, 0xa438, 0x032b, ++ 0xa436, 0xA000, 0xa438, 0x034b, 0xa436, 0xA008, 0xa438, 0x3f00, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x8018, 0xa438, 0x1800, 0xa438, 0x8021, 0xa438, 0x1800, ++ 0xa438, 0x802b, 0xa438, 0x1800, 0xa438, 0x8055, 0xa438, 0x1800, ++ 0xa438, 0x805a, 0xa438, 0x1800, 0xa438, 0x805e, 0xa438, 0x1800, ++ 0xa438, 0x8062, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0xcb11, ++ 0xa438, 0xd1b9, 0xa438, 0xd05b, 0xa438, 0x0000, 0xa438, 0x1800, ++ 0xa438, 0x0284, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0x5f95, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x1800, 0xa438, 0x02b7, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0xcb21, 0xa438, 0x1000, 0xa438, 0x0b34, 0xa438, 0xd71f, ++ 0xa438, 0x5f5e, 0xa438, 0x0000, 0xa438, 0x1800, 0xa438, 0x0322, ++ 0xa438, 0xd700, 0xa438, 0xd113, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a57, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xd700, ++ 0xa438, 0x6065, 0xa438, 0xd122, 0xa438, 0xf002, 0xa438, 0xd122, ++ 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x0b53, 0xa438, 0xa008, ++ 0xa438, 0xd704, 0xa438, 0x4052, 0xa438, 0xa002, 0xa438, 0xd704, ++ 0xa438, 0x4054, 0xa438, 0xa740, 0xa438, 0x1000, 0xa438, 0x0a57, ++ 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xcb9b, 0xa438, 0xd110, ++ 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x0c01, 0xa438, 0x1000, ++ 0xa438, 0x0a57, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0x801a, ++ 0xa438, 0x1000, 
0xa438, 0x0a57, 0xa438, 0xd704, 0xa438, 0x7fb9, ++ 0xa438, 0x1800, 0xa438, 0x088d, 0xa438, 0xcb62, 0xa438, 0xd700, ++ 0xa438, 0x8880, 0xa438, 0x1800, 0xa438, 0x06cb, 0xa438, 0xbe02, ++ 0xa438, 0x0000, 0xa438, 0x1800, 0xa438, 0x002c, 0xa438, 0xbe04, ++ 0xa438, 0x0000, 0xa438, 0x1800, 0xa438, 0x002c, 0xa438, 0xbe08, ++ 0xa438, 0x0000, 0xa438, 0x1800, 0xa438, 0x002c, 0xa436, 0xA10E, ++ 0xa438, 0x802a, 0xa436, 0xA10C, 0xa438, 0x8026, 0xa436, 0xA10A, ++ 0xa438, 0x8022, 0xa436, 0xA108, 0xa438, 0x06ca, 0xa436, 0xA106, ++ 0xa438, 0x086f, 0xa436, 0xA104, 0xa438, 0x0321, 0xa436, 0xA102, ++ 0xa438, 0x02b5, 0xa436, 0xA100, 0xa438, 0x0283, 0xa436, 0xA110, ++ 0xa438, 0x001f, 0xb820, 0x0010, 0xb82e, 0x0000, 0xa436, 0x8024, ++ 0xa438, 0x0000, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125bp_1_2[] = { ++ 0xb892, 0x0000, 0xb88e, 0xC201, 0xb890, 0x2C01, 0xb890, 0xCD02, ++ 0xb890, 0x0602, 0xb890, 0x5502, 0xb890, 0xB903, 0xb890, 0x3303, ++ 0xb890, 0xC204, 0xb890, 0x6605, 0xb890, 0x1F05, 0xb890, 0xEE06, ++ 0xb890, 0xD207, 0xb890, 0xCC08, 0xb890, 0xDA09, 0xb890, 0xFF0B, ++ 0xb890, 0x380C, 0xb890, 0x87F3, 0xb88e, 0xC27F, 0xb890, 0x2B66, ++ 0xb890, 0x6666, 0xb890, 0x6666, 0xb890, 0x6666, 0xb890, 0x6666, ++ 0xb890, 0x6666, 0xb890, 0x6666, 0xb890, 0x6666, 0xb890, 0x66C2, ++ 0xb88e, 0xC26F, 0xb890, 0x751D, 0xb890, 0x1D1F, 0xb890, 0x2022, ++ 0xb890, 0x2325, 0xb890, 0x2627, 0xb890, 0x2829, 0xb890, 0x2929, ++ 0xb890, 0x2A2A, 0xb890, 0x2B66, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125cp_1_1[] = { ++ 0xa436, 0x8023, 0xa438, 0x2300, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x07f8, 0xa436, 0xA014, 0xa438, 0xcc01, 0xa438, 0x2166, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa436, 0xA152, 0xa438, 0x021c, ++ 0xa436, 0xA154, 0xa438, 0x2170, 0xa436, 0xA156, 0xa438, 0x3fff, ++ 0xa436, 0xA158, 0xa438, 0x3fff, 0xa436, 0xA15A, 0xa438, 0x3fff, ++ 0xa436, 0xA15C, 0xa438, 0x3fff, 0xa436, 0xA15E, 0xa438, 0x3fff, ++ 0xa436, 0xA160, 0xa438, 0x3fff, 0xa436, 0xA150, 0xa438, 0x0003, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x801b, 0xa438, 0x1800, 0xa438, 0x802b, 0xa438, 0x1800, ++ 0xa438, 0x8031, 0xa438, 0x1800, 0xa438, 0x8037, 0xa438, 0x1800, ++ 0xa438, 0x8037, 0xa438, 0x1800, 0xa438, 0x8037, 0xa438, 0x1800, ++ 0xa438, 0x8037, 0xa438, 0x800a, 0xa438, 0x8530, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8d10, 0xa438, 0x9503, 0xa438, 0xd700, ++ 0xa438, 0x6050, 0xa438, 0xaa20, 0xa438, 0x1800, 0xa438, 0x0d53, ++ 0xa438, 0xd707, 0xa438, 0x40f6, 0xa438, 0x8901, 0xa438, 0xd704, ++ 0xa438, 0x6091, 0xa438, 0x8306, 0xa438, 0x8b02, 0xa438, 0x8290, ++ 0xa438, 0x1000, 0xa438, 0x0e4d, 0xa438, 0x1000, 0xa438, 0x1277, ++ 0xa438, 0xd704, 0xa438, 0x7e77, 0xa438, 0x1800, 0xa438, 0x0dc5, ++ 0xa438, 0xd700, 0xa438, 0x4063, 0xa438, 0x1800, 0xa438, 0x0d15, ++ 0xa438, 0x1800, 0xa438, 0x0d18, 0xa438, 0xd700, 0xa438, 0x6063, ++ 0xa438, 0x1800, 0xa438, 0x0ca6, 0xa438, 0x1800, 0xa438, 0x0ca7, ++ 0xa436, 0xA10E, 0xa438, 0xffff, 0xa436, 0xA10C, 0xa438, 0xffff, ++ 0xa436, 0xA10A, 0xa438, 0xffff, 0xa436, 0xA108, 0xa438, 0xffff, ++ 0xa436, 0xA106, 0xa438, 0x0ca2, 0xa436, 0xA104, 0xa438, 0x0d13, ++ 0xa436, 0xA102, 0xa438, 0x0dbf, 0xa436, 0xA100, 0xa438, 0x0d52, ++ 0xa436, 0xA110, 0xa438, 0x000f, 0xa436, 0xb87c, 0xa438, 0x85bd, ++ 0xa436, 0xb87e, 0xa438, 0xaf85, 0xa438, 0xd5af, 
0xa438, 0x85fb, ++ 0xa438, 0xaf85, 0xa438, 0xfbaf, 0xa438, 0x85fb, 0xa438, 0xaf85, ++ 0xa438, 0xfbaf, 0xa438, 0x85fb, 0xa438, 0xaf85, 0xa438, 0xfbaf, ++ 0xa438, 0x85fb, 0xa438, 0xac28, 0xa438, 0x0bd4, 0xa438, 0x0294, ++ 0xa438, 0xbf85, 0xa438, 0xf802, 0xa438, 0x61c2, 0xa438, 0xae09, ++ 0xa438, 0xd414, 0xa438, 0x50bf, 0xa438, 0x85f8, 0xa438, 0x0261, ++ 0xa438, 0xc2bf, 0xa438, 0x60de, 0xa438, 0x0261, 0xa438, 0xe1bf, ++ 0xa438, 0x80cf, 0xa438, 0xaf24, 0xa438, 0xe8f0, 0xa438, 0xac52, ++ 0xa436, 0xb85e, 0xa438, 0x24e5, 0xa436, 0xb860, 0xa438, 0xffff, ++ 0xa436, 0xb862, 0xa438, 0xffff, 0xa436, 0xb864, 0xa438, 0xffff, ++ 0xa436, 0xb886, 0xa438, 0xffff, 0xa436, 0xb888, 0xa438, 0xffff, ++ 0xa436, 0xb88a, 0xa438, 0xffff, 0xa436, 0xb88c, 0xa438, 0xffff, ++ 0xa436, 0xb838, 0xa438, 0x0001, 0xb820, 0x0010, 0xB82E, 0x0000, ++ 0xa436, 0x8023, 0xa438, 0x0000, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static void ++rtl8125_real_set_phy_mcu_8125b_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125b_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125b_1)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125b_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125b_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125b_2(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125b_2, ++ ARRAY_SIZE(phy_mcu_ram_code_8125b_2)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125b_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125b_2(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_1_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_1_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_1_1)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_1_2(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_1_2, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_1_2)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_1_3(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_1_3, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_1_3)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125d_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_1_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_1_2(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_1_3(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125d_1_efuse(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_1_efuse, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_1_efuse)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_2_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_2_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_2_1)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_2_2(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_2_2, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_2_2)); ++} ++ 
++static void ++rtl8125_set_phy_mcu_8125d_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_2_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_2_2(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125bp_1_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125bp_1_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125bp_1_1)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125bp_1_2(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125bp_1_2, ++ ARRAY_SIZE(phy_mcu_ram_code_8125bp_1_2)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125bp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125bp_1_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125bp_1_2(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125cp_1_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125cp_1_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125cp_1_1)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125cp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125cp_1_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_init_hw_phy_mcu(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u8 require_disable_phy_disable_mode = FALSE; ++ ++ if (tp->NotWrRamCodeToMicroP == TRUE) ++ return; ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ return; ++ ++ if (HW_SUPPORT_CHECK_PHY_DISABLE_MODE(tp) && rtl8125_is_in_phy_disable_mode(dev)) ++ require_disable_phy_disable_mode = TRUE; ++ ++ if (require_disable_phy_disable_mode) ++ rtl8125_disable_phy_disable_mode(dev); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ rtl8125_set_phy_mcu_8125a_1(dev); ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_set_phy_mcu_8125a_2(dev); ++ break; ++ case CFG_METHOD_4: ++ rtl8125_set_phy_mcu_8125b_1(dev); ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_set_phy_mcu_8125b_2(dev); ++ break; ++ case CFG_METHOD_8: ++ rtl8125_set_phy_mcu_8125bp_1(dev); ++ break; ++ case CFG_METHOD_9: ++ /* nothing to do */ ++ break; ++ case CFG_METHOD_10: ++ rtl8125_set_phy_mcu_8125d_1(dev); ++ break; ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ rtl8125_set_phy_mcu_8125d_2(dev); ++ break; ++ case CFG_METHOD_12: ++ rtl8125_set_phy_mcu_8125cp_1(dev); ++ break; ++ } ++ ++ if (require_disable_phy_disable_mode) ++ rtl8125_enable_phy_disable_mode(dev); ++ ++ rtl8125_write_hw_phy_mcu_code_ver(dev); ++ ++ rtl8125_mdio_write(tp,0x1F, 0x0000); ++ ++ tp->HwHasWrRamCodeToMicroP = TRUE; ++} ++#endif ++ ++static void ++rtl8125_enable_phy_aldps(struct rtl8125_private *tp) ++{ ++ //enable aldps ++ //GPHY OCP 0xA430 bit[2] = 0x1 (en_aldps) ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA430, BIT_2); ++} ++ ++static void ++rtl8125_tgphy_irq_mask_and_ack(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA4D2, 0x0000); ++ (void)rtl8125_mdio_direct_read_phy_ocp(tp, 
0xA4D4); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++rtl8125_hw_phy_config_8125a_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD40, ++ 0x03FF, ++ 0x84); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD4E, BIT_4); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD16, ++ 0x03FF, ++ 0x0006); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD32, ++ 0x003F, ++ 0x0006); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC08, BIT_12); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC08, BIT_8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC8A, ++ BIT_15|BIT_14|BIT_13|BIT_12, ++ BIT_14|BIT_13|BIT_12); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD18, BIT_10); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD1A, 0x3FF); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD1C, 0x3FF); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80EA); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xC400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80EB); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0x0700, ++ 0x0300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80F8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1C00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80F1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x3000); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80FE); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xA500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8102); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8105); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x3300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8100); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x7000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8104); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xF000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8106); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x6500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DC); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xED00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DF); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80E1); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF06, ++ 0x003F, ++ 0x38); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x819F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xD0B6); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBC34, 0x5555); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF0A, ++ BIT_11|BIT_10|BIT_9, ++ BIT_11|BIT_9); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5C0, BIT_10); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ //enable aldps ++ //GPHY OCP 0xA430 bit[2] = 0x1 (en_aldps) ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125a_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD4E, BIT_4); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD16, ++ 0x03FF, ++ 0x03FF); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD32, ++ 0x003F, ++ 0x0006); ++ 
rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC08, BIT_12); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC08, BIT_8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xACC0, ++ BIT_1|BIT_0, ++ BIT_1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD40, ++ BIT_7|BIT_6|BIT_5, ++ BIT_6); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD40, ++ BIT_2|BIT_1|BIT_0, ++ BIT_2); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC14, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC80, BIT_9|BIT_8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC5E, ++ BIT_2|BIT_1|BIT_0, ++ BIT_1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAD4C, 0x00A8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC5C, 0x01FF); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC8A, ++ BIT_7|BIT_6|BIT_5|BIT_4, ++ BIT_5|BIT_4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8157); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8159); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0700); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80A2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0153); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x809C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0153); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81B3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0043); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00A7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00D6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00EC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00F6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00FB); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00FD); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00FF); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00BB); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0058); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0029); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0013); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0009); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0004); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 
0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8257); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x020F); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80EA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7843); ++ ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB896, BIT_0); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB892, 0xFF00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC091); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6E12); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC092); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1214); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC094); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1516); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC096); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x171B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC098); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1B1C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC09A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1F1F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC09C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2021); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC09E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2224); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC0A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2424); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC0A2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2424); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC0A4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2424); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC018); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0AF2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC01A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0D4A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC01C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0F26); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC01E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x118D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x14F3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC022); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x175A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x19C0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC026); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1C26); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC089); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6050); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC08A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x5F6E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC08C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6E6E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC08E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6E6E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC090); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6E12); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB896, BIT_0); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xD068, BIT_13); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81A2); ++ 
rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB54C, ++ 0xFF00, ++ 0xDB00); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA454, BIT_0); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA5D4, BIT_5); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAD4E, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA86A, BIT_0); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ if (tp->RequirePhyMdiSwapPatch) { ++ u16 adccal_offset_p0; ++ u16 adccal_offset_p1; ++ u16 adccal_offset_p2; ++ u16 adccal_offset_p3; ++ u16 rg_lpf_cap_xg_p0; ++ u16 rg_lpf_cap_xg_p1; ++ u16 rg_lpf_cap_xg_p2; ++ u16 rg_lpf_cap_xg_p3; ++ u16 rg_lpf_cap_p0; ++ u16 rg_lpf_cap_p1; ++ u16 rg_lpf_cap_p2; ++ u16 rg_lpf_cap_p3; ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0007, ++ 0x0001); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0000); ++ adccal_offset_p0 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xD06A); ++ adccal_offset_p0 &= 0x07FF; ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0008); ++ adccal_offset_p1 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xD06A); ++ adccal_offset_p1 &= 0x07FF; ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0010); ++ adccal_offset_p2 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xD06A); ++ adccal_offset_p2 &= 0x07FF; ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0018); ++ adccal_offset_p3 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xD06A); ++ adccal_offset_p3 &= 0x07FF; ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD06A, ++ 0x07FF, ++ adccal_offset_p3); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0008); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD06A, ++ 0x07FF, ++ adccal_offset_p2); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0010); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD06A, ++ 0x07FF, ++ adccal_offset_p1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0018); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD06A, ++ 0x07FF, ++ adccal_offset_p0); ++ ++ ++ rg_lpf_cap_xg_p0 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD5A); ++ rg_lpf_cap_xg_p0 &= 0x001F; ++ rg_lpf_cap_xg_p1 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD5A); ++ rg_lpf_cap_xg_p1 &= 0x1F00; ++ rg_lpf_cap_xg_p2 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD5C); ++ rg_lpf_cap_xg_p2 &= 0x001F; ++ rg_lpf_cap_xg_p3 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD5C); ++ rg_lpf_cap_xg_p3 &= 0x1F00; ++ rg_lpf_cap_p0 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBC18); ++ rg_lpf_cap_p0 &= 0x001F; ++ rg_lpf_cap_p1 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBC18); ++ rg_lpf_cap_p1 &= 0x1F00; ++ rg_lpf_cap_p2 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBC1A); ++ rg_lpf_cap_p2 &= 0x001F; ++ rg_lpf_cap_p3 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBC1A); ++ rg_lpf_cap_p3 &= 0x1F00; ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD5A, ++ 0x001F, ++ rg_lpf_cap_xg_p3 >> 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD5A, ++ 0x1F00, ++ rg_lpf_cap_xg_p2 << 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD5C, ++ 0x001F, ++ rg_lpf_cap_xg_p1 >> 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD5C, ++ 0x1F00, ++ rg_lpf_cap_xg_p0 << 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC18, ++ 0x001F, ++ rg_lpf_cap_p3 >> 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC18, ++ 0x1F00, ++ rg_lpf_cap_p2 << 8); ++ 
rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC1A, ++ 0x001F, ++ rg_lpf_cap_p1 >> 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC1A, ++ 0x1F00, ++ rg_lpf_cap_p0 << 8); ++ } ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA424, BIT_3); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125b_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC08, (BIT_3 | BIT_2)); ++ ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FFF); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x0400); ++ } ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8560); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x19CC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8562); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x19CC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8564); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x19CC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8566); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x147D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8568); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x147D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x856A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x147D); ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FFE); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0907); ++ } ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xACDA, ++ 0xFF00, ++ 0xFF00); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xACDE, ++ 0xF000, ++ 0xF000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80D6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x2801); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80F2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x2801); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80F4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x6077); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB506, 0x01E7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC8C, 0x0FFC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC46, 0xB7B4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC50, 0x0FBC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC3C, 0x9240); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC4E, 0x0DB4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xACC6, 0x0707); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xACC8, 0xA0D3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAD08, 0x0007); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8013); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FB9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x2801); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FBA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FBC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x1900); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FBE); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xE100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xE500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC4); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0F00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xF100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FCa); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xF300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FCc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFD00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FCe); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFF00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFB00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xF400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFF00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xF600); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x813D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x390E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x814F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x790E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80B0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0F31); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBF4C, BIT_1); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBCCA, (BIT_9 | BIT_8)); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8141); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x320E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8153); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x720E); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA432, BIT_6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8529); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x050E); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x816C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xC4A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8170); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xC4A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8174); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x04A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8178); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x04A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x817C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0719); ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0404); ++ } ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBF4A, 0x001B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8033); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8037); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x803B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFC32); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x803F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8043); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8047); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8145); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x370E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8157); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x770E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8169); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0D0A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x817B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x1D0A); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8217); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x821A); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5000); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0403); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DC); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80B3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0384); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80B7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2007); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80BA); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x6C00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80B5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xF009); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80BD); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x9F00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80C7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf083); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DD); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03f0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DF); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80CB); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2007); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80CE); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x6C00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80C9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8009); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80D1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x8000); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x200A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xF0AD); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x809F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6073); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x000B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A9); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xC000); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB896, BIT_0); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB892, 0xFF00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC23E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0000); 
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC240); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0103); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC242); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0507); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC244); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x090B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC246); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0C0E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC248); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC24A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1416); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB896, BIT_0); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA86A, BIT_0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA6F0, BIT_0); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA0, 0xD70D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA2, 0x4100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA4, 0xE868); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA6, 0xDC59); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB54C, 0x3C18); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBFA4, BIT_5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x817D); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_12); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125b_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC46, ++ 0x00F0, ++ 0x0090); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD30, ++ 0x0003, ++ 0x0001); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80F5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x760E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8107); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x360E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8551); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ BIT_15 | BIT_14 | BIT_13 | BIT_12 | BIT_11 | BIT_10 | BIT_9 | BIT_8, ++ BIT_11); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xbf00, ++ 0xE000, ++ 0xA000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xbf46, ++ 0x0F00, ++ 0x0300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8044); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x804A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8050); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8056); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x805C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8062); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8068); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x806E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8074); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x807A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 
0x2417); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA4CA, BIT_6); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF84, ++ BIT_15 | BIT_14 | BIT_13, ++ BIT_15 | BIT_13); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8170); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ BIT_13 | BIT_10 | BIT_9 | BIT_8, ++ BIT_15 | BIT_14 | BIT_12 | BIT_11); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8015); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB87E, BIT_8); ++ rtl8125_mdio_direct_read_phy_ocp(tp, 0xB906); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA424, BIT_3); ++ ++ /* ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA0, 0xD70D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA2, 0x4100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA4, 0xE868); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA6, 0xDC59); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB54C, 0x3C18); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBFA4, BIT_5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x817D); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_12); ++ */ ++ ++ ++#ifdef ENABLE_LIB_SUPPORT ++ /* disable phy speed down */ ++ ClearEthPhyOcpBit(tp, 0xA442, BIT_3 | BIT_2); ++#endif /* ENABLE_LIB_SUPPORT */ ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125bp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA80C, ++ BIT_14, ++ BIT_15 | BIT_11 | BIT_10); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8010); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_11); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8088); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x9000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x808F); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x9000); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8174); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ BIT_13, ++ BIT_12 | BIT_11); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125bp_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8010); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_11); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8088); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x9000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x808F); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x9000); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8174); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ BIT_13, ++ BIT_12 | BIT_11); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125cp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_tgphy_irq_mask_and_ack(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xad0e, ++ 0x007F, ++ 0x000B); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xad78, BIT_4); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81B8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00B4); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81BA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00E4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81C5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0104); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81D0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x054D); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125d_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBF96, BIT_15); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF94, ++ 0x0007, ++ 0x0005); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF8E, ++ 0x3C00, ++ 0x2800); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x4000); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x4000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC80, ++ 0x001F, ++ 0x0004); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_15 | BIT_14 | BIT_13); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_12 | BIT_11 | BIT_10); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC80, ++ 0x001F, ++ 0x0005); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC82, ++ 0x00E0, ++ 0x0040); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_4 | BIT_3 | BIT_2); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x8000); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBD70, BIT_8); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA466, BIT_1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x836a); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, 0xFF00); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x832C); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0500); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB106, ++ 0x0700, ++ 0x0100); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB206, ++ 0x0700, ++ 0x0200); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB306, ++ 0x0700, ++ 0x0300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80CB); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBCF4, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBCF6, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBC12, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x844d); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0200); ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8feb); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8fe9); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0600); ++ } ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC7E, ++ 0x01FC, ++ 0x00B4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8105); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x7A00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8117); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x3A00); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8103); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x7400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8115); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x3400); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAD40, BIT_5 | BIT_4); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD66, ++ 0x000F, ++ 0x0007); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD68, ++ 0xF000, ++ 0x8000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD68, ++ 0x0F00, ++ 0x0500); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD68, ++ 0x000F, ++ 0x0002); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD6A, ++ 0xF000, ++ 0x7000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC50, 0x01E8); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81FA); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5400); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA864, ++ 0x00F0, ++ 0x00C0); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA42C, ++ 0x00FF, ++ 0x0002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80E1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x0F00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DE); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xF000, ++ 0x0700); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA846, BIT_7); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80BA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8A04); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80BD); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xCA00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80B7); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xB300); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80CE); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8A04); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80D1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xCA00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80CB); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xBB00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4909); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x05B8); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8200); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5800); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7078); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5D78); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7862); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF7); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1400); ++ ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x814C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8455); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x814E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x84A6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8163); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x0600); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x816A); ++ 
rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x0500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8171); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1f00); ++ } ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC3A, ++ 0x000F, ++ 0x0006); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8064); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8067); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x806A); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x806D); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8070); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8073); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8076); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8079); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x807C); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x807F); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBFA0, ++ 0xFF70, ++ 0x5500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA2, 0x9D00); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8165); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0x0700, ++ 0x0200); ++ ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8019); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0005); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ED); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0502); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0B00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xD401); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x2900); ++ } ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8018); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1700); ++ ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x815B); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1700); ++ } ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4E0, BIT_15); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5D4, BIT_5); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA654, BIT_11); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA430, BIT_12 | BIT_0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_7); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125d_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x4000); ++ 
rtl8125_set_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x4000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC80, ++ 0x001F, ++ 0x0004); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_15 | BIT_14 | BIT_13); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_12 | BIT_11 | BIT_10); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC80, ++ 0x001F, ++ 0x0005); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC82, ++ 0x00E0, ++ 0x0040); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_4 | BIT_3 | BIT_2); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x8000); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC7E, ++ 0x01FC, ++ 0x00B4); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8105); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x7A00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8117); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x3A00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8103); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x7400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8115); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x3400); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FEB); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FEA); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0700); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80D6); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0xEF00); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5D4, BIT_5); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA654, BIT_11); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA448, BIT_10); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA586, BIT_10); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA430, BIT_12 | BIT_0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_7); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++ if (tp->resume_not_chg_speed) ++ return; ++ ++ tp->phy_reset_enable(dev); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++#ifndef ENABLE_USE_FIRMWARE_FILE ++ if (!tp->rtl_fw) { ++ rtl8125_set_hw_phy_before_init_phy_mcu(dev); ++ ++ rtl8125_init_hw_phy_mcu(dev); ++ } ++#endif ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ rtl8125_hw_phy_config_8125a_1(dev); ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_hw_phy_config_8125a_2(dev); ++ break; ++ case CFG_METHOD_4: ++ rtl8125_hw_phy_config_8125b_1(dev); ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_hw_phy_config_8125b_2(dev); ++ break; ++ case CFG_METHOD_8: ++ rtl8125_hw_phy_config_8125bp_1(dev); ++ break; ++ case CFG_METHOD_9: ++ rtl8125_hw_phy_config_8125bp_2(dev); ++ break; ++ case CFG_METHOD_10: ++ rtl8125_hw_phy_config_8125d_1(dev); ++ break; ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ rtl8125_hw_phy_config_8125d_2(dev); ++ break; ++ case CFG_METHOD_12: ++ rtl8125_hw_phy_config_8125cp_1(dev); ++ break; ++ } ++ ++ //legacy force mode(Chap 22) ++ 
rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5B4, BIT_15); ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ rtl8125_hw_fiber_phy_config(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ /*ocp phy power saving*/ ++ /* ++ if (aspm) { ++ if (tp->mcfg == CFG_METHOD_2 || tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) ++ rtl8125_enable_ocp_phy_power_saving(dev); ++ } ++ */ ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ if (tp->eee.eee_enabled) ++ rtl8125_enable_eee(tp); ++ else ++ rtl8125_disable_eee(tp); ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++} ++ ++static void ++rtl8125_up(struct net_device *dev) ++{ ++ rtl8125_hw_init(dev); ++ rtl8125_hw_reset(dev); ++ rtl8125_powerup_pll(dev); ++ rtl8125_hw_ephy_config(dev); ++ rtl8125_hw_phy_config(dev); ++ rtl8125_hw_config(dev); ++} ++ ++/* ++static inline void rtl8125_delete_esd_timer(struct net_device *dev, struct timer_list *timer) ++{ ++ del_timer_sync(timer); ++} ++ ++static inline void rtl8125_request_esd_timer(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct timer_list *timer = &tp->esd_timer; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++ setup_timer(timer, rtl8125_esd_timer, (unsigned long)dev); ++#else ++ timer_setup(timer, rtl8125_esd_timer, 0); ++#endif ++ mod_timer(timer, jiffies + RTL8125_ESD_TIMEOUT); ++} ++*/ ++ ++/* ++static inline void rtl8125_delete_link_timer(struct net_device *dev, struct timer_list *timer) ++{ ++ del_timer_sync(timer); ++} ++ ++static inline void rtl8125_request_link_timer(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct timer_list *timer = &tp->link_timer; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++ setup_timer(timer, rtl8125_link_timer, (unsigned long)dev); ++#else ++ timer_setup(timer, rtl8125_link_timer, 0); ++#endif ++ mod_timer(timer, jiffies + RTL8125_LINK_TIMEOUT); ++} ++*/ ++ ++#ifdef CONFIG_NET_POLL_CONTROLLER ++/* ++ * Polling 'interrupt' - used by things like netconsole to send skbs ++ * without having to re-enable interrupts. It's not called while ++ * the interrupt routine is executing. 
++ */ ++static void ++rtl8125_netpoll(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ for (i = 0; i < tp->irq_nvecs; i++) { ++ struct r8125_irq *irq = &tp->irq_tbl[i]; ++ struct r8125_napi *r8125napi = &tp->r8125napi[i]; ++ ++ disable_irq(irq->vector); ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) ++ irq->handler(irq->vector, r8125napi); ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++ irq->handler(irq->vector, r8125napi, NULL); ++#else ++ irq->handler(irq->vector, r8125napi); ++#endif ++ ++ enable_irq(irq->vector); ++ } ++} ++#endif //CONFIG_NET_POLL_CONTROLLER ++ ++static void ++rtl8125_setup_interrupt_mask(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ if (tp->HwCurrIsrVer == 7) { ++ tp->intr_mask = ISRIMR_V7_LINKCHG | ISRIMR_V7_TOK_Q0; ++ if (tp->num_tx_rings > 1) ++ tp->intr_mask |= ISRIMR_V7_TOK_Q1; ++ for (i = 0; i < tp->num_rx_rings; i++) ++ tp->intr_mask |= ISRIMR_V7_ROK_Q0 << i; ++ } else if (tp->HwCurrIsrVer == 5) { ++ tp->intr_mask = ISRIMR_V5_LINKCHG | ISRIMR_V5_TOK_Q0; ++ if (tp->num_tx_rings > 1) ++ tp->intr_mask |= ISRIMR_V5_TOK_Q1; ++ for (i = 0; i < tp->num_rx_rings; i++) ++ tp->intr_mask |= ISRIMR_V5_ROK_Q0 << i; ++ } else if (tp->HwCurrIsrVer == 4) { ++ tp->intr_mask = ISRIMR_V4_LINKCHG; ++ for (i = 0; i < max(tp->num_tx_rings, tp->num_rx_rings); i++) ++ tp->intr_mask |= ISRIMR_V4_ROK_Q0 << i; ++ ++ if (tp->DASH) ++ tp->intr_l2_mask |= ISRIMR_V4_L2_IPC2; ++ ++ if (tp->intr_l2_mask > 0) ++ tp->intr_mask |= ISRIMR_V4_LAYER2_INTR_STS; ++ } else if (tp->HwCurrIsrVer == 3) { ++ tp->intr_mask = ISRIMR_V2_LINKCHG; ++ for (i = 0; i < max(tp->num_tx_rings, tp->num_rx_rings); i++) ++ tp->intr_mask |= ISRIMR_V2_ROK_Q0 << i; ++ } else if (tp->HwCurrIsrVer == 2) { ++ tp->intr_mask = ISRIMR_V2_LINKCHG | ISRIMR_TOK_Q0; ++ if (tp->num_tx_rings > 1) ++ tp->intr_mask |= ISRIMR_TOK_Q1; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) ++ tp->intr_mask |= ISRIMR_V2_ROK_Q0 << i; ++ } else { ++ tp->intr_mask = LinkChg | RxDescUnavail | TxOK | RxOK | SWInt; ++ tp->timer_intr_mask = LinkChg | PCSTimeout; ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if (tp->DASH) { ++ if (HW_DASH_SUPPORT_IPC2(tp)) { ++ tp->timer_intr_mask |= ISRIMR_DASH_INTR_EN; ++ tp->intr_mask |= ISRIMR_DASH_INTR_EN; ++ } ++ } ++#endif ++ } ++} ++ ++static void ++rtl8125_setup_mqs_reg(struct rtl8125_private *tp) ++{ ++ u16 hw_clo_ptr0_reg, sw_tail_ptr0_reg; ++ u16 reg_len; ++ int i; ++ ++ //tx ++ tp->tx_ring[0].tdsar_reg = TxDescStartAddrLow; ++ for (i = 1; i < tp->HwSuppNumTxQueues; i++) { ++ tp->tx_ring[i].tdsar_reg = (u16)(TNPDS_Q1_LOW_8125 + (i - 1) * 8); ++ } ++ ++ switch (tp->HwSuppTxNoCloseVer) { ++ case 4: ++ case 5: ++ hw_clo_ptr0_reg = HW_CLO_PTR0_8126; ++ sw_tail_ptr0_reg = SW_TAIL_PTR0_8126; ++ reg_len = 4; ++ break; ++ case 6: ++ hw_clo_ptr0_reg = HW_CLO_PTR0_8125BP; ++ sw_tail_ptr0_reg = SW_TAIL_PTR0_8125BP; ++ reg_len = 8; ++ break; ++ default: ++ hw_clo_ptr0_reg = HW_CLO_PTR0_8125; ++ sw_tail_ptr0_reg = SW_TAIL_PTR0_8125; ++ reg_len = 4; ++ break; ++ } ++ ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ tp->tx_ring[i].hw_clo_ptr_reg = (u16)(hw_clo_ptr0_reg + i * reg_len); ++ tp->tx_ring[i].sw_tail_ptr_reg = (u16)(sw_tail_ptr0_reg + i * reg_len); ++ } ++ ++ //rx ++ tp->rx_ring[0].rdsar_reg = RxDescAddrLow; ++ for (i = 1; i < tp->HwSuppNumRxQueues; i++) ++ tp->rx_ring[i].rdsar_reg = (u16)(RDSAR_Q1_LOW_8125 + (i - 1) * 8); ++ ++ tp->isr_reg[0] = ISR0_8125; ++ for (i = 1; i < tp->hw_supp_irq_nvecs; i++) ++ tp->isr_reg[i] = (u16)(ISR1_8125 + (i - 1) * 4); ++ ++ 
tp->imr_reg[0] = IMR0_8125; ++ for (i = 1; i < tp->hw_supp_irq_nvecs; i++) ++ tp->imr_reg[i] = (u16)(IMR1_8125 + (i - 1) * 4); ++} ++ ++static void ++rtl8125_backup_led_select(struct rtl8125_private *tp) ++{ ++ tp->BackupLedSel[1] = RTL_R16(tp, LEDSEL_1_8125); ++ tp->BackupLedSel[2] = RTL_R16(tp, LEDSEL_2_8125); ++ tp->BackupLedSel[3] = RTL_R16(tp, LEDSEL_3_8125); ++ tp->BackupLedSel[0] = RTL_R16(tp, CustomLED); ++} ++ ++static void ++rtl8125_restore_led_select(struct rtl8125_private *tp) ++{ ++ RTL_W16(tp, LEDSEL_1_8125, tp->BackupLedSel[1]); ++ RTL_W16(tp, LEDSEL_2_8125, tp->BackupLedSel[2]); ++ RTL_W16(tp, LEDSEL_3_8125, tp->BackupLedSel[3]); ++ RTL_W16(tp, CustomLED, tp->BackupLedSel[0]); ++} ++ ++static bool ++_rtl8125_backup_phy_fuse_dout_v4(struct rtl8125_private *tp) ++{ ++ u16 i; ++ ++ for (i = 0; i < R8125_PHY_FUSE_DOUT_NUM; i++) { ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA460, ++ 0x001F, ++ i); ++ tp->BackupPhyFuseDout[i] = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA462); ++ } ++ ++ if (tp->HwSuppEsdVer == 4) { ++ tp->BackupPhyFuseDout[3] |= 0xF000; ++ tp->BackupPhyFuseDout[7] |= 0x03FF; ++ tp->BackupPhyFuseDout[4] = USHRT_MAX; ++ tp->BackupPhyFuseDout[5] = USHRT_MAX; ++ tp->BackupPhyFuseDout[6] = USHRT_MAX; ++ } else if (tp->HwSuppEsdVer == 5) { ++ tp->BackupPhyFuseDout[30] = USHRT_MAX; ++ tp->BackupPhyFuseDout[31] = USHRT_MAX; ++ } ++ ++ return TRUE; ++} ++ ++static bool ++rtl8125_backup_phy_fuse_dout(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppEsdVer == 4 || tp->HwSuppEsdVer == 5) ++ return _rtl8125_backup_phy_fuse_dout_v4(tp); ++ else ++ return FALSE; ++} ++ ++static void ++_rtl8125_restore_phy_fuse_dout_v4(struct rtl8125_private *tp) ++{ ++ u16 i; ++ ++ for (i = 0; i < R8125_PHY_FUSE_DOUT_NUM; i++) { ++ if (tp->BackupPhyFuseDout[i] == USHRT_MAX) ++ continue; ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA460, ++ 0x001F, ++ i); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA462, tp->BackupPhyFuseDout[i]); ++ } ++} ++ ++static void ++rtl8125_restore_phy_fuse_dout(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppEsdVer == 4 || tp->HwSuppEsdVer == 5) ++ _rtl8125_restore_phy_fuse_dout_v4(tp); ++ else ++ return; ++} ++ ++static void ++rtl8125_init_software_variable(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct pci_dev *pdev = tp->pci_dev; ++ ++#ifdef ENABLE_LIB_SUPPORT ++ tp->ring_lib_enabled = 1; ++#endif ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: { ++ u8 tmp = (u8)rtl8125_mac_ocp_read(tp, 0xD006); ++ if (tmp == 0x02 || tmp == 0x04) ++ tp->HwSuppDashVer = 2; ++ } ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ tp->HwSuppDashVer = 4; ++ break; ++ default: ++ tp->HwSuppDashVer = 0; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ tp->HwSuppOcpChannelVer = 2; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ tp->HwSuppOcpChannelVer = 2; ++ break; ++ } ++ tp->AllowAccessDashOcp = rtl8125_is_allow_access_dash_ocp(tp); ++ ++ tp->HwPkgDet = rtl8125_mac_ocp_read(tp, 0xDC00); ++ tp->HwPkgDet = (tp->HwPkgDet >> 3) & 0x07; ++ ++ tp->HwSuppNowIsOobVer = 1; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ tp->HwPcieSNOffset = 0x16C; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ 
tp->HwPcieSNOffset = 0x168; ++ break; ++ } ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ rtl8125_get_realwow_hw_version(dev); ++#endif //ENABLE_REALWOW_SUPPORT ++ ++ tp->DASH = rtl8125_check_dash(tp); ++ ++ if (tp->DASH) { ++ eee_enable = 0; ++ ++ tp->SizeOfSendToFwBuffer = SEND_TO_FW_BUF_SIZE; ++ tp->SizeOfRecvFromFwBuffer = RECV_FROM_FW_BUF_SIZE; ++ ++ tp->DashFirmwareVersion = rtl8125_get_dash_fw_ver(tp); ++ } ++ ++ if (aspm) { ++ tp->org_pci_offset_99 = rtl8125_csi_fun0_read_byte(tp, 0x99); ++ tp->org_pci_offset_99 &= ~(BIT_5|BIT_6); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ tp->org_pci_offset_180 = rtl8125_csi_fun0_read_byte(tp, 0x264); ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->org_pci_offset_180 = rtl8125_csi_fun0_read_byte(tp, 0x214); ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->org_pci_offset_180 = rtl8125_csi_fun0_read_byte(tp, 0x210); ++ break; ++ case CFG_METHOD_12: ++ tp->org_pci_offset_180 = rtl8125_csi_fun0_read_byte(tp, 0x184); ++ break; ++ } ++ } ++ ++ pci_read_config_byte(pdev, 0x80, &tp->org_pci_offset_80); ++ pci_read_config_byte(pdev, 0x81, &tp->org_pci_offset_81); ++ ++ tp->use_timer_interrupt = TRUE; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ tp->HwSuppMaxPhyLinkSpeed = 2500; ++ break; ++ default: ++ tp->HwSuppMaxPhyLinkSpeed = 1000; ++ break; ++ } ++ ++ if (timer_count == 0 || tp->mcfg == CFG_METHOD_DEFAULT) ++ tp->use_timer_interrupt = FALSE; ++ ++ tp->ShortPacketSwChecksum = TRUE; ++ tp->UseSwPaddingShortPkt = TRUE; ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ rtl8125_check_fiber_mode_support(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->HwSuppMagicPktVer = WAKEUP_MAGIC_PACKET_V3; ++ break; ++ default: ++ tp->HwSuppMagicPktVer = WAKEUP_MAGIC_PACKET_NOT_SUPPORT; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->HwSuppEsdVer = 4; ++ break; ++ case CFG_METHOD_10: ++ tp->HwSuppEsdVer = 5; ++ break; ++ default: ++ tp->HwSuppEsdVer = 1; ++ break; ++ } ++ ++ if (rtl8125_backup_phy_fuse_dout(tp)) ++ tp->TestPhyOcpReg = TRUE; ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++ tp->TestPhyOcpReg = FALSE; ++#endif ++ ++ tp->HwSuppLinkChgWakeUpVer = 3; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ tp->HwSuppD0SpeedUpVer = 1; ++ break; ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppD0SpeedUpVer = 2; ++ break; ++ } ++ ++ tp->HwSuppCheckPhyDisableModeVer = 3; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ tp->HwSuppTxNoCloseVer = 3; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppTxNoCloseVer = 6; ++ break; ++ } ++ ++ switch (tp->HwSuppTxNoCloseVer) { ++ case 5: ++ case 
6: ++ tp->MaxTxDescPtrMask = MAX_TX_NO_CLOSE_DESC_PTR_MASK_V4; ++ break; ++ case 4: ++ tp->MaxTxDescPtrMask = MAX_TX_NO_CLOSE_DESC_PTR_MASK_V3; ++ break; ++ case 3: ++ tp->MaxTxDescPtrMask = MAX_TX_NO_CLOSE_DESC_PTR_MASK_V2; ++ break; ++ default: ++ tx_no_close_enable = 0; ++ break; ++ } ++ ++ if (tp->HwSuppTxNoCloseVer > 0 && tx_no_close_enable == 1) ++ tp->EnableTxNoClose = TRUE; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ tp->RequireLSOPatch = TRUE; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_2; ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_3; ++ break; ++ case CFG_METHOD_4: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_4; ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_5; ++ break; ++ case CFG_METHOD_8: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_8; ++ break; ++ case CFG_METHOD_9: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_9; ++ break; ++ case CFG_METHOD_10: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_10; ++ break; ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_11; ++ break; ++ case CFG_METHOD_12: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_12; ++ break; ++ } ++ ++ if (tp->HwIcVerUnknown) { ++ tp->NotWrRamCodeToMicroP = TRUE; ++ tp->NotWrMcuPatchCode = TRUE; ++ } ++ ++ rtl8125_check_hw_phy_mcu_code_ver(dev); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ if ((rtl8125_mac_ocp_read(tp, 0xD442) & BIT_5) && ++ (rtl8125_mdio_direct_read_phy_ocp(tp, 0xD068) & BIT_1)) ++ tp->RequirePhyMdiSwapPatch = TRUE; ++ break; ++ } ++ ++ tp->HwSuppMacMcuVer = 2; ++ ++ tp->MacMcuPageSize = RTL8125_MAC_MCU_PAGE_SIZE; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppNumTxQueues = 2; ++ tp->HwSuppNumRxQueues = 4; ++ break; ++ default: ++ tp->HwSuppNumTxQueues = 1; ++ tp->HwSuppNumRxQueues = 1; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ /* mac ptp */ ++ tp->HwSuppPtpVer = 1; ++ break; ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ /* phy ptp */ ++ tp->HwSuppPtpVer = 3; ++ break; ++ } ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->HwSuppPtpVer > 0) ++ tp->EnablePtp = 1; ++#endif ++ ++ //init interrupt ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->HwSuppIsrVer = 2; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ tp->HwSuppIsrVer = 4; ++ break; ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->HwSuppIsrVer = 5; ++ break; ++ case CFG_METHOD_12: ++ tp->HwSuppIsrVer = 7; ++ break; ++ default: ++ tp->HwSuppIsrVer = 1; ++ break; ++ } ++ ++ tp->HwCurrIsrVer = tp->HwSuppIsrVer; ++ if (tp->HwCurrIsrVer > 1) { ++ if (!(tp->features & RTL_FEATURE_MSIX) || ++ tp->irq_nvecs < tp->min_irq_nvecs) ++ tp->HwCurrIsrVer = 1; ++ } ++ ++ tp->num_tx_rings = 1; ++#ifdef ENABLE_MULTIPLE_TX_QUEUE ++#ifndef ENABLE_LIB_SUPPORT ++ tp->num_tx_rings = tp->HwSuppNumTxQueues; ++#endif ++#endif ++ if (tp->HwCurrIsrVer < 2 || ++ (tp->HwCurrIsrVer == 2 && tp->irq_nvecs < 19)) ++ 
tp->num_tx_rings = 1; ++ ++ //RSS ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppRssVer = 5; ++ tp->HwSuppIndirTblEntries = 128; ++ break; ++ } ++ ++ tp->num_rx_rings = 1; ++#ifdef ENABLE_RSS_SUPPORT ++#ifdef ENABLE_LIB_SUPPORT ++ if (tp->HwSuppRssVer > 0) ++ tp->EnableRss = 1; ++#else ++ if (tp->HwSuppRssVer > 0 && tp->HwCurrIsrVer > 1) { ++ u8 rss_queue_num = netif_get_num_default_rss_queues(); ++ tp->num_rx_rings = (tp->HwSuppNumRxQueues > rss_queue_num)? ++ rss_queue_num : tp->HwSuppNumRxQueues; ++ ++ if (!(tp->num_rx_rings >= 2 && tp->irq_nvecs >= tp->num_rx_rings)) ++ tp->num_rx_rings = 1; ++ ++ if (tp->num_rx_rings >= 2) ++ tp->EnableRss = 1; ++ } ++#endif ++#endif ++ ++ //interrupt mask ++ rtl8125_setup_interrupt_mask(tp); ++ ++ rtl8125_setup_mqs_reg(tp); ++ ++ rtl8125_set_ring_size(tp, NUM_RX_DESC, NUM_TX_DESC); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ tp->HwSuppIntMitiVer = 3; ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->HwSuppIntMitiVer = 4; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppIntMitiVer = 6; ++ break; ++ } ++ ++ tp->HwSuppTcamVer = 1; ++ tp->TcamNotValidReg = TCAM_NOTVALID_ADDR; ++ tp->TcamValidReg = TCAM_VALID_ADDR; ++ tp->TcamMaAddrcOffset = TCAM_MAC_ADDR; ++ tp->TcamVlanTagOffset = TCAM_VLAN_TAG; ++ ++ tp->HwSuppExtendTallyCounterVer = 1; ++ ++ timer_count_v2 = (timer_count / 0x100); ++ /* timer unit is double */ ++ switch (tp->mcfg) { ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ timer_count_v2 /= 2; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->RequiredPfmPatch = TRUE; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ tp->HwSuppRxDescType = RX_DESC_RING_TYPE_3; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppRxDescType = RX_DESC_RING_TYPE_4; ++ break; ++ default: ++ tp->HwSuppRxDescType = RX_DESC_RING_TYPE_1; ++ break; ++ } ++ ++ tp->InitRxDescType = RX_DESC_RING_TYPE_1; ++ tp->RxDescLength = RX_DESC_LEN_TYPE_1; ++ switch (tp->HwSuppRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ if (tp->EnableRss || tp->EnablePtp) { ++ tp->InitRxDescType = RX_DESC_RING_TYPE_3; ++ tp->RxDescLength = RX_DESC_LEN_TYPE_3; ++ } ++ break; ++ case RX_DESC_RING_TYPE_4: ++ if (tp->EnableRss) { ++ tp->InitRxDescType = RX_DESC_RING_TYPE_4; ++ tp->RxDescLength = RX_DESC_LEN_TYPE_4; ++ } ++ break; ++ } ++ ++ tp->rtl8125_rx_config = rtl_chip_info[tp->chipset].RCR_Cfg; ++ if (tp->InitRxDescType == RX_DESC_RING_TYPE_3) ++ tp->rtl8125_rx_config |= EnableRxDescV3; ++ else if (tp->InitRxDescType == RX_DESC_RING_TYPE_4) ++ tp->rtl8125_rx_config &= ~EnableRxDescV4_1; ++ ++ rtl8125_backup_led_select(tp); ++ ++ tp->wol_opts = rtl8125_get_hw_wol(tp); ++ tp->wol_enabled = (tp->wol_opts) ? 
WOL_ENABLED : WOL_DISABLED; ++ ++ rtl8125_set_link_option(tp, autoneg_mode, speed_mode, duplex_mode, ++ rtl8125_fc_full); ++ ++ tp->max_jumbo_frame_size = rtl_chip_info[tp->chipset].jumbo_frame_sz; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ /* MTU range: 60 - hw-specific max */ ++ dev->min_mtu = ETH_MIN_MTU; ++ dev->max_mtu = tp->max_jumbo_frame_size; ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ ++ if (tp->mcfg != CFG_METHOD_DEFAULT) { ++ struct ethtool_keee *eee = &tp->eee; ++ ++ eee->eee_enabled = eee_enable; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,9,0) ++ eee->supported = SUPPORTED_100baseT_Full | ++ SUPPORTED_1000baseT_Full; ++ eee->advertised = mmd_eee_adv_to_ethtool_adv_t(MDIO_EEE_1000T | MDIO_EEE_100TX); ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ /* nothing to do */ ++ break; ++ default: ++ if (HW_SUPP_PHY_LINK_SPEED_2500M(tp)) { ++ eee->supported |= SUPPORTED_2500baseX_Full; ++ eee->advertised |= SUPPORTED_2500baseX_Full; ++ } ++ break; ++ } ++#else ++ linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, eee->supported); ++ linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, eee->supported); ++ linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, eee->advertised); ++ linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, eee->advertised); ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ /* nothing to do */ ++ break; ++ default: ++ if (HW_SUPP_PHY_LINK_SPEED_2500M(tp)) { ++ linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, eee->supported); ++ linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, eee->advertised); ++ } ++ break; ++ } ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,9,0) */ ++ eee->tx_lpi_enabled = eee_enable; ++ eee->tx_lpi_timer = dev->mtu + ETH_HLEN + 0x20; ++ } ++ ++ tp->ptp_master_mode = enable_ptp_master_mode; ++ ++#ifdef ENABLE_RSS_SUPPORT ++ if (tp->EnableRss) ++ rtl8125_init_rss(tp); ++#endif ++} ++ ++static void ++rtl8125_release_board(struct pci_dev *pdev, ++ struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ ++ rtl8125_rar_set(tp, tp->org_mac_addr); ++ tp->wol_enabled = WOL_DISABLED; ++ ++ if (!tp->DASH) ++ rtl8125_phy_power_down(dev); ++ ++ iounmap(ioaddr); ++ pci_release_regions(pdev); ++ pci_clear_mwi(pdev); ++ pci_disable_device(pdev); ++ free_netdev(dev); ++} ++ ++static void ++rtl8125_hw_address_set(struct net_device *dev, u8 mac_addr[MAC_ADDR_LEN]) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++ eth_hw_addr_set(dev, mac_addr); ++#else ++ memcpy(dev->dev_addr, mac_addr, MAC_ADDR_LEN); ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++} ++ ++static int ++rtl8125_get_mac_address(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ u8 mac_addr[MAC_ADDR_LEN]; ++ ++ for (i = 0; i < MAC_ADDR_LEN; i++) ++ mac_addr[i] = RTL_R8(tp, MAC0 + i); ++ ++ *(u32*)&mac_addr[0] = RTL_R32(tp, BACKUP_ADDR0_8125); ++ *(u16*)&mac_addr[4] = RTL_R16(tp, BACKUP_ADDR1_8125); ++ ++ if (!is_valid_ether_addr(mac_addr)) { ++ netif_err(tp, probe, dev, "Invalid ether addr %pM\n", ++ mac_addr); ++ eth_random_addr(mac_addr); ++ dev->addr_assign_type = NET_ADDR_RANDOM; ++ netif_info(tp, probe, dev, "Random ether addr %pM\n", ++ mac_addr); ++ tp->random_mac = 1; ++ } ++ ++ rtl8125_hw_address_set(dev, mac_addr); ++ rtl8125_rar_set(tp, mac_addr); ++ ++ /* keep the original MAC address */ ++ memcpy(tp->org_mac_addr, dev->dev_addr, MAC_ADDR_LEN); ++#if LINUX_VERSION_CODE > 
KERNEL_VERSION(2,6,13) ++ memcpy(dev->perm_addr, dev->dev_addr, MAC_ADDR_LEN); ++#endif ++ return 0; ++} ++ ++/** ++ * rtl8125_set_mac_address - Change the Ethernet Address of the NIC ++ * @dev: network interface device structure ++ * @p: pointer to an address structure ++ * ++ * Return 0 on success, negative on failure ++ **/ ++static int ++rtl8125_set_mac_address(struct net_device *dev, ++ void *p) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct sockaddr *addr = p; ++ ++ if (!is_valid_ether_addr(addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ rtl8125_hw_address_set(dev, addr->sa_data); ++ ++ rtl8125_rar_set(tp, dev->dev_addr); ++ ++ return 0; ++} ++ ++/****************************************************************************** ++ * rtl8125_rar_set - Puts an ethernet address into a receive address register. ++ * ++ * tp - The private data structure for driver ++ * addr - Address to put into receive address register ++ *****************************************************************************/ ++void ++rtl8125_rar_set(struct rtl8125_private *tp, ++ const u8 *addr) ++{ ++ uint32_t rar_low = 0; ++ uint32_t rar_high = 0; ++ ++ rar_low = ((uint32_t) addr[0] | ++ ((uint32_t) addr[1] << 8) | ++ ((uint32_t) addr[2] << 16) | ++ ((uint32_t) addr[3] << 24)); ++ ++ rar_high = ((uint32_t) addr[4] | ++ ((uint32_t) addr[5] << 8)); ++ ++ rtl8125_enable_cfg9346_write(tp); ++ RTL_W32(tp, MAC0, rar_low); ++ RTL_W32(tp, MAC4, rar_high); ++ ++ rtl8125_disable_cfg9346_write(tp); ++} ++ ++#ifdef ETHTOOL_OPS_COMPAT ++static int ethtool_get_settings(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_cmd cmd = { ETHTOOL_GSET }; ++ int err; ++ ++ if (!ethtool_ops->get_settings) ++ return -EOPNOTSUPP; ++ ++ err = ethtool_ops->get_settings(dev, &cmd); ++ if (err < 0) ++ return err; ++ ++ if (copy_to_user(useraddr, &cmd, sizeof(cmd))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_settings(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_cmd cmd; ++ ++ if (!ethtool_ops->set_settings) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&cmd, useraddr, sizeof(cmd))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_settings(dev, &cmd); ++} ++ ++static int ethtool_get_drvinfo(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_drvinfo info; ++ struct ethtool_ops *ops = ethtool_ops; ++ ++ if (!ops->get_drvinfo) ++ return -EOPNOTSUPP; ++ ++ memset(&info, 0, sizeof(info)); ++ info.cmd = ETHTOOL_GDRVINFO; ++ ops->get_drvinfo(dev, &info); ++ ++ if (ops->self_test_count) ++ info.testinfo_len = ops->self_test_count(dev); ++ if (ops->get_stats_count) ++ info.n_stats = ops->get_stats_count(dev); ++ if (ops->get_regs_len) ++ info.regdump_len = ops->get_regs_len(dev); ++ if (ops->get_eeprom_len) ++ info.eedump_len = ops->get_eeprom_len(dev); ++ ++ if (copy_to_user(useraddr, &info, sizeof(info))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_get_regs(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_regs regs; ++ struct ethtool_ops *ops = ethtool_ops; ++ void *regbuf; ++ int reglen, ret; ++ ++ if (!ops->get_regs || !ops->get_regs_len) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(®s, useraddr, sizeof(regs))) ++ return -EFAULT; ++ ++ reglen = ops->get_regs_len(dev); ++ if (regs.len > reglen) ++ regs.len = reglen; ++ ++ regbuf = kmalloc(reglen, GFP_USER); ++ if (!regbuf) ++ return -ENOMEM; ++ ++ ops->get_regs(dev, ®s, regbuf); ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, ®s, sizeof(regs))) ++ goto out; ++ useraddr += offsetof(struct 
ethtool_regs, data); ++ if (copy_to_user(useraddr, regbuf, reglen)) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(regbuf); ++ return ret; ++} ++ ++static int ethtool_get_wol(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; ++ ++ if (!ethtool_ops->get_wol) ++ return -EOPNOTSUPP; ++ ++ ethtool_ops->get_wol(dev, &wol); ++ ++ if (copy_to_user(useraddr, &wol, sizeof(wol))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_wol(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_wolinfo wol; ++ ++ if (!ethtool_ops->set_wol) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&wol, useraddr, sizeof(wol))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_wol(dev, &wol); ++} ++ ++static int ethtool_get_msglevel(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GMSGLVL }; ++ ++ if (!ethtool_ops->get_msglevel) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_msglevel(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_msglevel(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_msglevel) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ ethtool_ops->set_msglevel(dev, edata.data); ++ return 0; ++} ++ ++static int ethtool_nway_reset(struct net_device *dev) ++{ ++ if (!ethtool_ops->nway_reset) ++ return -EOPNOTSUPP; ++ ++ return ethtool_ops->nway_reset(dev); ++} ++ ++static int ethtool_get_link(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GLINK }; ++ ++ if (!ethtool_ops->get_link) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_link(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_get_eeprom(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_eeprom eeprom; ++ struct ethtool_ops *ops = ethtool_ops; ++ u8 *data; ++ int ret; ++ ++ if (!ops->get_eeprom || !ops->get_eeprom_len) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) ++ return -EFAULT; ++ ++ /* Check for wrap and zero */ ++ if (eeprom.offset + eeprom.len <= eeprom.offset) ++ return -EINVAL; ++ ++ /* Check for exceeding total eeprom len */ ++ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) ++ return -EINVAL; ++ ++ data = kmalloc(eeprom.len, GFP_USER); ++ if (!data) ++ return -ENOMEM; ++ ++ ret = -EFAULT; ++ if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) ++ goto out; ++ ++ ret = ops->get_eeprom(dev, &eeprom, data); ++ if (ret) ++ goto out; ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) ++ goto out; ++ if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_set_eeprom(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_eeprom eeprom; ++ struct ethtool_ops *ops = ethtool_ops; ++ u8 *data; ++ int ret; ++ ++ if (!ops->set_eeprom || !ops->get_eeprom_len) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) ++ return -EFAULT; ++ ++ /* Check for wrap and zero */ ++ if (eeprom.offset + eeprom.len <= eeprom.offset) ++ return -EINVAL; ++ ++ /* Check for exceeding total eeprom len */ ++ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) ++ return -EINVAL; ++ ++ data = kmalloc(eeprom.len, GFP_USER); ++ 
if (!data) ++ return -ENOMEM; ++ ++ ret = -EFAULT; ++ if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) ++ goto out; ++ ++ ret = ops->set_eeprom(dev, &eeprom, data); ++ if (ret) ++ goto out; ++ ++ if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) ++ ret = -EFAULT; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_get_coalesce(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE }; ++ ++ if (!ethtool_ops->get_coalesce) ++ return -EOPNOTSUPP; ++ ++ ethtool_ops->get_coalesce(dev, &coalesce); ++ ++ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_coalesce(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_coalesce coalesce; ++ ++ if (!ethtool_ops->get_coalesce) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_coalesce(dev, &coalesce); ++} ++ ++static int ethtool_get_ringparam(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM }; ++ ++ if (!ethtool_ops->get_ringparam) ++ return -EOPNOTSUPP; ++ ++ ethtool_ops->get_ringparam(dev, &ringparam); ++ ++ if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_ringparam(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_ringparam ringparam; ++ ++ if (!ethtool_ops->get_ringparam) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_ringparam(dev, &ringparam); ++} ++ ++static int ethtool_get_pauseparam(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; ++ ++ if (!ethtool_ops->get_pauseparam) ++ return -EOPNOTSUPP; ++ ++ ethtool_ops->get_pauseparam(dev, &pauseparam); ++ ++ if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_pauseparam(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_pauseparam pauseparam; ++ ++ if (!ethtool_ops->get_pauseparam) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_pauseparam(dev, &pauseparam); ++} ++ ++static int ethtool_get_rx_csum(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GRXCSUM }; ++ ++ if (!ethtool_ops->get_rx_csum) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_rx_csum(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_rx_csum(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_rx_csum) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ ethtool_ops->set_rx_csum(dev, edata.data); ++ return 0; ++} ++ ++static int ethtool_get_tx_csum(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GTXCSUM }; ++ ++ if (!ethtool_ops->get_tx_csum) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_tx_csum(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_tx_csum(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_tx_csum) ++ 
return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_tx_csum(dev, edata.data); ++} ++ ++static int ethtool_get_sg(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GSG }; ++ ++ if (!ethtool_ops->get_sg) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_sg(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_sg(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_sg) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_sg(dev, edata.data); ++} ++ ++static int ethtool_get_tso(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GTSO }; ++ ++ if (!ethtool_ops->get_tso) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_tso(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_tso(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_tso) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_tso(dev, edata.data); ++} ++ ++static int ethtool_self_test(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_test test; ++ struct ethtool_ops *ops = ethtool_ops; ++ u64 *data; ++ int ret; ++ ++ if (!ops->self_test || !ops->self_test_count) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&test, useraddr, sizeof(test))) ++ return -EFAULT; ++ ++ test.len = ops->self_test_count(dev); ++ data = kmalloc(test.len * sizeof(u64), GFP_USER); ++ if (!data) ++ return -ENOMEM; ++ ++ ops->self_test(dev, &test, data); ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, &test, sizeof(test))) ++ goto out; ++ useraddr += sizeof(test); ++ if (copy_to_user(useraddr, data, test.len * sizeof(u64))) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_get_strings(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_gstrings gstrings; ++ struct ethtool_ops *ops = ethtool_ops; ++ u8 *data; ++ int ret; ++ ++ if (!ops->get_strings) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) ++ return -EFAULT; ++ ++ switch (gstrings.string_set) { ++ case ETH_SS_TEST: ++ if (!ops->self_test_count) ++ return -EOPNOTSUPP; ++ gstrings.len = ops->self_test_count(dev); ++ break; ++ case ETH_SS_STATS: ++ if (!ops->get_stats_count) ++ return -EOPNOTSUPP; ++ gstrings.len = ops->get_stats_count(dev); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); ++ if (!data) ++ return -ENOMEM; ++ ++ ops->get_strings(dev, gstrings.string_set, data); ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) ++ goto out; ++ useraddr += sizeof(gstrings); ++ if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_phys_id(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_value id; ++ ++ if (!ethtool_ops->phys_id) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&id, useraddr, sizeof(id))) ++ return -EFAULT; ++ ++ return ethtool_ops->phys_id(dev, id.data); ++} ++ ++static int ethtool_get_stats(struct net_device *dev, void 
*useraddr) ++{ ++ struct ethtool_stats stats; ++ struct ethtool_ops *ops = ethtool_ops; ++ u64 *data; ++ int ret; ++ ++ if (!ops->get_ethtool_stats || !ops->get_stats_count) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&stats, useraddr, sizeof(stats))) ++ return -EFAULT; ++ ++ stats.n_stats = ops->get_stats_count(dev); ++ data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER); ++ if (!data) ++ return -ENOMEM; ++ ++ ops->get_ethtool_stats(dev, &stats, data); ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, &stats, sizeof(stats))) ++ goto out; ++ useraddr += sizeof(stats); ++ if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_ioctl(struct ifreq *ifr) ++{ ++ struct net_device *dev = __dev_get_by_name(ifr->ifr_name); ++ void *useraddr = (void *) ifr->ifr_data; ++ u32 ethcmd; ++ ++ /* ++ * XXX: This can be pushed down into the ethtool_* handlers that ++ * need it. Keep existing behaviour for the moment. ++ */ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if (!dev || !netif_device_present(dev)) ++ return -ENODEV; ++ ++ if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd))) ++ return -EFAULT; ++ ++ switch (ethcmd) { ++ case ETHTOOL_GSET: ++ return ethtool_get_settings(dev, useraddr); ++ case ETHTOOL_SSET: ++ return ethtool_set_settings(dev, useraddr); ++ case ETHTOOL_GDRVINFO: ++ return ethtool_get_drvinfo(dev, useraddr); ++ case ETHTOOL_GREGS: ++ return ethtool_get_regs(dev, useraddr); ++ case ETHTOOL_GWOL: ++ return ethtool_get_wol(dev, useraddr); ++ case ETHTOOL_SWOL: ++ return ethtool_set_wol(dev, useraddr); ++ case ETHTOOL_GMSGLVL: ++ return ethtool_get_msglevel(dev, useraddr); ++ case ETHTOOL_SMSGLVL: ++ return ethtool_set_msglevel(dev, useraddr); ++ case ETHTOOL_NWAY_RST: ++ return ethtool_nway_reset(dev); ++ case ETHTOOL_GLINK: ++ return ethtool_get_link(dev, useraddr); ++ case ETHTOOL_GEEPROM: ++ return ethtool_get_eeprom(dev, useraddr); ++ case ETHTOOL_SEEPROM: ++ return ethtool_set_eeprom(dev, useraddr); ++ case ETHTOOL_GCOALESCE: ++ return ethtool_get_coalesce(dev, useraddr); ++ case ETHTOOL_SCOALESCE: ++ return ethtool_set_coalesce(dev, useraddr); ++ case ETHTOOL_GRINGPARAM: ++ return ethtool_get_ringparam(dev, useraddr); ++ case ETHTOOL_SRINGPARAM: ++ return ethtool_set_ringparam(dev, useraddr); ++ case ETHTOOL_GPAUSEPARAM: ++ return ethtool_get_pauseparam(dev, useraddr); ++ case ETHTOOL_SPAUSEPARAM: ++ return ethtool_set_pauseparam(dev, useraddr); ++ case ETHTOOL_GRXCSUM: ++ return ethtool_get_rx_csum(dev, useraddr); ++ case ETHTOOL_SRXCSUM: ++ return ethtool_set_rx_csum(dev, useraddr); ++ case ETHTOOL_GTXCSUM: ++ return ethtool_get_tx_csum(dev, useraddr); ++ case ETHTOOL_STXCSUM: ++ return ethtool_set_tx_csum(dev, useraddr); ++ case ETHTOOL_GSG: ++ return ethtool_get_sg(dev, useraddr); ++ case ETHTOOL_SSG: ++ return ethtool_set_sg(dev, useraddr); ++ case ETHTOOL_GTSO: ++ return ethtool_get_tso(dev, useraddr); ++ case ETHTOOL_STSO: ++ return ethtool_set_tso(dev, useraddr); ++ case ETHTOOL_TEST: ++ return ethtool_self_test(dev, useraddr); ++ case ETHTOOL_GSTRINGS: ++ return ethtool_get_strings(dev, useraddr); ++ case ETHTOOL_PHYS_ID: ++ return ethtool_phys_id(dev, useraddr); ++ case ETHTOOL_GSTATS: ++ return ethtool_get_stats(dev, useraddr); ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return -EOPNOTSUPP; ++} ++#endif //ETHTOOL_OPS_COMPAT ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,15,0) ++static int rtl8125_siocdevprivate(struct net_device *dev, struct ifreq
*ifr, ++ void __user *data, int cmd) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret = 0; ++ ++ switch (cmd) { ++#ifdef ENABLE_DASH_SUPPORT ++ case SIOCDEVPRIVATE_RTLDASH: ++ if (!netif_running(dev)) { ++ ret = -ENODEV; ++ break; ++ } ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_dash_ioctl(dev, ifr); ++ break; ++#endif ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ case SIOCDEVPRIVATE_RTLREALWOW: ++ if (!netif_running(dev)) { ++ ret = -ENODEV; ++ break; ++ } ++ ++ ret = rtl8125_realwow_ioctl(dev, ifr); ++ break; ++#endif ++ ++ case SIOCRTLTOOL: ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_tool_ioctl(tp, ifr); ++ break; ++ ++ default: ++ ret = -EOPNOTSUPP; ++ } ++ ++ return ret; ++} ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,15,0) ++ ++static int ++rtl8125_do_ioctl(struct net_device *dev, ++ struct ifreq *ifr, ++ int cmd) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct mii_ioctl_data *data = if_mii(ifr); ++ int ret = 0; ++ ++ switch (cmd) { ++ case SIOCGMIIPHY: ++ data->phy_id = 32; /* Internal PHY */ ++ break; ++ ++ case SIOCGMIIREG: ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ data->val_out = rtl8125_mdio_read(tp, data->reg_num); ++ break; ++ ++ case SIOCSMIIREG: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, data->reg_num, data->val_in); ++ break; ++ ++#ifdef ETHTOOL_OPS_COMPAT ++ case SIOCETHTOOL: ++ ret = ethtool_ioctl(ifr); ++ break; ++#endif ++ ++#ifdef ENABLE_PTP_SUPPORT ++ case SIOCSHWTSTAMP: ++ case SIOCGHWTSTAMP: ++ if (tp->EnablePtp) ++ ret = rtl8125_ptp_ioctl(dev, ifr, cmd); ++ else ++ ret = -EOPNOTSUPP; ++ break; ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) ++#ifdef ENABLE_DASH_SUPPORT ++ case SIOCDEVPRIVATE_RTLDASH: ++ if (!netif_running(dev)) { ++ ret = -ENODEV; ++ break; ++ } ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_dash_ioctl(dev, ifr); ++ break; ++#endif ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ case SIOCDEVPRIVATE_RTLREALWOW: ++ if (!netif_running(dev)) { ++ ret = -ENODEV; ++ break; ++ } ++ ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_realwow_ioctl(dev, ifr); ++ break; ++#endif ++ ++ case SIOCRTLTOOL: ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_tool_ioctl(tp, ifr); ++ break; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) ++ ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ return ret; ++} ++ ++static void ++rtl8125_phy_power_up(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++ if (rtl8125_is_in_phy_disable_mode(dev)) ++ return; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, MII_BMCR, BMCR_ANENABLE); ++ ++ //wait ups resume (phy state 3) ++ rtl8125_wait_phy_ups_resume(dev, 3); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++} ++ ++static void ++rtl8125_phy_power_down(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ return; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, MII_BMCR, BMCR_ANENABLE | BMCR_PDOWN); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++} ++ ++static int __devinit 
++rtl8125_init_board(struct pci_dev *pdev, ++ struct net_device **dev_out, ++ void __iomem **ioaddr_out) ++{ ++ void __iomem *ioaddr; ++ struct net_device *dev; ++ struct rtl8125_private *tp; ++ int rc = -ENOMEM, i, pm_cap; ++ ++ assert(ioaddr_out != NULL); ++ ++ /* dev zeroed in alloc_etherdev */ ++ dev = alloc_etherdev_mq(sizeof (*tp), R8125_MAX_QUEUES); ++ if (dev == NULL) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_drv(&debug)) ++ dev_err(&pdev->dev, "unable to alloc new ethernet\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ goto err_out; ++ } ++ ++ SET_MODULE_OWNER(dev); ++ SET_NETDEV_DEV(dev, &pdev->dev); ++ tp = netdev_priv(dev); ++ tp->dev = dev; ++ tp->pci_dev = pdev; ++ tp->msg_enable = netif_msg_init(debug.msg_enable, R8125_MSG_DEFAULT); ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) ++ if (!aspm) ++ pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1 | ++ PCIE_LINK_STATE_CLKPM); ++#endif ++ ++ /* enable device (incl. PCI PM wakeup and hotplug setup) */ ++ rc = pci_enable_device(pdev); ++ if (rc < 0) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "enable failure\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ goto err_out_free_dev; ++ } ++ ++ if (pci_set_mwi(pdev) < 0) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_drv(&debug)) ++ dev_info(&pdev->dev, "Mem-Wr-Inval unavailable.\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ } ++ ++ /* save power state before pci_enable_device overwrites it */ ++ pm_cap = pci_find_capability(pdev, PCI_CAP_ID_PM); ++ if (pm_cap) { ++ u16 pwr_command; ++ ++ pci_read_config_word(pdev, pm_cap + PCI_PM_CTRL, &pwr_command); ++ } else { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) { ++ dev_err(&pdev->dev, "PowerManagement capability not found.\n"); ++ } ++#else ++ printk("PowerManagement capability not found.\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ ++ } ++ ++ /* make sure PCI base addr 1 is MMIO */ ++ if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "region #1 not an MMIO resource, aborting\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ rc = -ENODEV; ++ goto err_out_mwi; ++ } ++ /* check for weird/broken PCI region reporting */ ++ if (pci_resource_len(pdev, 2) < R8125_REGS_SIZE) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "Invalid PCI region size(s), aborting\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ rc = -ENODEV; ++ goto err_out_mwi; ++ } ++ ++ rc = pci_request_regions(pdev, MODULENAME); ++ if (rc < 0) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "could not request regions.\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ goto err_out_mwi; ++ } ++ ++ if ((sizeof(dma_addr_t) > 4) && ++ use_dac && ++ !dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) && ++ !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) { ++ dev->features |= NETIF_F_HIGHDMA; ++ } else { ++ rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); ++ if (rc < 0) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "DMA configuration failed.\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ goto err_out_free_res; ++ } ++ } ++ ++ /* ioremap MMIO region */ 
++ ioaddr = ioremap(pci_resource_start(pdev, 2), pci_resource_len(pdev, 2)); ++ if (ioaddr == NULL) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "cannot remap MMIO, aborting\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ rc = -EIO; ++ goto err_out_free_res; ++ } ++ ++ tp->mmio_addr = ioaddr; ++ ++ /* Identify chip attached to board */ ++ rtl8125_get_mac_version(tp); ++ ++ rtl8125_print_mac_version(tp); ++ ++ for (i = ARRAY_SIZE(rtl_chip_info) - 1; i >= 0; i--) { ++ if (tp->mcfg == rtl_chip_info[i].mcfg) ++ break; ++ } ++ ++ if (i < 0) { ++ /* Unknown chip: assume array element #0, original RTL-8125 */ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_printk(KERN_DEBUG, &pdev->dev, "unknown chip version, assuming %s\n", rtl_chip_info[0].name); ++#else ++ printk("Realtek unknown chip version, assuming %s\n", rtl_chip_info[0].name); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++ i++; ++ } ++ ++ tp->chipset = i; ++ ++ *ioaddr_out = ioaddr; ++ *dev_out = dev; ++out: ++ return rc; ++ ++err_out_free_res: ++ pci_release_regions(pdev); ++err_out_mwi: ++ pci_clear_mwi(pdev); ++ pci_disable_device(pdev); ++err_out_free_dev: ++ free_netdev(dev); ++err_out: ++ *ioaddr_out = NULL; ++ *dev_out = NULL; ++ goto out; ++} ++ ++static bool ++rtl8125_test_phy_ocp_v4(struct rtl8125_private *tp) ++{ ++ bool restore = FALSE; ++ bool uc2_response; ++ u8 phy_fatal_err; ++ u16 val; ++ ++ if (FALSE == HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ goto exit; ++ ++ uc2_response = !!(rtl8125_mdio_direct_read_phy_ocp(tp, 0xB87A) & BIT_0); ++ phy_fatal_err = rtl8125_mdio_direct_read_phy_ocp(tp, 0xB98E); ++ ++ if (!uc2_response && (phy_fatal_err == 0)) ++ goto exit; ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC418, BIT_0); ++ mdelay(24); ++ ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC404); ++ if ((val & 0x03) != 0x00) { ++ u32 wait_cnt = 0; ++ ++ while ((val & 0x03) != 0x00 && wait_cnt < 5) { ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC02, 0x000C); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC04, 0x03FC); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC4C, 0x1F00); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC06, 0x7F00); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC06, 0x7F00); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC4C, 0x1F00); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC04, 0x03FC); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC02, 0x000C); ++ ++ mdelay(100); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC404); ++ wait_cnt++; ++ } ++ } ++ ++ rtl8125_restore_phy_fuse_dout(tp); ++ ++ rtl8125_wait_phy_state_ready(tp, HW_PHY_STATUS_INI, 5000000); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA468, BIT_0); ++ ++ rtl8125_wait_phy_state_ready(tp, HW_PHY_STATUS_LAN_ON, 500000); ++ ++ if (phy_fatal_err) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, phy_fatal_err); ++ } ++ if (uc2_response) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801B); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ } ++ ++ rtl8125_restore_led_select(tp); ++ ++ tp->HwHasWrRamCodeToMicroP = FALSE; ++ ++ restore = TRUE; ++ ++exit: ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB87A, BIT_0); ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ ++ return restore; ++} ++ ++static bool ++rtl8125_test_phy_ocp_v5(struct rtl8125_private *tp) ++{ ++ bool restore = FALSE; ++ u8 phy_fatal_err; ++ u16 val; ++ ++ if (FALSE == 
HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ goto exit; ++ ++ phy_fatal_err = rtl8125_mdio_direct_read_phy_ocp(tp, 0xB98C); ++ ++ if (phy_fatal_err == 0) ++ goto exit; ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC418, BIT_0); ++ mdelay(24); ++ ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC404); ++ if (val & 0x0F) { ++ u32 wait_cnt = 0; ++ ++ while (val & 0x0F && wait_cnt < 5) { ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC02, 0x000C); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC04, 0x03FC); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC4C, 0x1F00); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC06, 0x4F00); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC06, ++ 0x7F00, ++ 0x4F00); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC06, 0x7F00); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC4C, 0x1F00); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC04, 0x03FC); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC02, 0x000C); ++ ++ mdelay(100); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC404); ++ wait_cnt++; ++ } ++ } ++ ++ rtl8125_restore_phy_fuse_dout(tp); ++ ++ rtl8125_wait_phy_state_ready(tp, HW_PHY_STATUS_INI, 5000000); ++ ++ if (tp->mcfg == CFG_METHOD_10) ++ rtl8125_set_phy_mcu_8125d_1_efuse(tp->dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA468, BIT_0); ++ ++ rtl8125_clear_phy_ups_reg(tp->dev); ++ ++ rtl8125_wait_phy_state_ready(tp, HW_PHY_STATUS_LAN_ON, 500000); ++ ++ if (phy_fatal_err) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, phy_fatal_err); ++ } ++ ++ rtl8125_restore_led_select(tp); ++ ++ tp->HwHasWrRamCodeToMicroP = FALSE; ++ ++ restore = TRUE; ++ ++exit: ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ ++ return restore; ++} ++ ++static bool ++rtl8125_test_phy_ocp(struct rtl8125_private *tp) ++{ ++ unsigned long flags; ++ bool reset = false; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (tp->TestPhyOcpReg == FALSE) ++ goto unlock; ++ ++ switch (tp->HwSuppEsdVer) { ++ case 4: ++ reset = rtl8125_test_phy_ocp_v4(tp); ++ break; ++ case 5: ++ reset = rtl8125_test_phy_ocp_v5(tp); ++ break; ++ default: ++ goto unlock; ++ } ++ ++unlock: ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return reset; ++} ++ ++static void ++rtl8125_esd_checker(struct rtl8125_private *tp) ++{ ++ struct net_device *dev = tp->dev; ++ struct pci_dev *pdev = tp->pci_dev; ++ u8 cmd; ++ u16 io_base_l; ++ u16 mem_base_l; ++ u16 mem_base_h; ++ u8 ilr; ++ u16 resv_0x1c_h; ++ u16 resv_0x1c_l; ++ u16 resv_0x20_l; ++ u16 resv_0x20_h; ++ u16 resv_0x24_l; ++ u16 resv_0x24_h; ++ u16 resv_0x2c_h; ++ u16 resv_0x2c_l; ++ u32 pci_sn_l; ++ u32 pci_sn_h; ++ ++ if (unlikely(tp->rtk_enable_diag)) ++ goto exit; ++ ++ tp->esd_flag = 0; ++ ++ pci_read_config_byte(pdev, PCI_COMMAND, &cmd); ++ if (cmd != tp->pci_cfg_space.cmd) { ++ printk(KERN_ERR "%s: cmd = 0x%02x, should be 0x%02x \n.", dev->name, cmd, tp->pci_cfg_space.cmd); ++ pci_write_config_byte(pdev, PCI_COMMAND, tp->pci_cfg_space.cmd); ++ tp->esd_flag |= BIT_0; ++ ++ pci_read_config_byte(pdev, PCI_COMMAND, &cmd); ++ if (cmd == 0xff) { ++ printk(KERN_ERR "%s: pci link is down \n.", dev->name); ++ goto exit; ++ } ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_0, &io_base_l); ++ if (io_base_l != tp->pci_cfg_space.io_base_l) { ++ printk(KERN_ERR "%s: io_base_l = 0x%04x, should be 0x%04x \n.", dev->name, io_base_l, tp->pci_cfg_space.io_base_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_0, tp->pci_cfg_space.io_base_l); ++ tp->esd_flag |= BIT_1; 
++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_2, &mem_base_l); ++ if (mem_base_l != tp->pci_cfg_space.mem_base_l) { ++ printk(KERN_ERR "%s: mem_base_l = 0x%04x, should be 0x%04x \n.", dev->name, mem_base_l, tp->pci_cfg_space.mem_base_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_2, tp->pci_cfg_space.mem_base_l); ++ tp->esd_flag |= BIT_2; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_2 + 2, &mem_base_h); ++ if (mem_base_h!= tp->pci_cfg_space.mem_base_h) { ++ printk(KERN_ERR "%s: mem_base_h = 0x%04x, should be 0x%04x \n.", dev->name, mem_base_h, tp->pci_cfg_space.mem_base_h); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_2 + 2, tp->pci_cfg_space.mem_base_h); ++ tp->esd_flag |= BIT_3; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_3, &resv_0x1c_l); ++ if (resv_0x1c_l != tp->pci_cfg_space.resv_0x1c_l) { ++ printk(KERN_ERR "%s: resv_0x1c_l = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x1c_l, tp->pci_cfg_space.resv_0x1c_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_3, tp->pci_cfg_space.resv_0x1c_l); ++ tp->esd_flag |= BIT_4; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_3 + 2, &resv_0x1c_h); ++ if (resv_0x1c_h != tp->pci_cfg_space.resv_0x1c_h) { ++ printk(KERN_ERR "%s: resv_0x1c_h = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x1c_h, tp->pci_cfg_space.resv_0x1c_h); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_3 + 2, tp->pci_cfg_space.resv_0x1c_h); ++ tp->esd_flag |= BIT_5; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_4, &resv_0x20_l); ++ if (resv_0x20_l != tp->pci_cfg_space.resv_0x20_l) { ++ printk(KERN_ERR "%s: resv_0x20_l = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x20_l, tp->pci_cfg_space.resv_0x20_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_4, tp->pci_cfg_space.resv_0x20_l); ++ tp->esd_flag |= BIT_6; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_4 + 2, &resv_0x20_h); ++ if (resv_0x20_h != tp->pci_cfg_space.resv_0x20_h) { ++ printk(KERN_ERR "%s: resv_0x20_h = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x20_h, tp->pci_cfg_space.resv_0x20_h); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_4 + 2, tp->pci_cfg_space.resv_0x20_h); ++ tp->esd_flag |= BIT_7; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_5, &resv_0x24_l); ++ if (resv_0x24_l != tp->pci_cfg_space.resv_0x24_l) { ++ printk(KERN_ERR "%s: resv_0x24_l = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x24_l, tp->pci_cfg_space.resv_0x24_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_5, tp->pci_cfg_space.resv_0x24_l); ++ tp->esd_flag |= BIT_8; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_5 + 2, &resv_0x24_h); ++ if (resv_0x24_h != tp->pci_cfg_space.resv_0x24_h) { ++ printk(KERN_ERR "%s: resv_0x24_h = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x24_h, tp->pci_cfg_space.resv_0x24_h); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_5 + 2, tp->pci_cfg_space.resv_0x24_h); ++ tp->esd_flag |= BIT_9; ++ } ++ ++ pci_read_config_byte(pdev, PCI_INTERRUPT_LINE, &ilr); ++ if (ilr != tp->pci_cfg_space.ilr) { ++ printk(KERN_ERR "%s: ilr = 0x%02x, should be 0x%02x \n.", dev->name, ilr, tp->pci_cfg_space.ilr); ++ pci_write_config_byte(pdev, PCI_INTERRUPT_LINE, tp->pci_cfg_space.ilr); ++ tp->esd_flag |= BIT_10; ++ } ++ ++ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &resv_0x2c_l); ++ if (resv_0x2c_l != tp->pci_cfg_space.resv_0x2c_l) { ++ printk(KERN_ERR "%s: resv_0x2c_l = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x2c_l, tp->pci_cfg_space.resv_0x2c_l); ++ pci_write_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, 
tp->pci_cfg_space.resv_0x2c_l); ++ tp->esd_flag |= BIT_11; ++ } ++ ++ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID + 2, &resv_0x2c_h); ++ if (resv_0x2c_h != tp->pci_cfg_space.resv_0x2c_h) { ++ printk(KERN_ERR "%s: resv_0x2c_h = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x2c_h, tp->pci_cfg_space.resv_0x2c_h); ++ pci_write_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID + 2, tp->pci_cfg_space.resv_0x2c_h); ++ tp->esd_flag |= BIT_12; ++ } ++ ++ if (tp->HwPcieSNOffset > 0) { ++ pci_sn_l = rtl8125_csi_read(tp, tp->HwPcieSNOffset); ++ if (pci_sn_l != tp->pci_cfg_space.pci_sn_l) { ++ printk(KERN_ERR "%s: pci_sn_l = 0x%08x, should be 0x%08x \n.", dev->name, pci_sn_l, tp->pci_cfg_space.pci_sn_l); ++ rtl8125_csi_write(tp, tp->HwPcieSNOffset, tp->pci_cfg_space.pci_sn_l); ++ tp->esd_flag |= BIT_13; ++ } ++ ++ pci_sn_h = rtl8125_csi_read(tp, tp->HwPcieSNOffset + 4); ++ if (pci_sn_h != tp->pci_cfg_space.pci_sn_h) { ++ printk(KERN_ERR "%s: pci_sn_h = 0x%08x, should be 0x%08x \n.", dev->name, pci_sn_h, tp->pci_cfg_space.pci_sn_h); ++ rtl8125_csi_write(tp, tp->HwPcieSNOffset + 4, tp->pci_cfg_space.pci_sn_h); ++ tp->esd_flag |= BIT_14; ++ } ++ } ++ ++ if (tp->TestPhyOcpReg && rtl8125_test_phy_ocp(tp)) ++ tp->esd_flag |= BIT_15; ++ ++ if (tp->esd_flag != 0) { ++ printk(KERN_ERR "%s: esd_flag = 0x%04x\n.\n", dev->name, tp->esd_flag); ++ netif_carrier_off(dev); ++ netif_tx_disable(dev); ++ rtl8125_hw_reset(dev); ++ rtl8125_tx_clear(tp); ++ rtl8125_rx_clear(tp); ++ rtl8125_init_ring(dev); ++ rtl8125_up(dev); ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++ rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ tp->esd_flag = 0; ++ } ++exit: ++ return; ++} ++/* ++static void ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++rtl8125_esd_timer(unsigned long __opaque) ++#else ++rtl8125_esd_timer(struct timer_list *t) ++#endif ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++ struct net_device *dev = (struct net_device *)__opaque; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct timer_list *timer = &tp->esd_timer; ++#else ++ struct rtl8125_private *tp = from_timer(tp, t, esd_timer); ++ //struct net_device *dev = tp->dev; ++ struct timer_list *timer = t; ++#endif ++ rtl8125_esd_checker(tp); ++ ++ mod_timer(timer, jiffies + timeout); ++} ++*/ ++ ++/* ++static void ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++rtl8125_link_timer(unsigned long __opaque) ++#else ++rtl8125_link_timer(struct timer_list *t) ++#endif ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++ struct net_device *dev = (struct net_device *)__opaque; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct timer_list *timer = &tp->link_timer; ++#else ++ struct rtl8125_private *tp = from_timer(tp, t, link_timer); ++ struct net_device *dev = tp->dev; ++ struct timer_list *timer = t; ++#endif ++ rtl8125_check_link_status(dev); ++ ++ mod_timer(timer, jiffies + RTL8125_LINK_TIMEOUT); ++} ++*/ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) ++static int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, ++ int minvec, int maxvec) ++{ ++ int nvec = maxvec; ++ int rc; ++ ++ if (maxvec < minvec) ++ return -ERANGE; ++ ++ do { ++ rc = pci_enable_msix(dev, entries, nvec); ++ if (rc < 0) { ++ return rc; ++ } else if (rc > 0) { ++ if (rc < minvec) ++ return -ENOSPC; ++ nvec = rc; ++ } ++ } while (rc); ++ ++ return nvec; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) */ ++ ++static int rtl8125_enable_msix(struct rtl8125_private *tp) ++{ ++ int i, nvecs = 0; ++ struct 
msix_entry msix_ent[R8125_MAX_MSIX_VEC]; ++ //struct net_device *dev = tp->dev; ++ //const int len = sizeof(tp->irq_tbl[0].name); ++ ++ for (i = 0; i < R8125_MAX_MSIX_VEC; i++) { ++ msix_ent[i].entry = i; ++ msix_ent[i].vector = 0; ++ } ++ ++ nvecs = pci_enable_msix_range(tp->pci_dev, msix_ent, ++ tp->min_irq_nvecs, tp->max_irq_nvecs); ++ if (nvecs < 0) ++ goto out; ++ ++ for (i = 0; i < nvecs; i++) { ++ struct r8125_irq *irq = &tp->irq_tbl[i]; ++ irq->vector = msix_ent[i].vector; ++ //snprintf(irq->name, len, "%s-%d", dev->name, i); ++ //irq->handler = rtl8125_interrupt_msix; ++ } ++ ++out: ++ return nvecs; ++} ++ ++/* Cfg9346_Unlock assumed. */ ++static int rtl8125_try_msi(struct rtl8125_private *tp) ++{ ++ struct pci_dev *pdev = tp->pci_dev; ++ unsigned int hw_supp_irq_nvecs; ++ unsigned msi = 0; ++ int nvecs = 1; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ hw_supp_irq_nvecs = R8125_MAX_MSIX_VEC_8125A; ++ break; ++ case CFG_METHOD_4 ... CFG_METHOD_7: ++ hw_supp_irq_nvecs = R8125_MAX_MSIX_VEC_8125B; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ hw_supp_irq_nvecs = R8125_MAX_MSIX_VEC_8125D; ++ break; ++ default: ++ hw_supp_irq_nvecs = 1; ++ break; ++ } ++ tp->hw_supp_irq_nvecs = clamp_val(hw_supp_irq_nvecs, 1, ++ R8125_MAX_MSIX_VEC); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->max_irq_nvecs = tp->hw_supp_irq_nvecs; ++ tp->min_irq_nvecs = R8125_MIN_MSIX_VEC_8125B; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ tp->max_irq_nvecs = tp->hw_supp_irq_nvecs; ++ tp->min_irq_nvecs = R8125_MIN_MSIX_VEC_8125BP; ++ break; ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->max_irq_nvecs = tp->hw_supp_irq_nvecs; ++ tp->min_irq_nvecs = R8125_MIN_MSIX_VEC_8125D; ++ break; ++ case CFG_METHOD_12: ++ tp->max_irq_nvecs = tp->hw_supp_irq_nvecs; ++ tp->min_irq_nvecs = R8125_MIN_MSIX_VEC_8125CP; ++ break; ++ default: ++ tp->max_irq_nvecs = 1; ++ tp->min_irq_nvecs = 1; ++ break; ++ } ++#ifdef DISABLE_MULTI_MSIX_VECTOR ++ tp->max_irq_nvecs = 1; ++#endif ++ ++#if defined(RTL_USE_NEW_INTR_API) ++ if ((nvecs = pci_alloc_irq_vectors(pdev, tp->min_irq_nvecs, tp->max_irq_nvecs, PCI_IRQ_MSIX)) > 0) ++ msi |= RTL_FEATURE_MSIX; ++ else if ((nvecs = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES)) > 0 && ++ pci_dev_msi_enabled(pdev)) ++ msi |= RTL_FEATURE_MSI; ++#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ if ((nvecs = rtl8125_enable_msix(tp)) > 0) ++ msi |= RTL_FEATURE_MSIX; ++ else if (!pci_enable_msi(pdev)) ++ msi |= RTL_FEATURE_MSI; ++#endif ++ if (!(msi & (RTL_FEATURE_MSI | RTL_FEATURE_MSIX))) ++ dev_info(&pdev->dev, "no MSI/MSI-X. 
Back to INTx.\n"); ++ ++ if (!(msi & RTL_FEATURE_MSIX) || nvecs < 1) ++ nvecs = 1; ++ ++ tp->irq_nvecs = nvecs; ++ ++ tp->features |= msi; ++ ++ return nvecs; ++} ++ ++static void rtl8125_disable_msi(struct pci_dev *pdev, struct rtl8125_private *tp) ++{ ++#if defined(RTL_USE_NEW_INTR_API) ++ if (tp->features & (RTL_FEATURE_MSI | RTL_FEATURE_MSIX)) ++ pci_free_irq_vectors(pdev); ++#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ if (tp->features & (RTL_FEATURE_MSIX)) ++ pci_disable_msix(pdev); ++ else if (tp->features & (RTL_FEATURE_MSI)) ++ pci_disable_msi(pdev); ++#endif ++ tp->features &= ~(RTL_FEATURE_MSI | RTL_FEATURE_MSIX); ++} ++ ++static int rtl8125_get_irq(struct pci_dev *pdev) ++{ ++#if defined(RTL_USE_NEW_INTR_API) ++ return pci_irq_vector(pdev, 0); ++#else ++ return pdev->irq; ++#endif ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++static void ++rtl8125_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_counters *counters = tp->tally_vaddr; ++ dma_addr_t paddr = tp->tally_paddr; ++ ++ if (!counters) ++ return; ++ ++ netdev_stats_to_stats64(stats, &dev->stats); ++ dev_fetch_sw_netstats(stats, dev->tstats); ++ ++ /* ++ * Fetch additional counter values missing in stats collected by driver ++ * from tally counters. ++ */ ++ rtl8125_dump_tally_counter(tp, paddr); ++ ++ stats->tx_errors = le64_to_cpu(counters->tx_errors); ++ stats->collisions = le32_to_cpu(counters->tx_multi_collision); ++ stats->tx_aborted_errors = le16_to_cpu(counters->tx_aborted); ++ stats->rx_missed_errors = le16_to_cpu(counters->rx_missed); ++} ++#else ++/** ++ * rtl8125_get_stats - Get rtl8125 read/write statistics ++ * @dev: The Ethernet Device to get statistics for ++ * ++ * Get TX/RX statistics for rtl8125 ++ */ ++static struct ++net_device_stats *rtl8125_get_stats(struct net_device *dev) ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ struct rtl8125_private *tp = netdev_priv(dev); ++#endif ++ return &RTLDEV->stats; ++} ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++static const struct net_device_ops rtl8125_netdev_ops = { ++ .ndo_open = rtl8125_open, ++ .ndo_stop = rtl8125_close, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++ .ndo_get_stats64 = rtl8125_get_stats64, ++#else ++ .ndo_get_stats = rtl8125_get_stats, ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++ .ndo_start_xmit = rtl8125_start_xmit, ++ .ndo_tx_timeout = rtl8125_tx_timeout, ++ .ndo_change_mtu = rtl8125_change_mtu, ++ .ndo_set_mac_address = rtl8125_set_mac_address, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) ++ .ndo_do_ioctl = rtl8125_do_ioctl, ++#else ++ .ndo_siocdevprivate = rtl8125_siocdevprivate, ++ .ndo_eth_ioctl = rtl8125_do_ioctl, ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0) ++ .ndo_set_multicast_list = rtl8125_set_rx_mode, ++#else ++ .ndo_set_rx_mode = rtl8125_set_rx_mode, ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++#ifdef CONFIG_R8125_VLAN ++ .ndo_vlan_rx_register = rtl8125_vlan_rx_register, ++#endif ++#else ++ .ndo_fix_features = rtl8125_fix_features, ++ .ndo_set_features = rtl8125_set_features, ++#endif ++#ifdef CONFIG_NET_POLL_CONTROLLER ++ .ndo_poll_controller = rtl8125_netpoll, ++#endif ++}; ++#endif ++ ++ ++#ifdef CONFIG_R8125_NAPI ++ ++static int rtl8125_poll(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct 
r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ unsigned int work_done = 0; ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) ++ rtl8125_tx_interrupt(&tp->tx_ring[i], budget); ++ ++ for (i = 0; i < tp->num_rx_rings; i++) ++ work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[i], budget); ++ ++ work_done = min(work_done, work_to_do); ++ ++ RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget); ++ ++ if (work_done < work_to_do) { ++#ifdef ENABLE_DASH_SUPPORT ++ if (rtl8125_check_dash_interrupt(tp)) ++ rtl8125_schedule_dash_work(tp); ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) ++ return RTL_NAPI_RETURN_VALUE; ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_done); ++#endif ++ /* ++ * 20040426: the barrier is not strictly required but the ++ * behavior of the irq handler could be less predictable ++ * without it. Btw, the lack of flush for the posted pci ++ * write is safe - FR ++ */ ++ smp_wmb(); ++ ++ rtl8125_switch_to_timer_interrupt(tp); ++ } ++ ++ return RTL_NAPI_RETURN_VALUE; ++} ++ ++static int rtl8125_poll_msix_ring(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ unsigned int work_done = 0; ++ const int message_id = r8125napi->index; ++ ++ if (message_id < tp->num_tx_rings) ++ rtl8125_tx_interrupt_with_vector(tp, message_id, budget); ++ ++ if (message_id < tp->num_rx_rings) ++ work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget); ++ ++ RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget); ++ ++ if (work_done < work_to_do) { ++#ifdef ENABLE_DASH_SUPPORT ++ if (message_id == 31) ++ if (rtl8125_check_dash_interrupt(tp)) ++ rtl8125_schedule_dash_work(tp); ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) ++ return RTL_NAPI_RETURN_VALUE; ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_done); ++#endif ++ /* ++ * 20040426: the barrier is not strictly required but the ++ * behavior of the irq handler could be less predictable ++ * without it. Btw, the lack of flush for the posted pci ++ * write is safe - FR ++ */ ++ smp_wmb(); ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ } ++ ++ return RTL_NAPI_RETURN_VALUE; ++} ++ ++static int rtl8125_poll_msix_tx(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ unsigned int work_done = 0; ++ const int message_id = r8125napi->index; ++ ++ //suppress unused variable ++ (void)(dev); ++ ++ rtl8125_tx_interrupt_with_vector(tp, message_id, budget); ++ ++ RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget); ++ ++ if (work_done < work_to_do) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) ++ return RTL_NAPI_RETURN_VALUE; ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_done); ++#endif ++ /* ++ * 20040426: the barrier is not strictly required but the ++ * behavior of the irq handler could be less predictable ++ * without it. 
Btw, the lack of flush for the posted pci ++ * write is safe - FR ++ */ ++ smp_wmb(); ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ } ++ ++ return RTL_NAPI_RETURN_VALUE; ++} ++ ++static int rtl8125_poll_msix_other(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ const int message_id = r8125napi->index; ++ ++ //suppress unused variable ++ (void)(dev); ++ (void)(work_to_do); ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_to_do); ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_to_do); ++#endif ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ ++ return 1; ++} ++ ++static int rtl8125_poll_msix_rx(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ unsigned int work_done = 0; ++ const int message_id = r8125napi->index; ++ ++ if (message_id < tp->num_rx_rings) ++ work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget); ++ ++ RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget); ++ ++ if (work_done < work_to_do) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) ++ return RTL_NAPI_RETURN_VALUE; ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_done); ++#endif ++ /* ++ * 20040426: the barrier is not strictly required but the ++ * behavior of the irq handler could be less predictable ++ * without it. Btw, the lack of flush for the posted pci ++ * write is safe - FR ++ */ ++ smp_wmb(); ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ } ++ ++ return RTL_NAPI_RETURN_VALUE; ++} ++ ++void rtl8125_enable_napi(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ int i; ++ ++ for (i = 0; i < tp->irq_nvecs; i++) ++ RTL_NAPI_ENABLE(tp->dev, &tp->r8125napi[i].napi); ++#endif ++} ++ ++static void rtl8125_disable_napi(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ int i; ++ ++ for (i = 0; i < tp->irq_nvecs; i++) ++ RTL_NAPI_DISABLE(tp->dev, &tp->r8125napi[i].napi); ++#endif ++} ++ ++static void rtl8125_del_napi(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ int i; ++ ++ for (i = 0; i < tp->irq_nvecs; i++) ++ RTL_NAPI_DEL((&tp->r8125napi[i])); ++#endif ++} ++#endif //CONFIG_R8125_NAPI ++ ++static void rtl8125_init_napi(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i=0; i<tp->irq_nvecs; i++) { ++ struct r8125_napi *r8125napi = &tp->r8125napi[i]; ++#ifdef CONFIG_R8125_NAPI ++ int (*poll)(struct napi_struct *, int); ++ ++ poll = rtl8125_poll; ++ if (tp->features & RTL_FEATURE_MSIX) { ++ switch (tp->HwCurrIsrVer) { ++ case 7: ++ if (i < R8125_MAX_RX_QUEUES_VEC_V3) ++ poll = rtl8125_poll_msix_rx; ++ else if (i == 27 || i == 28) ++ poll = rtl8125_poll_msix_tx; ++ else ++ poll = rtl8125_poll_msix_other; ++ break; ++ case 5: ++ if (i < R8125_MAX_RX_QUEUES_VEC_V3) ++ poll = rtl8125_poll_msix_rx; ++ else if (i == 16 || i == 17) ++ poll = rtl8125_poll_msix_tx; ++ else ++ poll = rtl8125_poll_msix_other; ++ break; ++ case 2: ++ if (i < R8125_MAX_RX_QUEUES_VEC_V3) ++ poll = rtl8125_poll_msix_rx; ++ else if (i == 16 || i == 18) ++ poll = rtl8125_poll_msix_tx; ++ else ++ poll =
rtl8125_poll_msix_other; ++ break; ++ case 3: ++ case 4: ++ if (i < R8125_MAX_RX_QUEUES_VEC_V3) ++ poll = rtl8125_poll_msix_ring; ++ else ++ poll = rtl8125_poll_msix_other; ++ break; ++ } ++ } ++ ++ RTL_NAPI_CONFIG(tp->dev, r8125napi, poll, R8125_NAPI_WEIGHT); ++#endif ++ ++ r8125napi->priv = tp; ++ r8125napi->index = i; ++ } ++} ++ ++static int ++rtl8125_set_real_num_queue(struct rtl8125_private *tp) ++{ ++ int retval = 0; ++ ++ retval = netif_set_real_num_tx_queues(tp->dev, tp->num_tx_rings); ++ if (retval < 0) ++ goto exit; ++ ++ retval = netif_set_real_num_rx_queues(tp->dev, tp->num_rx_rings); ++ if (retval < 0) ++ goto exit; ++ ++exit: ++ return retval; ++} ++ ++static int __devinit ++rtl8125_init_one(struct pci_dev *pdev, ++ const struct pci_device_id *ent) ++{ ++ struct net_device *dev = NULL; ++ struct rtl8125_private *tp; ++ void __iomem *ioaddr = NULL; ++ static int board_idx = -1; ++ ++ int rc; ++ ++ assert(pdev != NULL); ++ assert(ent != NULL); ++ ++ board_idx++; ++ ++ if (netif_msg_drv(&debug)) ++ printk(KERN_INFO "%s Ethernet controller driver %s loaded\n", ++ MODULENAME, RTL8125_VERSION); ++ ++ rc = rtl8125_init_board(pdev, &dev, &ioaddr); ++ if (rc) ++ goto out; ++ ++ tp = netdev_priv(dev); ++ assert(ioaddr != NULL); ++ ++ spin_lock_init(&tp->phy_lock); ++ ++ tp->set_speed = rtl8125_set_speed_xmii; ++ tp->get_settings = rtl8125_gset_xmii; ++ tp->phy_reset_enable = rtl8125_xmii_reset_enable; ++ tp->phy_reset_pending = rtl8125_xmii_reset_pending; ++ tp->link_ok = rtl8125_xmii_link_ok; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++ dev->tstats = devm_netdev_alloc_pcpu_stats(&pdev->dev, ++ struct pcpu_sw_netstats); ++ if (!dev->tstats) ++ goto err_out_1; ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++ ++ rc = rtl8125_try_msi(tp); ++ if (rc < 0) { ++ dev_err(&pdev->dev, "Can't allocate interrupt\n"); ++ goto err_out_1; ++ } ++ ++ rtl8125_init_software_variable(dev); ++ ++ RTL_NET_DEVICE_OPS(rtl8125_netdev_ops); ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ SET_ETHTOOL_OPS(dev, &rtl8125_ethtool_ops); ++#endif ++ ++ dev->watchdog_timeo = RTL8125_TX_TIMEOUT; ++ dev->irq = rtl8125_get_irq(pdev); ++ dev->base_addr = (unsigned long) ioaddr; ++ ++ rtl8125_init_napi(tp); ++ ++#ifdef CONFIG_R8125_VLAN ++ if (tp->mcfg != CFG_METHOD_DEFAULT) { ++ dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ dev->vlan_rx_kill_vid = rtl8125_vlan_rx_kill_vid; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ } ++#endif ++ ++ /* There has been a number of reports that using SG/TSO results in ++ * tx timeouts. However for a lot of people SG/TSO works fine. ++ * Therefore disable both features by default, but allow users to ++ * enable them. Use at own risk! 
++ */ ++ tp->cp_cmd |= RTL_R16(tp, CPlusCmd); ++ if (tp->mcfg != CFG_METHOD_DEFAULT) { ++ dev->features |= NETIF_F_IP_CSUM; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ tp->cp_cmd |= RxChkSum; ++#else ++ dev->features |= NETIF_F_RXCSUM; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ /* nothing to do */ ++ break; ++ default: ++ dev->features |= NETIF_F_SG | NETIF_F_TSO; ++ break; ++ }; ++ dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | ++ NETIF_F_RXCSUM | NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; ++ dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | ++ NETIF_F_HIGHDMA; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) ++ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) ++ dev->hw_features |= NETIF_F_RXALL; ++ dev->hw_features |= NETIF_F_RXFCS; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) ++ dev->hw_features |= NETIF_F_IPV6_CSUM | NETIF_F_TSO6; ++ dev->features |= NETIF_F_IPV6_CSUM; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ /* nothing to do */ ++ break; ++ default: ++ dev->features |= NETIF_F_TSO6; ++ break; ++ }; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,19,0) ++ netif_set_tso_max_size(dev, LSO_64K); ++ netif_set_tso_max_segs(dev, NIC_MAX_PHYS_BUF_COUNT_LSO2); ++#else //LINUX_VERSION_CODE >= KERNEL_VERSION(5,19,0) ++ netif_set_gso_max_size(dev, LSO_64K); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,0) ++ dev->gso_max_segs = NIC_MAX_PHYS_BUF_COUNT_LSO2; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) ++ dev->gso_min_segs = NIC_MIN_PHYS_BUF_COUNT; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,0) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,19,0) ++ ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ++#ifdef ENABLE_RSS_SUPPORT ++ if (tp->EnableRss) { ++ dev->hw_features |= NETIF_F_RXHASH; ++ dev->features |= NETIF_F_RXHASH; ++ } ++#endif ++ } ++ ++ netdev_sw_irq_coalesce_default_on(dev); ++ ++#ifdef ENABLE_LIB_SUPPORT ++ BLOCKING_INIT_NOTIFIER_HEAD(&tp->lib_nh); ++#endif ++ rtl8125_init_all_schedule_work(tp); ++ ++ rc = rtl8125_set_real_num_queue(tp); ++ if (rc < 0) ++ goto err_out; ++ ++ rtl8125_exit_oob(dev); ++ ++ rtl8125_powerup_pll(dev); ++ ++ rtl8125_hw_init(dev); ++ ++ rtl8125_hw_reset(dev); ++ ++ /* Get production from EEPROM */ ++ rtl8125_eeprom_type(tp); ++ ++ if (tp->eeprom_type == EEPROM_TYPE_93C46 || tp->eeprom_type == EEPROM_TYPE_93C56) ++ rtl8125_set_eeprom_sel_low(tp); ++ ++ rtl8125_get_mac_address(dev); ++ ++ tp->fw_name = rtl_chip_fw_infos[tp->mcfg].fw_name; ++ ++ tp->tally_vaddr = dma_alloc_coherent(&pdev->dev, sizeof(*tp->tally_vaddr), ++ &tp->tally_paddr, GFP_KERNEL); ++ if (!tp->tally_vaddr) { ++ rc = -ENOMEM; ++ goto err_out; ++ } ++ ++ rtl8125_tally_counter_clear(tp); ++ ++ pci_set_drvdata(pdev, dev); ++ ++ rc = register_netdev(dev); ++ if (rc) ++ goto err_out; ++ ++ printk(KERN_INFO "%s: This product is covered by one or more of the following patents: US6,570,884, US6,115,776, and US6,327,625.\n", MODULENAME); ++ ++ rtl8125_disable_rxdvgate(dev); ++ ++ device_set_wakeup_enable(&pdev->dev, tp->wol_enabled); ++ ++ netif_carrier_off(dev); ++ ++#ifdef ENABLE_R8125_SYSFS ++ rtl8125_sysfs_init(dev); ++#endif /* ENABLE_R8125_SYSFS */ ++ ++ printk("%s", GPL_CLAIM); ++ ++out: ++ return rc; ++ ++err_out: ++ if (tp->tally_vaddr != NULL) { ++ 
dma_free_coherent(&pdev->dev, sizeof(*tp->tally_vaddr), tp->tally_vaddr, ++ tp->tally_paddr); ++ ++ tp->tally_vaddr = NULL; ++ } ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_del_napi(tp); ++#endif ++ rtl8125_disable_msi(pdev, tp); ++ ++err_out_1: ++ rtl8125_release_board(pdev, dev); ++ ++ goto out; ++} ++ ++static void __devexit ++rtl8125_remove_one(struct pci_dev *pdev) ++{ ++ struct net_device *dev = pci_get_drvdata(pdev); ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ assert(dev != NULL); ++ assert(tp != NULL); ++ ++ set_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ rtl8125_cancel_all_schedule_work(tp); ++ ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ rtl8125_driver_stop(tp); ++ ++ rtl8125_disable_pci_offset_180(tp); ++ ++#ifdef ENABLE_R8125_SYSFS ++ rtl8125_sysfs_remove(dev); ++#endif //ENABLE_R8125_SYSFS ++ ++ unregister_netdev(dev); ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_del_napi(tp); ++#endif ++ rtl8125_disable_msi(pdev, tp); ++#ifdef ENABLE_R8125_PROCFS ++ rtl8125_proc_remove(dev); ++#endif ++ if (tp->tally_vaddr != NULL) { ++ dma_free_coherent(&pdev->dev, sizeof(*tp->tally_vaddr), tp->tally_vaddr, tp->tally_paddr); ++ tp->tally_vaddr = NULL; ++ } ++ ++ rtl8125_release_board(pdev, dev); ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++ rtl8125_release_firmware(tp); ++#endif ++ ++ pci_set_drvdata(pdev, NULL); ++} ++ ++#ifdef ENABLE_PAGE_REUSE ++static inline unsigned int rtl8125_rx_page_order(unsigned rx_buf_sz, unsigned page_size) ++{ ++ unsigned truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + ++ SKB_DATA_ALIGN(rx_buf_sz + R8125_RX_ALIGN); ++ ++ return get_order(truesize * 2); ++} ++#endif //ENABLE_PAGE_REUSE ++ ++static void ++rtl8125_set_rxbufsize(struct rtl8125_private *tp, ++ struct net_device *dev) ++{ ++ unsigned int mtu = dev->mtu; ++ ++ tp->rms = (mtu > ETH_DATA_LEN) ? 
++ mtu + ETH_HLEN + RT_VALN_HLEN + ETH_FCS_LEN: ++ RX_BUF_SIZE; ++ tp->rx_buf_sz = tp->rms; ++#ifdef ENABLE_RX_PACKET_FRAGMENT ++ tp->rx_buf_sz = SKB_DATA_ALIGN(RX_BUF_SIZE); ++#endif //ENABLE_RX_PACKET_FRAGMENT ++#ifdef ENABLE_PAGE_REUSE ++ tp->rx_buf_page_order = rtl8125_rx_page_order(tp->rx_buf_sz, PAGE_SIZE); ++ tp->rx_buf_page_size = rtl8125_rx_page_size(tp->rx_buf_page_order); ++#endif //ENABLE_PAGE_REUSE ++} ++ ++static void ++rtl8125_set_rms(struct rtl8125_private *tp, u16 rms) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ rms |= AcceppVlanPhys; ++ break; ++ default: ++ rms &= ~AcceppVlanPhys; ++ break; ++ } ++ RTL_W16(tp, RxMaxSize, rms); ++} ++ ++static void rtl8125_free_irq(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i=0; i<tp->irq_nvecs; i++) { ++ struct r8125_irq *irq = &tp->irq_tbl[i]; ++ struct r8125_napi *r8125napi = &tp->r8125napi[i]; ++ ++ if (irq->requested) { ++ irq->requested = 0; ++#if defined(RTL_USE_NEW_INTR_API) ++ pci_free_irq(tp->pci_dev, i, r8125napi); ++#else ++ free_irq(irq->vector, r8125napi); ++#endif ++ } ++ } ++} ++ ++static int rtl8125_alloc_irq(struct rtl8125_private *tp) ++{ ++ struct net_device *dev = tp->dev; ++ int rc = 0; ++ struct r8125_irq *irq; ++ struct r8125_napi *r8125napi; ++ int i = 0; ++ const int len = sizeof(tp->irq_tbl[0].name); ++ ++#if defined(RTL_USE_NEW_INTR_API) ++ for (i=0; i<tp->irq_nvecs; i++) { ++ irq = &tp->irq_tbl[i]; ++ if (tp->features & RTL_FEATURE_MSIX && ++ tp->HwCurrIsrVer > 1) ++ irq->handler = rtl8125_interrupt_msix; ++ else ++ irq->handler = rtl8125_interrupt; ++ ++ r8125napi = &tp->r8125napi[i]; ++ snprintf(irq->name, len, "%s-%d", dev->name, i); ++ rc = pci_request_irq(tp->pci_dev, i, irq->handler, NULL, r8125napi, ++ irq->name); ++ if (rc) ++ break; ++ ++ irq->vector = pci_irq_vector(tp->pci_dev, i); ++ irq->requested = 1; ++ } ++#else ++ unsigned long irq_flags = 0; ++#ifdef ENABLE_LIB_SUPPORT ++ irq_flags |= IRQF_NO_SUSPEND; ++#endif ++ if (tp->features & RTL_FEATURE_MSIX && ++ tp->HwCurrIsrVer > 1) { ++ for (i=0; i<tp->irq_nvecs; i++) { ++ irq = &tp->irq_tbl[i]; ++ irq->handler = rtl8125_interrupt_msix; ++ r8125napi = &tp->r8125napi[i]; ++ snprintf(irq->name, len, "%s-%d", dev->name, i); ++ rc = request_irq(irq->vector, irq->handler, irq_flags, irq->name, r8125napi); ++ ++ if (rc) ++ break; ++ ++ irq->requested = 1; ++ } ++ } else { ++ irq = &tp->irq_tbl[0]; ++ irq->handler = rtl8125_interrupt; ++ r8125napi = &tp->r8125napi[0]; ++ snprintf(irq->name, len, "%s-0", dev->name); ++ if (!(tp->features & RTL_FEATURE_MSIX)) ++ irq->vector = dev->irq; ++ irq_flags |= (tp->features & (RTL_FEATURE_MSI | RTL_FEATURE_MSIX)) ?
0 : SA_SHIRQ; ++ rc = request_irq(irq->vector, irq->handler, irq_flags, irq->name, r8125napi); ++ ++ if (rc == 0) ++ irq->requested = 1; ++ } ++#endif ++ if (rc) ++ rtl8125_free_irq(tp); ++ ++ return rc; ++} ++ ++static int rtl8125_alloc_tx_desc(struct rtl8125_private *tp) ++{ ++ struct rtl8125_tx_ring *ring; ++ struct pci_dev *pdev = tp->pci_dev; ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ ring = &tp->tx_ring[i]; ++ ring->TxDescAllocSize = (ring->num_tx_desc + 1) * sizeof(struct TxDesc); ++ ring->TxDescArray = dma_alloc_coherent(&pdev->dev, ++ ring->TxDescAllocSize, ++ &ring->TxPhyAddr, ++ GFP_KERNEL); ++ ++ if (!ring->TxDescArray) ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int rtl8125_alloc_rx_desc(struct rtl8125_private *tp) ++{ ++ struct rtl8125_rx_ring *ring; ++ struct pci_dev *pdev = tp->pci_dev; ++ int i; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ ring = &tp->rx_ring[i]; ++ ring->RxDescAllocSize = (ring->num_rx_desc + 1) * tp->RxDescLength; ++ ring->RxDescArray = dma_alloc_coherent(&pdev->dev, ++ ring->RxDescAllocSize, ++ &ring->RxPhyAddr, ++ GFP_KERNEL); ++ ++ if (!ring->RxDescArray) ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void rtl8125_free_tx_desc(struct rtl8125_private *tp) ++{ ++ struct rtl8125_tx_ring *ring; ++ struct pci_dev *pdev = tp->pci_dev; ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ ring = &tp->tx_ring[i]; ++ if (ring->TxDescArray) { ++ dma_free_coherent(&pdev->dev, ++ ring->TxDescAllocSize, ++ ring->TxDescArray, ++ ring->TxPhyAddr); ++ ring->TxDescArray = NULL; ++ } ++ } ++} ++ ++static void rtl8125_free_rx_desc(struct rtl8125_private *tp) ++{ ++ struct rtl8125_rx_ring *ring; ++ struct pci_dev *pdev = tp->pci_dev; ++ int i; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ ring = &tp->rx_ring[i]; ++ if (ring->RxDescArray) { ++ dma_free_coherent(&pdev->dev, ++ ring->RxDescAllocSize, ++ ring->RxDescArray, ++ ring->RxPhyAddr); ++ ring->RxDescArray = NULL; ++ } ++ } ++} ++ ++static void rtl8125_free_alloc_resources(struct rtl8125_private *tp) ++{ ++ rtl8125_free_rx_desc(tp); ++ ++ rtl8125_free_tx_desc(tp); ++} ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++static void rtl8125_request_firmware(struct rtl8125_private *tp) ++{ ++ struct rtl8125_fw *rtl_fw; ++ ++ /* firmware loaded already or no firmware available */ ++ if (tp->rtl_fw || !tp->fw_name) ++ return; ++ ++ rtl_fw = kzalloc(sizeof(*rtl_fw), GFP_KERNEL); ++ if (!rtl_fw) ++ return; ++ ++ rtl_fw->phy_write = rtl8125_mdio_write; ++ rtl_fw->phy_read = rtl8125_mdio_read; ++ rtl_fw->mac_mcu_write = mac_mcu_write; ++ rtl_fw->mac_mcu_read = mac_mcu_read; ++ rtl_fw->fw_name = tp->fw_name; ++ rtl_fw->dev = tp_to_dev(tp); ++ ++ if (rtl8125_fw_request_firmware(rtl_fw)) ++ kfree(rtl_fw); ++ else ++ tp->rtl_fw = rtl_fw; ++} ++#endif ++ ++int rtl8125_open(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int retval; ++ ++ retval = -ENOMEM; ++ ++#ifdef ENABLE_R8125_PROCFS ++ rtl8125_proc_init(dev); ++#endif ++ rtl8125_set_rxbufsize(tp, dev); ++ /* ++ * Rx and Tx descriptors needs 256 bytes alignment. ++ * pci_alloc_consistent provides more. 
++ */ ++ if (rtl8125_alloc_tx_desc(tp) < 0 || rtl8125_alloc_rx_desc(tp) < 0) ++ goto err_free_all_allocated_mem; ++ ++ retval = rtl8125_init_ring(dev); ++ if (retval < 0) ++ goto err_free_all_allocated_mem; ++ ++ retval = rtl8125_alloc_irq(tp); ++ if (retval < 0) ++ goto err_free_all_allocated_mem; ++ ++ if (netif_msg_probe(tp)) { ++ printk(KERN_INFO "%s: 0x%lx, " ++ "%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, " ++ "IRQ %d\n", ++ dev->name, ++ dev->base_addr, ++ dev->dev_addr[0], dev->dev_addr[1], ++ dev->dev_addr[2], dev->dev_addr[3], ++ dev->dev_addr[4], dev->dev_addr[5], dev->irq); ++ } ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++ rtl8125_request_firmware(tp); ++#endif ++ pci_set_master(tp->pci_dev); ++ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_enable_napi(tp); ++#endif ++ ++ rtl8125_exit_oob(dev); ++ ++ rtl8125_up(dev); ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->EnablePtp) ++ rtl8125_ptp_init(tp); ++#endif ++ clear_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ if (tp->resume_not_chg_speed) ++ _rtl8125_check_link_status(dev, R8125_LINK_STATE_UNKNOWN); ++ else ++ rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ ++ if (tp->esd_flag == 0) { ++ //rtl8125_request_esd_timer(dev); ++ ++ rtl8125_schedule_esd_work(tp); ++ } ++ ++ //rtl8125_request_link_timer(dev); ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ rtl8125_schedule_link_work(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++out: ++ ++ return retval; ++ ++err_free_all_allocated_mem: ++ rtl8125_free_alloc_resources(tp); ++ ++ goto out; ++} ++ ++static void ++_rtl8125_set_l1_l0s_entry_latency(struct rtl8125_private *tp, u8 setting) ++{ ++ u32 csi_tmp; ++ u32 temp; ++ ++ temp = setting & 0x3f; ++ temp <<= 24; ++ /*set PCI configuration space offset 0x70F to setting*/ ++ /*When the register offset of PCI configuration space larger than 0xff, use CSI to access it.*/ ++ ++ csi_tmp = rtl8125_csi_read(tp, 0x70c) & 0xc0ffffff; ++ rtl8125_csi_write(tp, 0x70c, csi_tmp | temp); ++} ++ ++static void ++rtl8125_set_l1_l0s_entry_latency(struct rtl8125_private *tp) ++{ ++ _rtl8125_set_l1_l0s_entry_latency(tp, 0x27); ++} ++ ++static void ++_rtl8125_set_mrrs(struct rtl8125_private *tp, u8 setting) ++{ ++ struct pci_dev *pdev = tp->pci_dev; ++ u8 device_control; ++ ++ pci_read_config_byte(pdev, 0x79, &device_control); ++ device_control &= ~0x70; ++ device_control |= setting; ++ pci_write_config_byte(pdev, 0x79, device_control); ++} ++ ++static void ++rtl8125_set_mrrs(struct rtl8125_private *tp) ++{ ++ if (hwoptimize & HW_PATCH_SOC_LAN) ++ return; ++ ++ _rtl8125_set_mrrs(tp, 0x40); ++} ++ ++void ++rtl8125_hw_set_rx_packet_filter(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 mc_filter[2]; /* Multicast hash filter */ ++ int rx_mode; ++ u32 tmp = 0; ++ ++ if (dev->flags & IFF_PROMISC) { ++ /* Unconditionally log net taps. */ ++ if (netif_msg_link(tp)) ++ printk(KERN_NOTICE "%s: Promiscuous mode enabled.\n", ++ dev->name); ++ ++ rx_mode = ++ AcceptBroadcast | AcceptMulticast | AcceptMyPhys | ++ AcceptAllPhys; ++ mc_filter[1] = mc_filter[0] = 0xffffffff; ++ } else if (dev->flags & IFF_ALLMULTI) { ++ /* accept all multicasts. 
*/ ++ rx_mode = AcceptBroadcast | AcceptMulticast | AcceptMyPhys; ++ mc_filter[1] = mc_filter[0] = 0xffffffff; ++ } else { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) ++ struct dev_mc_list *mclist; ++ unsigned int i; ++ ++ rx_mode = AcceptBroadcast | AcceptMyPhys; ++ mc_filter[1] = mc_filter[0] = 0; ++ for (i = 0, mclist = dev->mc_list; mclist && i < dev->mc_count; ++ i++, mclist = mclist->next) { ++ int bit_nr = ether_crc(ETH_ALEN, mclist->dmi_addr) >> 26; ++ mc_filter[bit_nr >> 5] |= 1 << (bit_nr & 31); ++ rx_mode |= AcceptMulticast; ++ } ++#else ++ struct netdev_hw_addr *ha; ++ ++ rx_mode = AcceptBroadcast | AcceptMyPhys; ++ mc_filter[1] = mc_filter[0] = 0; ++ netdev_for_each_mc_addr(ha, dev) { ++ int bit_nr = ether_crc(ETH_ALEN, ha->addr) >> 26; ++ mc_filter[bit_nr >> 5] |= 1 << (bit_nr & 31); ++ rx_mode |= AcceptMulticast; ++ } ++#endif ++ } ++ ++ if (dev->features & NETIF_F_RXALL) ++ rx_mode |= (AcceptErr | AcceptRunt); ++ ++ tmp = mc_filter[0]; ++ mc_filter[0] = swab32(mc_filter[1]); ++ mc_filter[1] = swab32(tmp); ++ ++ tmp = tp->rtl8125_rx_config | rx_mode | (RTL_R32(tp, RxConfig) & rtl_chip_info[tp->chipset].RxConfigMask); ++ ++ RTL_W32(tp, RxConfig, tmp); ++ RTL_W32(tp, MAR0 + 0, mc_filter[0]); ++ RTL_W32(tp, MAR0 + 4, mc_filter[1]); ++} ++ ++static void ++rtl8125_set_rx_mode(struct net_device *dev) ++{ ++ rtl8125_hw_set_rx_packet_filter(dev); ++} ++ ++void ++rtl8125_set_rx_q_num(struct rtl8125_private *tp, ++ unsigned int num_rx_queues) ++{ ++ u16 q_ctrl; ++ u16 rx_q_num; ++ ++ rx_q_num = (u16)ilog2(num_rx_queues); ++ rx_q_num &= (BIT_0 | BIT_1 | BIT_2); ++ rx_q_num <<= 2; ++ q_ctrl = RTL_R16(tp, Q_NUM_CTRL_8125); ++ q_ctrl &= ~(BIT_2 | BIT_3 | BIT_4); ++ q_ctrl |= rx_q_num; ++ RTL_W16(tp, Q_NUM_CTRL_8125, q_ctrl); ++} ++ ++void ++rtl8125_set_tx_q_num(struct rtl8125_private *tp, ++ unsigned int num_tx_queues) ++{ ++ u16 mac_ocp_data; ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE63E); ++ mac_ocp_data &= ~(BIT_11 | BIT_10); ++ mac_ocp_data |= ((ilog2(num_tx_queues) & 0x03) << 10); ++ rtl8125_mac_ocp_write(tp, 0xE63E, mac_ocp_data); ++} ++ ++void ++rtl8125_enable_mcu(struct rtl8125_private *tp, bool enable) ++{ ++ if (FALSE == HW_SUPPORT_MAC_MCU(tp)) ++ return; ++ ++ if (enable) ++ rtl8125_set_mac_ocp_bit(tp, 0xC0B4, BIT_0); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xC0B4, BIT_0); ++} ++ ++static void ++rtl8125_clear_tcam_entries(struct rtl8125_private *tp) ++{ ++ if (FALSE == HW_SUPPORT_TCAM(tp)) ++ return; ++ ++ rtl8125_set_mac_ocp_bit(tp, 0xEB54, BIT_0); ++ udelay(1); ++ rtl8125_clear_mac_ocp_bit(tp, 0xEB54, BIT_0); ++} ++ ++static void ++rtl8125_enable_tcam(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppTcamVer != 1) ++ return; ++ ++ RTL_W16(tp, 0x382, 0x221B); ++} ++ ++static u8 ++rtl8125_get_l1off_cap_bits(struct rtl8125_private *tp) ++{ ++ u8 l1offCapBits = 0; ++ ++ l1offCapBits = (BIT_0 | BIT_1); ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ l1offCapBits |= (BIT_2 | BIT_3); ++ break; ++ default: ++ break; ++ } ++ ++ return l1offCapBits; ++} ++ ++void ++rtl8125_hw_config(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct pci_dev *pdev = tp->pci_dev; ++ u16 mac_ocp_data; ++ ++ rtl8125_disable_rx_packet_filter(tp); ++ ++ rtl8125_hw_reset(dev); ++ ++ rtl8125_enable_cfg9346_write(tp); ++ ++ rtl8125_enable_force_clkreq(tp, 0); ++ 
rtl8125_enable_aspm_clkreq_lock(tp, 0); ++ ++ rtl8125_set_eee_lpi_timer(tp); ++ ++ //keep magic packet only ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xC0B6); ++ mac_ocp_data &= BIT_0; ++ rtl8125_mac_ocp_write(tp, 0xC0B6, mac_ocp_data); ++ ++ rtl8125_tally_counter_addr_fill(tp); ++ ++ rtl8125_enable_extend_tally_couter(tp); ++ ++ rtl8125_desc_addr_fill(tp); ++ ++ /* Set DMA burst size and Interframe Gap Time */ ++ RTL_W32(tp, TxConfig, (TX_DMA_BURST_unlimited << TxDMAShift) | ++ (InterFrameGap << TxInterFrameGapShift)); ++ ++ if (tp->EnableTxNoClose) ++ RTL_W32(tp, TxConfig, (RTL_R32(tp, TxConfig) | BIT_6)); ++ ++ if (enable_double_vlan) ++ rtl8125_enable_double_vlan(tp); ++ else ++ rtl8125_disable_double_vlan(tp); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2 ... CFG_METHOD_7: ++ rtl8125_enable_tcam(tp); ++ break; ++ } ++ ++ rtl8125_set_l1_l0s_entry_latency(tp); ++ ++ rtl8125_set_mrrs(tp); ++ ++#ifdef ENABLE_RSS_SUPPORT ++ rtl8125_config_rss(tp); ++#else ++ RTL_W32(tp, RSS_CTRL_8125, 0x00); ++#endif ++ rtl8125_set_rx_q_num(tp, rtl8125_tot_rx_rings(tp)); ++ ++ RTL_W8(tp, Config1, RTL_R8(tp, Config1) & ~0x10); ++ ++ rtl8125_mac_ocp_write(tp, 0xC140, 0xFFFF); ++ rtl8125_mac_ocp_write(tp, 0xC142, 0xFFFF); ++ ++ //new tx desc format ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEB58); ++ mac_ocp_data |= (BIT_0); ++ rtl8125_mac_ocp_write(tp, 0xEB58, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE614); ++ mac_ocp_data &= ~(BIT_10 | BIT_9 | BIT_8); ++ if (tp->mcfg == CFG_METHOD_4 || tp->mcfg == CFG_METHOD_5 || ++ tp->mcfg == CFG_METHOD_7) ++ mac_ocp_data |= ((2 & 0x07) << 8); ++ else ++ mac_ocp_data |= ((3 & 0x07) << 8); ++ rtl8125_mac_ocp_write(tp, 0xE614, mac_ocp_data); ++ ++ rtl8125_set_tx_q_num(tp, rtl8125_tot_tx_rings(tp)); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE63E); ++ mac_ocp_data &= ~(BIT_5 | BIT_4); ++ mac_ocp_data |= (0x02 << 4); ++ rtl8125_mac_ocp_write(tp, 0xE63E, mac_ocp_data); ++ ++ rtl8125_enable_mcu(tp, 0); ++ rtl8125_enable_mcu(tp, 1); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xC0B4); ++ mac_ocp_data |= (BIT_3 | BIT_2); ++ rtl8125_mac_ocp_write(tp, 0xC0B4, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEB6A); ++ mac_ocp_data &= ~(BIT_7 | BIT_6 | BIT_5 | BIT_4 | BIT_3 | BIT_2 | BIT_1 | BIT_0); ++ mac_ocp_data |= (BIT_5 | BIT_4 | BIT_1 | BIT_0); ++ rtl8125_mac_ocp_write(tp, 0xEB6A, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEB50); ++ mac_ocp_data &= ~(BIT_9 | BIT_8 | BIT_7 | BIT_6 | BIT_5); ++ mac_ocp_data |= (BIT_6); ++ rtl8125_mac_ocp_write(tp, 0xEB50, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE056); ++ mac_ocp_data &= ~(BIT_7 | BIT_6 | BIT_5 | BIT_4); ++ //mac_ocp_data |= (BIT_4 | BIT_5); ++ rtl8125_mac_ocp_write(tp, 0xE056, mac_ocp_data); ++ ++ RTL_W8(tp, TDFNR, 0x10); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE040); ++ mac_ocp_data &= ~(BIT_12); ++ rtl8125_mac_ocp_write(tp, 0xE040, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEA1C); ++ mac_ocp_data &= ~(BIT_1 | BIT_0); ++ mac_ocp_data |= (BIT_0); ++ rtl8125_mac_ocp_write(tp, 0xEA1C, mac_ocp_data); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ rtl8125_oob_mutex_lock(tp); ++ break; ++ } ++ ++ if (tp->mcfg == CFG_METHOD_10 || tp->mcfg == CFG_METHOD_11 || ++ tp->mcfg == CFG_METHOD_13) ++ rtl8125_mac_ocp_write(tp, 0xE0C0, 0x4403); ++ else ++ rtl8125_mac_ocp_write(tp, 0xE0C0, 0x4000); ++ ++ 
rtl8125_set_mac_ocp_bit(tp, 0xE052, (BIT_6 | BIT_5)); ++ rtl8125_clear_mac_ocp_bit(tp, 0xE052, BIT_3 | BIT_7); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ rtl8125_oob_mutex_unlock(tp); ++ break; ++ } ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xD430); ++ mac_ocp_data &= ~(BIT_11 | BIT_10 | BIT_9 | BIT_8 | BIT_7 | BIT_6 | BIT_5 | BIT_4 | BIT_3 | BIT_2 | BIT_1 | BIT_0); ++ mac_ocp_data |= 0x45F; ++ rtl8125_mac_ocp_write(tp, 0xD430, mac_ocp_data); ++ ++ //rtl8125_mac_ocp_write(tp, 0xE0C0, 0x4F87); ++ if (!tp->DASH) ++ RTL_W8(tp, 0xD0, RTL_R8(tp, 0xD0) | BIT_6 | BIT_7); ++ else ++ RTL_W8(tp, 0xD0, RTL_R8(tp, 0xD0) & ~(BIT_6 | BIT_7)); ++ ++ if (tp->mcfg == CFG_METHOD_2 || tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) ++ RTL_W8(tp, MCUCmd_reg, RTL_R8(tp, MCUCmd_reg) | BIT_0); ++ ++ if (tp->mcfg != CFG_METHOD_10 && tp->mcfg != CFG_METHOD_11 && ++ tp->mcfg != CFG_METHOD_13) ++ rtl8125_disable_eee_plus(tp); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEA1C); ++ mac_ocp_data &= ~(BIT_2); ++ rtl8125_mac_ocp_write(tp, 0xEA1C, mac_ocp_data); ++ ++ rtl8125_clear_tcam_entries(tp); ++ ++ RTL_W16(tp, 0x1880, RTL_R16(tp, 0x1880) & ~(BIT_4 | BIT_5)); ++ ++ if (tp->HwSuppRxDescType == RX_DESC_RING_TYPE_4) { ++ if (tp->InitRxDescType == RX_DESC_RING_TYPE_4) ++ RTL_W8(tp, 0xd8, RTL_R8(tp, 0xd8) | ++ EnableRxDescV4_0); ++ else ++ RTL_W8(tp, 0xd8, RTL_R8(tp, 0xd8) & ++ ~EnableRxDescV4_0); ++ } ++ ++ if (tp->mcfg == CFG_METHOD_12) { ++ rtl8125_clear_mac_ocp_bit(tp, 0xE00C, BIT_12); ++ ++ rtl8125_clear_mac_ocp_bit(tp, 0xC0C2, BIT_6); ++ } ++ ++ /* csum offload command for RTL8125 */ ++ tp->tx_tcp_csum_cmd = TxTCPCS_C; ++ tp->tx_udp_csum_cmd = TxUDPCS_C; ++ tp->tx_ip_csum_cmd = TxIPCS_C; ++ tp->tx_ipv6_csum_cmd = TxIPV6F_C; ++ ++ /* config interrupt type for RTL8125B */ ++ if (tp->HwSuppIsrVer > 1) ++ rtl8125_hw_set_interrupt_type(tp, tp->HwCurrIsrVer); ++ ++ //other hw parameters ++ rtl8125_hw_clear_timer_int(dev); ++ ++ rtl8125_hw_clear_int_miti(dev); ++ ++ if (tp->use_timer_interrupt && ++ (tp->HwCurrIsrVer > 1) && ++ (tp->HwSuppIntMitiVer > 3) && ++ (tp->features & RTL_FEATURE_MSIX)) { ++ int i; ++ for (i = 0; i < tp->irq_nvecs; i++) ++ rtl8125_hw_set_timer_int(tp, i, timer_count_v2); ++ } ++ ++ rtl8125_enable_exit_l1_mask(tp); ++ ++ rtl8125_mac_ocp_write(tp, 0xE098, 0xC302); ++ ++ if (aspm && (tp->org_pci_offset_99 & (BIT_2 | BIT_5 | BIT_6))) ++ rtl8125_init_pci_offset_99(tp); ++ else ++ rtl8125_disable_pci_offset_99(tp); ++ ++ if (aspm && (tp->org_pci_offset_180 & rtl8125_get_l1off_cap_bits(tp))) ++ rtl8125_init_pci_offset_180(tp); ++ else ++ rtl8125_disable_pci_offset_180(tp); ++ ++ if (tp->RequiredPfmPatch) ++ rtl8125_set_pfm_patch(tp, 0); ++ ++ tp->cp_cmd &= ~(EnableBist | Macdbgo_oe | Force_halfdup | ++ Force_rxflow_en | Force_txflow_en | Cxpl_dbg_sel | ++ ASF | Macdbgo_sel); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ RTL_W16(tp, CPlusCmd, tp->cp_cmd); ++#else ++ rtl8125_hw_set_features(dev, dev->features); ++#endif ++ rtl8125_set_rms(tp, tp->rms); ++ ++ rtl8125_disable_rxdvgate(dev); ++ ++ if (!tp->pci_cfg_is_read) { ++ pci_read_config_byte(pdev, PCI_COMMAND, &tp->pci_cfg_space.cmd); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_0, &tp->pci_cfg_space.io_base_l); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_0 + 2, &tp->pci_cfg_space.io_base_h); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_2, &tp->pci_cfg_space.mem_base_l); ++ pci_read_config_word(pdev, 
PCI_BASE_ADDRESS_2 + 2, &tp->pci_cfg_space.mem_base_h); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_3, &tp->pci_cfg_space.resv_0x1c_l); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_3 + 2, &tp->pci_cfg_space.resv_0x1c_h); ++ pci_read_config_byte(pdev, PCI_INTERRUPT_LINE, &tp->pci_cfg_space.ilr); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_4, &tp->pci_cfg_space.resv_0x20_l); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_4 + 2, &tp->pci_cfg_space.resv_0x20_h); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_5, &tp->pci_cfg_space.resv_0x24_l); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_5 + 2, &tp->pci_cfg_space.resv_0x24_h); ++ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &tp->pci_cfg_space.resv_0x2c_l); ++ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID + 2, &tp->pci_cfg_space.resv_0x2c_h); ++ if (tp->HwPcieSNOffset > 0) { ++ tp->pci_cfg_space.pci_sn_l = rtl8125_csi_read(tp, tp->HwPcieSNOffset); ++ tp->pci_cfg_space.pci_sn_h = rtl8125_csi_read(tp, tp->HwPcieSNOffset + 4); ++ } ++ ++ tp->pci_cfg_is_read = 1; ++ } ++ ++ /* Set Rx packet filter */ ++ rtl8125_hw_set_rx_packet_filter(dev); ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_check_and_enable_dash_interrupt(tp); ++#endif ++ ++ rtl8125_enable_aspm_clkreq_lock(tp, aspm ? 1 : 0); ++ ++ rtl8125_disable_cfg9346_write(tp); ++ ++ udelay(10); ++} ++ ++void ++rtl8125_hw_start(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++#ifdef ENABLE_LIB_SUPPORT ++ rtl8125_init_lib_ring(tp); ++#endif ++ ++ RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb); ++ ++ rtl8125_enable_hw_interrupt(tp); ++ ++ rtl8125_lib_reset_complete(tp); ++} ++ ++static int ++rtl8125_change_mtu(struct net_device *dev, ++ int new_mtu) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret = 0; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0) ++ if (new_mtu < ETH_MIN_MTU) ++ return -EINVAL; ++ else if (new_mtu > tp->max_jumbo_frame_size) ++ new_mtu = tp->max_jumbo_frame_size; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0) ++ ++ dev->mtu = new_mtu; ++ ++ tp->eee.tx_lpi_timer = dev->mtu + ETH_HLEN + 0x20; ++ ++ if (!netif_running(dev)) ++ goto out; ++ ++ rtl8125_down(dev); ++ ++ rtl8125_set_rxbufsize(tp, dev); ++ ++ ret = rtl8125_init_ring(dev); ++ ++ if (ret < 0) ++ goto err_out; ++ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_enable_napi(tp); ++#endif//CONFIG_R8125_NAPI ++ ++ if (tp->link_ok(dev)) ++ rtl8125_link_on_patch(dev); ++ else ++ rtl8125_link_down_patch(dev); ++ ++ //mod_timer(&tp->esd_timer, jiffies + RTL8125_ESD_TIMEOUT); ++ //mod_timer(&tp->link_timer, jiffies + RTL8125_LINK_TIMEOUT); ++out: ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++ netdev_update_features(dev); ++#endif ++ ++err_out: ++ return ret; ++} ++ ++static inline void ++rtl8125_set_desc_dma_addr(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ dma_addr_t mapping) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ ((struct RxDescV3 *)desc)->addr = cpu_to_le64(mapping); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ ((struct RxDescV4 *)desc)->addr = cpu_to_le64(mapping); ++ break; ++ default: ++ desc->addr = cpu_to_le64(mapping); ++ break; ++ } ++} ++ ++static inline void ++rtl8125_mark_to_asic_v1(struct RxDesc *desc, ++ u32 rx_buf_sz) ++{ ++ u32 eor = le32_to_cpu(desc->opts1) & RingEnd; ++ ++ WRITE_ONCE(desc->opts1, cpu_to_le32(DescOwn | eor | rx_buf_sz)); ++} ++ ++static inline void ++rtl8125_mark_to_asic_v3(struct RxDescV3 *descv3, ++ u32 rx_buf_sz) ++{ ++ u32 eor = le32_to_cpu(descv3->RxDescNormalDDWord4.opts1) & RingEnd; ++ 
++ WRITE_ONCE(descv3->RxDescNormalDDWord4.opts1, cpu_to_le32(DescOwn | eor | rx_buf_sz)); ++} ++ ++static inline void ++rtl8125_mark_to_asic_v4(struct RxDescV4 *descv4, ++ u32 rx_buf_sz) ++{ ++ u32 eor = le32_to_cpu(descv4->RxDescNormalDDWord2.opts1) & RingEnd; ++ ++ WRITE_ONCE(descv4->RxDescNormalDDWord2.opts1, cpu_to_le32(DescOwn | eor | rx_buf_sz)); ++} ++ ++void ++rtl8125_mark_to_asic(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ u32 rx_buf_sz) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ rtl8125_mark_to_asic_v3((struct RxDescV3 *)desc, rx_buf_sz); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_mark_to_asic_v4((struct RxDescV4 *)desc, rx_buf_sz); ++ break; ++ default: ++ rtl8125_mark_to_asic_v1(desc, rx_buf_sz); ++ break; ++ } ++} ++ ++static inline void ++rtl8125_map_to_asic(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct RxDesc *desc, ++ dma_addr_t mapping, ++ u32 rx_buf_sz, ++ const u32 cur_rx) ++{ ++ ring->RxDescPhyAddr[cur_rx] = mapping; ++ rtl8125_set_desc_dma_addr(tp, desc, mapping); ++ wmb(); ++ rtl8125_mark_to_asic(tp, desc, rx_buf_sz); ++} ++ ++#ifdef ENABLE_PAGE_REUSE ++ ++static int ++rtl8125_alloc_rx_page(struct rtl8125_private *tp, struct rtl8125_rx_ring *ring, ++ struct rtl8125_rx_buffer *rxb) ++{ ++ struct page *page; ++ dma_addr_t dma; ++ unsigned int order = tp->rx_buf_page_order; ++ ++ //get free page ++ page = dev_alloc_pages(order); ++ ++ if (unlikely(!page)) ++ return -ENOMEM; ++ ++ dma = dma_map_page_attrs(&tp->pci_dev->dev, page, 0, ++ tp->rx_buf_page_size, ++ DMA_FROM_DEVICE, ++ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); ++ ++ if (unlikely(dma_mapping_error(&tp->pci_dev->dev, dma))) { ++ __free_pages(page, order); ++ return -ENOMEM; ++ } ++ ++ rxb->page = page; ++ rxb->data = page_address(page); ++ rxb->page_offset = ring->rx_offset; ++ rxb->dma = dma; ++ ++ //after page alloc, page refcount already = 1 ++ ++ return 0; ++} ++ ++static void ++rtl8125_free_rx_page(struct rtl8125_private *tp, struct rtl8125_rx_buffer *rxb) ++{ ++ if (!rxb->page) ++ return; ++ ++ dma_unmap_page_attrs(&tp->pci_dev->dev, rxb->dma, ++ tp->rx_buf_page_size, ++ DMA_FROM_DEVICE, ++ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); ++ __free_pages(rxb->page, tp->rx_buf_page_order); ++ rxb->page = NULL; ++} ++ ++static void ++_rtl8125_rx_clear(struct rtl8125_private *tp, struct rtl8125_rx_ring *ring) ++{ ++ int i; ++ struct rtl8125_rx_buffer *rxb; ++ ++ for (i = 0; i < ring->num_rx_desc; i++) { ++ rxb = &ring->rx_buffer[i]; ++ if (rxb->skb) { ++ dev_kfree_skb(rxb->skb); ++ rxb->skb = NULL; ++ } ++ rtl8125_free_rx_page(tp, rxb); ++ } ++} ++ ++static u32 ++rtl8125_rx_fill(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct net_device *dev, ++ u32 start, ++ u32 end, ++ u8 in_intr) ++{ ++ u32 cur; ++ struct rtl8125_rx_buffer *rxb; ++ ++ for (cur = start; end - cur > 0; cur++) { ++ int ret, i = cur % ring->num_rx_desc; ++ ++ rxb = &ring->rx_buffer[i]; ++ if (rxb->page) ++ continue; ++ ++ ret = rtl8125_alloc_rx_page(tp, ring, rxb); ++ if (ret) ++ break; ++ ++ dma_sync_single_range_for_device(tp_to_dev(tp), ++ rxb->dma, ++ rxb->page_offset, ++ tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++ ++ rtl8125_map_to_asic(tp, ring, ++ rtl8125_get_rxdesc(tp, ring->RxDescArray, i), ++ rxb->dma + rxb->page_offset, ++ tp->rx_buf_sz, i); ++ } ++ return cur - start; ++} ++ ++#else //ENABLE_PAGE_REUSE ++ ++static void ++rtl8125_free_rx_skb(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct sk_buff **sk_buff, ++ 
struct RxDesc *desc, ++ const u32 cur_rx) ++{ ++ struct pci_dev *pdev = tp->pci_dev; ++ ++ dma_unmap_single(&pdev->dev, ring->RxDescPhyAddr[cur_rx], tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++ dev_kfree_skb(*sk_buff); ++ *sk_buff = NULL; ++ rtl8125_make_unusable_by_asic(tp, desc); ++} ++ ++static int ++rtl8125_alloc_rx_skb(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct sk_buff **sk_buff, ++ struct RxDesc *desc, ++ int rx_buf_sz, ++ const u32 cur_rx, ++ u8 in_intr) ++{ ++ struct sk_buff *skb; ++ dma_addr_t mapping; ++ int ret = 0; ++ ++ if (in_intr) ++ skb = RTL_ALLOC_SKB_INTR(&tp->r8125napi[ring->index].napi, rx_buf_sz + R8125_RX_ALIGN); ++ else ++ skb = dev_alloc_skb(rx_buf_sz + R8125_RX_ALIGN); ++ ++ if (unlikely(!skb)) ++ goto err_out; ++ ++ if (!in_intr || !R8125_USE_NAPI_ALLOC_SKB) ++ skb_reserve(skb, R8125_RX_ALIGN); ++ ++ mapping = dma_map_single(tp_to_dev(tp), skb->data, rx_buf_sz, ++ DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(tp_to_dev(tp), mapping))) { ++ if (unlikely(net_ratelimit())) ++ netif_err(tp, drv, tp->dev, "Failed to map RX DMA!\n"); ++ goto err_out; ++ } ++ ++ *sk_buff = skb; ++ rtl8125_map_to_asic(tp, ring, desc, mapping, rx_buf_sz, cur_rx); ++out: ++ return ret; ++ ++err_out: ++ if (skb) ++ dev_kfree_skb(skb); ++ ret = -ENOMEM; ++ rtl8125_make_unusable_by_asic(tp, desc); ++ goto out; ++} ++ ++static void ++_rtl8125_rx_clear(struct rtl8125_private *tp, struct rtl8125_rx_ring *ring) ++{ ++ int i; ++ ++ for (i = 0; i < ring->num_rx_desc; i++) { ++ if (ring->Rx_skbuff[i]) { ++ rtl8125_free_rx_skb(tp, ++ ring, ++ ring->Rx_skbuff + i, ++ rtl8125_get_rxdesc(tp, ring->RxDescArray, i), ++ i); ++ ring->Rx_skbuff[i] = NULL; ++ } ++ } ++} ++ ++static u32 ++rtl8125_rx_fill(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct net_device *dev, ++ u32 start, ++ u32 end, ++ u8 in_intr) ++{ ++ u32 cur; ++ ++ for (cur = start; end - cur > 0; cur++) { ++ int ret, i = cur % ring->num_rx_desc; ++ ++ if (ring->Rx_skbuff[i]) ++ continue; ++ ++ ret = rtl8125_alloc_rx_skb(tp, ++ ring, ++ ring->Rx_skbuff + i, ++ rtl8125_get_rxdesc(tp, ring->RxDescArray, i), ++ tp->rx_buf_sz, ++ i, ++ in_intr); ++ if (ret < 0) ++ break; ++ } ++ return cur - start; ++} ++ ++#endif //ENABLE_PAGE_REUSE ++ ++void ++rtl8125_rx_clear(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ ++ _rtl8125_rx_clear(tp, ring); ++ } ++} ++ ++static void ++rtl8125_mark_as_last_descriptor_v1(struct RxDesc *desc) ++{ ++ desc->opts1 |= cpu_to_le32(RingEnd); ++} ++ ++static void ++rtl8125_mark_as_last_descriptor_v3(struct RxDescV3 *descv3) ++{ ++ descv3->RxDescNormalDDWord4.opts1 |= cpu_to_le32(RingEnd); ++} ++ ++static void ++rtl8125_mark_as_last_descriptor_v4(struct RxDescV4 *descv4) ++{ ++ descv4->RxDescNormalDDWord2.opts1 |= cpu_to_le32(RingEnd); ++} ++ ++void ++rtl8125_mark_as_last_descriptor(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ rtl8125_mark_as_last_descriptor_v3((struct RxDescV3 *)desc); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_mark_as_last_descriptor_v4((struct RxDescV4 *)desc); ++ break; ++ default: ++ rtl8125_mark_as_last_descriptor_v1(desc); ++ break; ++ } ++} ++ ++static void ++rtl8125_desc_addr_fill(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ RTL_W32(tp, ring->tdsar_reg, ((u64)ring->TxPhyAddr & 
DMA_BIT_MASK(32))); ++ RTL_W32(tp, ring->tdsar_reg + 4, ((u64)ring->TxPhyAddr >> 32)); ++ } ++ ++ if (rtl8125_num_lib_rx_rings(tp) == 0) { ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ RTL_W32(tp, ring->rdsar_reg, ((u64)ring->RxPhyAddr & DMA_BIT_MASK(32))); ++ RTL_W32(tp, ring->rdsar_reg + 4, ((u64)ring->RxPhyAddr >> 32)); ++ } ++ } ++} ++ ++static void ++rtl8125_tx_desc_init(struct rtl8125_private *tp) ++{ ++ int i = 0; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ memset(ring->TxDescArray, 0x0, ring->TxDescAllocSize); ++ ++ ring->TxDescArray[ring->num_tx_desc - 1].opts1 = cpu_to_le32(RingEnd); ++ } ++} ++ ++static void ++rtl8125_rx_desc_init(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ memset(ring->RxDescArray, 0x0, ring->RxDescAllocSize); ++ } ++} ++ ++int ++rtl8125_init_ring(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ rtl8125_init_ring_indexes(tp); ++ ++ rtl8125_tx_desc_init(tp); ++ rtl8125_rx_desc_init(tp); ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ memset(ring->tx_skb, 0x0, sizeof(ring->tx_skb)); ++ } ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++#ifdef ENABLE_PAGE_REUSE ++ ring->rx_offset = R8125_RX_ALIGN; ++#else ++ memset(ring->Rx_skbuff, 0x0, sizeof(ring->Rx_skbuff)); ++#endif //ENABLE_PAGE_REUSE ++ if (rtl8125_rx_fill(tp, ring, dev, 0, ring->num_rx_desc, 0) != ring->num_rx_desc) ++ goto err_out; ++ ++ rtl8125_mark_as_last_descriptor(tp, rtl8125_get_rxdesc(tp, ring->RxDescArray, ring->num_rx_desc - 1)); ++ } ++ ++ return 0; ++ ++err_out: ++ rtl8125_rx_clear(tp); ++ return -ENOMEM; ++} ++ ++static void ++rtl8125_unmap_tx_skb(struct pci_dev *pdev, ++ struct ring_info *tx_skb, ++ struct TxDesc *desc) ++{ ++ unsigned int len = tx_skb->len; ++ ++ dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len, DMA_TO_DEVICE); ++ ++ desc->opts1 = cpu_to_le32(RTK_MAGIC_DEBUG_VALUE); ++ desc->opts2 = 0x00; ++ desc->addr = RTL8125_MAGIC_NUMBER; ++ tx_skb->len = 0; ++} ++ ++static void ++rtl8125_tx_clear_range(struct rtl8125_private *tp, ++ struct rtl8125_tx_ring *ring, ++ u32 start, ++ unsigned int n) ++{ ++ unsigned int i; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) ++ struct net_device *dev = tp->dev; ++#endif ++ ++ for (i = 0; i < n; i++) { ++ unsigned int entry = (start + i) % ring->num_tx_desc; ++ struct ring_info *tx_skb = ring->tx_skb + entry; ++ unsigned int len = tx_skb->len; ++ ++ if (len) { ++ struct sk_buff *skb = tx_skb->skb; ++ ++ rtl8125_unmap_tx_skb(tp->pci_dev, tx_skb, ++ ring->TxDescArray + entry); ++ if (skb) { ++ RTLDEV->stats.tx_dropped++; ++ dev_kfree_skb_any(skb); ++ tx_skb->skb = NULL; ++ } ++ } ++ } ++} ++ ++void ++rtl8125_tx_clear(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ rtl8125_tx_clear_range(tp, ring, ring->dirty_tx, ring->num_tx_desc); ++ ring->cur_tx = ring->dirty_tx = 0; ++ } ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_schedule_reset_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_RESET_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->reset_task, 4); ++#endif //LINUX_VERSION_CODE > 
KERNEL_VERSION(2,6,0) ++} ++ ++static void rtl8125_schedule_esd_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_ESD_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->esd_task, RTL8125_ESD_TIMEOUT); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++} ++ ++static void rtl8125_schedule_linkchg_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_LINKCHG_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->linkchg_task, 4); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++} ++ ++static void rtl8125_schedule_link_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_LINK_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->link_task, RTL8125_LINK_TIMEOUT); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++} ++ ++static void rtl8125_schedule_dash_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_DASH_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->dash_task, RTL8125_DASH_TIMEOUT); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++} ++ ++#define rtl8125_cancel_schedule_reset_work(a) ++#define rtl8125_cancel_schedule_esd_work(a) ++#define rtl8125_cancel_schedule_linkchg_work(a) ++#define rtl8125_cancel_schedule_link_work(a) ++#define rtl8125_cancel_schedule_dash_work(a) ++ ++#else ++static void rtl8125_schedule_reset_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_RESET_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->reset_task, 4); ++} ++ ++static void rtl8125_cancel_schedule_reset_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->reset_task.work; ++ ++ if (!work->func) ++ return; ++ ++ cancel_delayed_work_sync(&tp->reset_task); ++} ++ ++static void rtl8125_schedule_esd_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_ESD_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->esd_task, RTL8125_ESD_TIMEOUT); ++} ++ ++static void rtl8125_cancel_schedule_esd_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->esd_task.work; ++ ++ if (!work->func) ++ return; ++ ++ cancel_delayed_work_sync(&tp->esd_task); ++} ++ ++static void rtl8125_schedule_linkchg_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_LINKCHG_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->linkchg_task, 4); ++} ++ ++static void rtl8125_cancel_schedule_linkchg_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->linkchg_task.work; ++ ++ if (!work->func) ++ return; ++ ++ cancel_delayed_work_sync(&tp->linkchg_task); ++} ++ ++static void rtl8125_schedule_link_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_LINK_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->link_task, RTL8125_LINK_TIMEOUT); ++} ++ ++static void rtl8125_cancel_schedule_link_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->link_task.work; ++ ++ if (!work->func) ++ return; ++ ++ cancel_delayed_work_sync(&tp->link_task); ++} ++ ++void rtl8125_schedule_dash_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_DASH_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->dash_task, RTL8125_DASH_TIMEOUT); ++} ++ ++static void rtl8125_cancel_schedule_dash_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->dash_task.work; ++ ++ if (!work->func) ++ 
return; ++ ++ cancel_delayed_work_sync(&tp->dash_task); ++} ++#endif ++ ++static void rtl8125_init_all_schedule_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++ INIT_WORK(&tp->reset_task, rtl8125_reset_task, dev); ++ INIT_WORK(&tp->esd_task, rtl8125_esd_task, dev); ++ INIT_WORK(&tp->linkchg_task, rtl8125_linkchg_task, dev); ++ INIT_WORK(&tp->link_task, rtl8125_link_task, dev); ++ INIT_WORK(&tp->dash_task, rtl8125_dash_task, dev); ++#else ++ INIT_DELAYED_WORK(&tp->reset_task, rtl8125_reset_task); ++ INIT_DELAYED_WORK(&tp->esd_task, rtl8125_esd_task); ++ INIT_DELAYED_WORK(&tp->linkchg_task, rtl8125_linkchg_task); ++ INIT_DELAYED_WORK(&tp->link_task, rtl8125_link_task); ++ INIT_DELAYED_WORK(&tp->dash_task, rtl8125_dash_task); ++#endif ++} ++ ++static void rtl8125_cancel_all_schedule_work(struct rtl8125_private *tp) ++{ ++ rtl8125_cancel_schedule_reset_work(tp); ++ rtl8125_cancel_schedule_esd_work(tp); ++ rtl8125_cancel_schedule_linkchg_work(tp); ++ rtl8125_cancel_schedule_link_work(tp); ++ rtl8125_cancel_schedule_dash_work(tp); ++} ++ ++static void ++rtl8125_wait_for_irq_complete(struct rtl8125_private *tp) ++{ ++ if (tp->features & RTL_FEATURE_MSIX) { ++ int i; ++ for (i = 0; i < tp->irq_nvecs; i++) ++ synchronize_irq(tp->irq_tbl[i].vector); ++ } else { ++ synchronize_irq(tp->dev->irq); ++ } ++} ++ ++void ++_rtl8125_wait_for_quiescence(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ /* Wait for any pending NAPI task to complete */ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_disable_napi(tp); ++#endif//CONFIG_R8125_NAPI ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,67) ++ /* Give a racing hard_start_xmit a few cycles to complete. */ ++ synchronize_net(); ++#endif ++ ++ rtl8125_irq_mask_and_ack(tp); ++ ++ rtl8125_wait_for_irq_complete(tp); ++} ++ ++static void ++rtl8125_wait_for_quiescence(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ //suppress unused variable ++ (void)(tp); ++ ++ _rtl8125_wait_for_quiescence(dev); ++ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_enable_napi(tp); ++#endif//CONFIG_R8125_NAPI ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_reset_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_reset_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, reset_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ int i; ++ ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_RESET_PENDING, tp->task_flags)) ++ goto out_unlock; ++ ++ netdev_err(dev, "Device reseting!\n"); ++ ++ netif_carrier_off(dev); ++ netif_tx_disable(dev); ++ _rtl8125_wait_for_quiescence(dev); ++ rtl8125_hw_reset(dev); ++ ++ rtl8125_tx_clear(tp); ++ ++ rtl8125_init_ring_indexes(tp); ++ ++ rtl8125_tx_desc_init(tp); ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring; ++ u32 entry; ++ ++ ring = &tp->rx_ring[i]; ++ for (entry = 0; entry < ring->num_rx_desc; entry++) { ++ struct RxDesc *desc; ++ ++ desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry); ++ rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz); ++ } ++ } ++ ++#ifdef ENABLE_PTP_SUPPORT ++ rtl8125_ptp_reset(tp); ++#endif ++ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_enable_napi(tp); ++#endif //CONFIG_R8125_NAPI ++ ++ if (tp->resume_not_chg_speed) { ++ 
_rtl8125_check_link_status(dev, R8125_LINK_STATE_UNKNOWN); ++ ++ tp->resume_not_chg_speed = 0; ++ } else { ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++ ++ rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ } ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_esd_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_esd_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, esd_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_ESD_CHECK_PENDING, tp->task_flags)) ++ goto out_unlock; ++ ++ rtl8125_esd_checker(tp); ++ ++ rtl8125_schedule_esd_work(tp); ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_linkchg_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ //struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_linkchg_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, linkchg_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_LINKCHG_CHECK_PENDING, tp->task_flags)) ++ goto out_unlock; ++ ++ rtl8125_check_link_status(dev); ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_link_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ //struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_link_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, link_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_LINK_CHECK_PENDING, ++ tp->task_flags)) ++ goto out_unlock; ++ ++ if (netif_carrier_ok(dev) != tp->link_ok(dev)) ++ rtl8125_schedule_linkchg_work(tp); ++ ++ rtl8125_schedule_link_work(tp); ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_dash_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ //struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_dash_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, dash_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_DASH_CHECK_PENDING, tp->task_flags)) ++ goto out_unlock; ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_handle_dash_interrupt(dev); ++#endif ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) ++static void ++rtl8125_tx_timeout(struct net_device *dev, unsigned int txqueue) ++#else ++static void ++rtl8125_tx_timeout(struct net_device *dev) ++#endif ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ netdev_err(dev, "Transmit timeout reset Device!\n"); ++ ++ /* Let's wait a bit while any (async) irq lands on */ ++ rtl8125_schedule_reset_work(tp); ++} 
++ ++static u32 ++rtl8125_get_txd_opts1(struct rtl8125_tx_ring *ring, ++ u32 opts1, ++ u32 len, ++ unsigned int entry) ++{ ++ u32 status = opts1 | len; ++ ++ if (entry == ring->num_tx_desc - 1) ++ status |= RingEnd; ++ ++ return status; ++} ++ ++static int ++rtl8125_xmit_frags(struct rtl8125_private *tp, ++ struct rtl8125_tx_ring *ring, ++ struct sk_buff *skb, ++ const u32 *opts) ++{ ++ struct skb_shared_info *info = skb_shinfo(skb); ++ unsigned int cur_frag, entry; ++ struct TxDesc *txd = NULL; ++ const unsigned char nr_frags = info->nr_frags; ++ unsigned long PktLenCnt = 0; ++ bool LsoPatchEnabled = FALSE; ++ ++ entry = ring->cur_tx; ++ for (cur_frag = 0; cur_frag < nr_frags; cur_frag++) { ++ skb_frag_t *frag = info->frags + cur_frag; ++ dma_addr_t mapping; ++ u32 status, len; ++ void *addr; ++ ++ entry = (entry + 1) % ring->num_tx_desc; ++ ++ txd = ring->TxDescArray + entry; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) ++ len = frag->size; ++ addr = ((void *) page_address(frag->page)) + frag->page_offset; ++#else ++ len = skb_frag_size(frag); ++ addr = skb_frag_address(frag); ++#endif ++ if (tp->RequireLSOPatch && ++ (cur_frag == nr_frags - 1) && ++ (opts[0] & (GiantSendv4|GiantSendv6)) && ++ PktLenCnt < ETH_FRAME_LEN && ++ len > 1) { ++ len -= 1; ++ mapping = dma_map_single(tp_to_dev(tp), addr, len, DMA_TO_DEVICE); ++ ++ if (unlikely(dma_mapping_error(tp_to_dev(tp), mapping))) { ++ if (unlikely(net_ratelimit())) ++ netif_err(tp, drv, tp->dev, ++ "Failed to map TX fragments DMA!\n"); ++ goto err_out; ++ } ++ ++ /* anti gcc 2.95.3 bugware (sic) */ ++ status = rtl8125_get_txd_opts1(ring, opts[0], len, entry); ++ ++ txd->addr = cpu_to_le64(mapping); ++ ++ ring->tx_skb[entry].len = len; ++ ++ txd->opts2 = cpu_to_le32(opts[1]); ++ wmb(); ++ txd->opts1 = cpu_to_le32(status); ++ ++ //second txd ++ addr += len; ++ len = 1; ++ entry = (entry + 1) % ring->num_tx_desc; ++ txd = ring->TxDescArray + entry; ++ cur_frag += 1; ++ ++ LsoPatchEnabled = TRUE; ++ } ++ ++ mapping = dma_map_single(tp_to_dev(tp), addr, len, DMA_TO_DEVICE); ++ ++ if (unlikely(dma_mapping_error(tp_to_dev(tp), mapping))) { ++ if (unlikely(net_ratelimit())) ++ netif_err(tp, drv, tp->dev, ++ "Failed to map TX fragments DMA!\n"); ++ goto err_out; ++ } ++ ++ /* anti gcc 2.95.3 bugware (sic) */ ++ status = rtl8125_get_txd_opts1(ring, opts[0], len, entry); ++ if (cur_frag == (nr_frags - 1) || LsoPatchEnabled == TRUE) ++ status |= LastFrag; ++ ++ txd->addr = cpu_to_le64(mapping); ++ ++ ring->tx_skb[entry].len = len; ++ ++ txd->opts2 = cpu_to_le32(opts[1]); ++ wmb(); ++ txd->opts1 = cpu_to_le32(status); ++ ++ PktLenCnt += len; ++ } ++ ++ return cur_frag; ++ ++err_out: ++ rtl8125_tx_clear_range(tp, ring, ring->cur_tx + 1, cur_frag); ++ return -EIO; ++} ++ ++static inline ++__be16 get_protocol(struct sk_buff *skb) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) ++ return vlan_get_protocol(skb); ++#else ++ __be16 protocol; ++ ++ if (skb->protocol == htons(ETH_P_8021Q)) ++ protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; ++ else ++ protocol = skb->protocol; ++ ++ return protocol; ++#endif ++} ++ ++static inline ++u8 rtl8125_get_l4_protocol(struct sk_buff *skb) ++{ ++ int no = skb_network_offset(skb); ++ struct ipv6hdr *i6h, _i6h; ++ struct iphdr *ih, _ih; ++ u8 ip_protocol = IPPROTO_RAW; ++ ++ switch (get_protocol(skb)) { ++ case __constant_htons(ETH_P_IP): ++ ih = skb_header_pointer(skb, no, sizeof(_ih), &_ih); ++ if (ih) ++ ip_protocol = ih->protocol; ++ break; ++ case __constant_htons(ETH_P_IPV6): ++ i6h = 
skb_header_pointer(skb, no, sizeof(_i6h), &_i6h); ++ if (i6h) ++ ip_protocol = i6h->nexthdr; ++ break; ++ } ++ ++ return ip_protocol; ++} ++ ++static bool rtl8125_skb_pad_with_len(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb_padto(skb, len)) ++ return false; ++ skb_put(skb, len - skb->len); ++ return true; ++} ++ ++static bool rtl8125_skb_pad(struct sk_buff *skb) ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ++ return rtl8125_skb_pad_with_len(skb, ETH_ZLEN); ++#else ++ return !eth_skb_pad(skb); ++#endif ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ++/* msdn_giant_send_check() ++ * According to the document of microsoft, the TCP Pseudo Header excludes the ++ * packet length for IPv6 TCP large packets. ++ */ ++static int msdn_giant_send_check(struct sk_buff *skb) ++{ ++ const struct ipv6hdr *ipv6h; ++ struct tcphdr *th; ++ int ret; ++ ++ ret = skb_cow_head(skb, 0); ++ if (ret) ++ return ret; ++ ++ ipv6h = ipv6_hdr(skb); ++ th = tcp_hdr(skb); ++ ++ th->check = 0; ++ th->check = ~tcp_v6_check(0, &ipv6h->saddr, &ipv6h->daddr, 0); ++ ++ return ret; ++} ++#endif ++ ++static bool rtl8125_require_pad_ptp_pkt(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2 ... CFG_METHOD_7: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++#define MIN_PATCH_LEN (47) ++static u32 ++rtl8125_get_patch_pad_len(struct rtl8125_private *tp, ++ struct sk_buff *skb) ++{ ++ u32 pad_len = 0; ++ int trans_data_len; ++ u32 hdr_len; ++ u32 pkt_len = skb->len; ++ u8 ip_protocol; ++ bool has_trans = skb_transport_header_was_set(skb); ++ ++ if (!rtl8125_require_pad_ptp_pkt(tp)) ++ goto no_padding; ++ ++ if (!(has_trans && (pkt_len < 175))) //128 + MIN_PATCH_LEN ++ goto no_padding; ++ ++ ip_protocol = rtl8125_get_l4_protocol(skb); ++ if (!(ip_protocol == IPPROTO_TCP || ip_protocol == IPPROTO_UDP)) ++ goto no_padding; ++ ++ trans_data_len = pkt_len - ++ (skb->transport_header - ++ skb_headroom(skb)); ++ if (ip_protocol == IPPROTO_UDP) { ++ if (trans_data_len > 3 && trans_data_len < MIN_PATCH_LEN) { ++ u16 dest_port = 0; ++ ++ skb_copy_bits(skb, skb->transport_header - skb_headroom(skb) + 2, &dest_port, 2); ++ dest_port = ntohs(dest_port); ++ ++ if (dest_port == 0x13f || ++ dest_port == 0x140) { ++ pad_len = MIN_PATCH_LEN - trans_data_len; ++ goto out; ++ } ++ } ++ } ++ ++ hdr_len = 0; ++ if (ip_protocol == IPPROTO_TCP) ++ hdr_len = 20; ++ else if (ip_protocol == IPPROTO_UDP) ++ hdr_len = 8; ++ if (trans_data_len < hdr_len) ++ pad_len = hdr_len - trans_data_len; ++ ++out: ++ if ((pkt_len + pad_len) < ETH_ZLEN) ++ pad_len = ETH_ZLEN - pkt_len; ++ ++ return pad_len; ++ ++no_padding: ++ ++ return 0; ++} ++ ++static bool ++rtl8125_tso_csum(struct sk_buff *skb, ++ struct net_device *dev, ++ u32 *opts, ++ unsigned int *bytecount, ++ unsigned short *gso_segs) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long large_send = 0; ++ u32 csum_cmd = 0; ++ u8 sw_calc_csum = false; ++ u8 check_patch_required = true; ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (dev->features & (NETIF_F_TSO | NETIF_F_TSO6)) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) ++ u32 mss = skb_shinfo(skb)->tso_size; ++#else ++ u32 mss = skb_shinfo(skb)->gso_size; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) ++ ++ /* TCP Segmentation Offload (or TCP Large Send) */ ++ if (mss) { ++ union { ++ struct iphdr *v4; ++ struct ipv6hdr *v6; ++ unsigned char *hdr; ++ } ip; ++ union { ++ struct tcphdr *tcp; ++ struct udphdr *udp; ++ unsigned char *hdr; ++ } l4; ++ u32 
l4_offset, hdr_len; ++ ++ ip.hdr = skb_network_header(skb); ++ l4.hdr = skb_checksum_start(skb); ++ ++ l4_offset = skb_transport_offset(skb); ++ assert((l4_offset%2) == 0); ++ switch (get_protocol(skb)) { ++ case __constant_htons(ETH_P_IP): ++ if (l4_offset <= GTTCPHO_MAX) { ++ opts[0] |= GiantSendv4; ++ opts[0] |= l4_offset << GTTCPHO_SHIFT; ++ opts[1] |= min(mss, MSS_MAX) << 18; ++ large_send = 1; ++ } ++ break; ++ case __constant_htons(ETH_P_IPV6): ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ++ if (msdn_giant_send_check(skb)) ++ return false; ++#endif ++ if (l4_offset <= GTTCPHO_MAX) { ++ opts[0] |= GiantSendv6; ++ opts[0] |= l4_offset << GTTCPHO_SHIFT; ++ opts[1] |= min(mss, MSS_MAX) << 18; ++ large_send = 1; ++ } ++ break; ++ default: ++ if (unlikely(net_ratelimit())) ++ dprintk("tso proto=%x!\n", skb->protocol); ++ break; ++ } ++ ++ if (large_send == 0) ++ return false; ++ ++ ++ /* compute length of segmentation header */ ++ hdr_len = (l4.tcp->doff * 4) + l4_offset; ++ /* update gso size and bytecount with header size */ ++ *gso_segs = skb_shinfo(skb)->gso_segs; ++ *bytecount += (*gso_segs - 1) * hdr_len; ++ ++ return true; ++ } ++ } ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ const struct iphdr *ip = skb->nh.iph; ++ ++ if (dev->features & NETIF_F_IP_CSUM) { ++ if (ip->protocol == IPPROTO_TCP) ++ csum_cmd = tp->tx_ip_csum_cmd | tp->tx_tcp_csum_cmd; ++ else if (ip->protocol == IPPROTO_UDP) ++ csum_cmd = tp->tx_ip_csum_cmd | tp->tx_udp_csum_cmd; ++ else if (ip->protocol == IPPROTO_IP) ++ csum_cmd = tp->tx_ip_csum_cmd; ++ } ++#else ++ u8 ip_protocol = IPPROTO_RAW; ++ ++ switch (get_protocol(skb)) { ++ case __constant_htons(ETH_P_IP): ++ if (dev->features & NETIF_F_IP_CSUM) { ++ ip_protocol = ip_hdr(skb)->protocol; ++ csum_cmd = tp->tx_ip_csum_cmd; ++ } ++ break; ++ case __constant_htons(ETH_P_IPV6): ++ if (dev->features & NETIF_F_IPV6_CSUM) { ++ if (skb_transport_offset(skb) > 0 && skb_transport_offset(skb) <= TCPHO_MAX) { ++ ip_protocol = ipv6_hdr(skb)->nexthdr; ++ csum_cmd = tp->tx_ipv6_csum_cmd; ++ csum_cmd |= skb_transport_offset(skb) << TCPHO_SHIFT; ++ } ++ } ++ break; ++ default: ++ if (unlikely(net_ratelimit())) ++ dprintk("checksum_partial proto=%x!\n", skb->protocol); ++ break; ++ } ++ ++ if (ip_protocol == IPPROTO_TCP) ++ csum_cmd |= tp->tx_tcp_csum_cmd; ++ else if (ip_protocol == IPPROTO_UDP) ++ csum_cmd |= tp->tx_udp_csum_cmd; ++#endif ++ if (csum_cmd == 0) { ++ sw_calc_csum = true; ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ WARN_ON(1); /* we need a WARN() */ ++#endif ++ } ++ ++ if (ip_protocol == IPPROTO_TCP) ++ check_patch_required = false; ++ } ++ ++ if (check_patch_required) { ++ u32 pad_len = rtl8125_get_patch_pad_len(tp, skb); ++ ++ if (pad_len > 0) { ++ if (!rtl8125_skb_pad_with_len(skb, skb->len + pad_len)) ++ return false; ++ ++ if (csum_cmd != 0) ++ sw_calc_csum = true; ++ } ++ } ++ ++ if (skb->len < ETH_ZLEN) { ++ if (tp->UseSwPaddingShortPkt || ++ (tp->ShortPacketSwChecksum && csum_cmd != 0)) { ++ if (!rtl8125_skb_pad(skb)) ++ return false; ++ ++ if (csum_cmd != 0) ++ sw_calc_csum = true; ++ } ++ } ++ ++ if (sw_calc_csum) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,7) ++ skb_checksum_help(&skb, 0); ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) ++ skb_checksum_help(skb, 0); ++#else ++ skb_checksum_help(skb); ++#endif ++ } else ++ 
opts[1] |= csum_cmd; ++ ++ return true; ++} ++ ++static bool rtl8125_tx_slots_avail(struct rtl8125_private *tp, ++ struct rtl8125_tx_ring *ring) ++{ ++ unsigned int slots_avail = READ_ONCE(ring->dirty_tx) + ring->num_tx_desc ++ - READ_ONCE(ring->cur_tx); ++ ++ /* A skbuff with nr_frags needs nr_frags+1 entries in the tx queue */ ++ return slots_avail > MAX_SKB_FRAGS; ++} ++ ++static inline u32 ++rtl8125_fast_mod_mask(const u32 input, const u32 mask) ++{ ++ return input > mask ? input & mask : input; ++} ++ ++static void rtl8125_doorbell(struct rtl8125_private *tp, ++ struct rtl8125_tx_ring *ring) ++{ ++ if (tp->EnableTxNoClose) { ++ if (tp->HwSuppTxNoCloseVer > 3) ++ RTL_W32(tp, ring->sw_tail_ptr_reg, ring->cur_tx); ++ else ++ RTL_W16(tp, ring->sw_tail_ptr_reg, ring->cur_tx); ++ } else ++ RTL_W16(tp, TPPOLL_8125, BIT(ring->index)); /* set polling bit */ ++} ++ ++static netdev_tx_t ++rtl8125_start_xmit(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned int bytecount; ++ unsigned short gso_segs; ++ struct ring_info *last; ++ unsigned int last_entry; ++ unsigned int entry; ++ struct TxDesc *txd; ++ dma_addr_t mapping; ++ u32 len; ++ u32 opts[2]; ++ netdev_tx_t ret = NETDEV_TX_OK; ++ int frags; ++ u8 EnableTxNoClose = tp->EnableTxNoClose; ++ const u16 queue_mapping = skb_get_queue_mapping(skb); ++ struct rtl8125_tx_ring *ring; ++ bool stop_queue; ++ ++ assert(queue_mapping < tp->num_tx_rings); ++ ++ ring = &tp->tx_ring[queue_mapping]; ++ ++ if (unlikely(!rtl8125_tx_slots_avail(tp, ring))) { ++ if (netif_msg_drv(tp)) { ++ printk(KERN_ERR ++ "%s: BUG! Tx Ring[%d] full when queue awake!\n", ++ dev->name, ++ queue_mapping); ++ } ++ goto err_stop; ++ } ++ ++ entry = ring->cur_tx % ring->num_tx_desc; ++ txd = ring->TxDescArray + entry; ++ ++ if (!EnableTxNoClose) { ++ if (unlikely(le32_to_cpu(txd->opts1) & DescOwn)) { ++ if (netif_msg_drv(tp)) { ++ printk(KERN_ERR ++ "%s: BUG! 
Tx Desc is own by hardware!\n", ++ dev->name); ++ } ++ goto err_stop; ++ } ++ } ++ ++ bytecount = skb->len; ++ gso_segs = 1; ++ ++ opts[0] = DescOwn; ++ opts[1] = rtl8125_tx_vlan_tag(tp, skb); ++ ++ if (unlikely(!rtl8125_tso_csum(skb, dev, opts, &bytecount, &gso_segs))) ++ goto err_dma_0; ++ ++ frags = rtl8125_xmit_frags(tp, ring, skb, opts); ++ if (unlikely(frags < 0)) ++ goto err_dma_0; ++ if (frags) { ++ len = skb_headlen(skb); ++ opts[0] |= FirstFrag; ++ } else { ++ len = skb->len; ++ opts[0] |= FirstFrag | LastFrag; ++ } ++ ++ opts[0] = rtl8125_get_txd_opts1(ring, opts[0], len, entry); ++ mapping = dma_map_single(tp_to_dev(tp), skb->data, len, DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(tp_to_dev(tp), mapping))) { ++ if (unlikely(net_ratelimit())) ++ netif_err(tp, drv, dev, "Failed to map TX DMA!\n"); ++ goto err_dma_1; ++ } ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { ++ if (!test_and_set_bit_lock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state)) { ++ if (tp->hwtstamp_config.tx_type == HWTSTAMP_TX_ON && ++ !tp->ptp_tx_skb) { ++ skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; ++ ++ tp->ptp_tx_skb = skb_get(skb); ++ tp->ptp_tx_start = jiffies; ++ schedule_work(&tp->ptp_tx_work); ++ } else ++ tp->tx_hwtstamp_skipped++; ++ } ++ } ++#endif ++ /* set first fragment's length */ ++ ring->tx_skb[entry].len = len; ++ ++ /* set skb to last fragment */ ++ last_entry = (entry + frags) % ring->num_tx_desc; ++ last = &ring->tx_skb[last_entry]; ++ last->skb = skb; ++ last->gso_segs = gso_segs; ++ last->bytecount = bytecount; ++ ++ txd->addr = cpu_to_le64(mapping); ++ txd->opts2 = cpu_to_le32(opts[1]); ++ wmb(); ++ txd->opts1 = cpu_to_le32(opts[0]); ++ ++ netdev_tx_sent_queue(txring_txq(ring), bytecount); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ++ dev->trans_start = jiffies; ++#else ++ skb_tx_timestamp(skb); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ++ ++ /* rtl_tx needs to see descriptor changes before updated tp->cur_tx */ ++ smp_wmb(); ++ ++ WRITE_ONCE(ring->cur_tx, ring->cur_tx + frags + 1); ++ ++ stop_queue = !rtl8125_tx_slots_avail(tp, ring); ++ if (unlikely(stop_queue)) { ++ /* Avoid wrongly optimistic queue wake-up: rtl_tx thread must ++ * not miss a ring update when it notices a stopped queue. ++ */ ++ smp_wmb(); ++ netif_stop_subqueue(dev, queue_mapping); ++ } ++ ++ if (netif_xmit_stopped(txring_txq(ring)) || !netdev_xmit_more()) ++ rtl8125_doorbell(tp, ring); ++ ++ if (unlikely(stop_queue)) { ++ /* Sync with rtl_tx: ++ * - publish queue status and cur_tx ring index (write barrier) ++ * - refresh dirty_tx ring index (read barrier). ++ * May the current thread have a pessimistic view of the ring ++ * status and forget to wake up queue, a racing rtl_tx thread ++ * can't. 
++ */ ++ smp_mb(); ++ if (rtl8125_tx_slots_avail(tp, ring)) ++ netif_start_subqueue(dev, queue_mapping); ++ } ++out: ++ return ret; ++err_dma_1: ++ rtl8125_tx_clear_range(tp, ring, ring->cur_tx + 1, frags); ++err_dma_0: ++ RTLDEV->stats.tx_dropped++; ++ dev_kfree_skb_any(skb); ++ ret = NETDEV_TX_OK; ++ goto out; ++err_stop: ++ netif_stop_subqueue(dev, queue_mapping); ++ ret = NETDEV_TX_BUSY; ++ RTLDEV->stats.tx_dropped++; ++ goto out; ++} ++ ++/* recycle tx no close desc*/ ++static int ++rtl8125_tx_interrupt_noclose(struct rtl8125_tx_ring *ring, int budget) ++{ ++ unsigned int total_bytes = 0, total_packets = 0; ++ struct rtl8125_private *tp = ring->priv; ++ struct net_device *dev = tp->dev; ++ unsigned int dirty_tx, tx_left; ++ unsigned int tx_desc_closed; ++ unsigned int count = 0; ++ ++ dirty_tx = ring->dirty_tx; ++ ring->NextHwDesCloPtr = rtl8125_get_hw_clo_ptr(ring); ++ tx_desc_closed = rtl8125_fast_mod_mask(ring->NextHwDesCloPtr - ++ ring->BeginHwDesCloPtr, ++ tp->MaxTxDescPtrMask); ++ tx_left = min((READ_ONCE(ring->cur_tx) - dirty_tx), tx_desc_closed); ++ ring->BeginHwDesCloPtr += tx_left; ++ ++ while (tx_left > 0) { ++ unsigned int entry = dirty_tx % ring->num_tx_desc; ++ struct ring_info *tx_skb = ring->tx_skb + entry; ++ ++ rtl8125_unmap_tx_skb(tp->pci_dev, ++ tx_skb, ++ ring->TxDescArray + entry); ++ ++ if (tx_skb->skb != NULL) { ++ /* update the statistics for this packet */ ++ total_bytes += tx_skb->bytecount; ++ total_packets += tx_skb->gso_segs; ++ ++ RTL_NAPI_CONSUME_SKB_ANY(tx_skb->skb, budget); ++ tx_skb->skb = NULL; ++ } ++ dirty_tx++; ++ tx_left--; ++ } ++ ++ if (total_packets) { ++ netdev_tx_completed_queue(txring_txq(ring), ++ total_packets, total_bytes); ++ ++ RTLDEV->stats.tx_bytes += total_bytes; ++ RTLDEV->stats.tx_packets+= total_packets; ++ } ++ ++ if (ring->dirty_tx != dirty_tx) { ++ count = dirty_tx - ring->dirty_tx; ++ WRITE_ONCE(ring->dirty_tx, dirty_tx); ++ smp_wmb(); ++ if (__netif_subqueue_stopped(dev, ring->index) && ++ rtl8125_tx_slots_avail(tp, ring) && netif_carrier_ok(dev)) { ++ netif_start_subqueue(dev, ring->index); ++ } ++ } ++ ++ return count; ++} ++ ++/* recycle tx close desc*/ ++static int ++rtl8125_tx_interrupt_close(struct rtl8125_tx_ring *ring, int budget) ++{ ++ unsigned int total_bytes = 0, total_packets = 0; ++ struct rtl8125_private *tp = ring->priv; ++ struct net_device *dev = tp->dev; ++ unsigned int dirty_tx, tx_left; ++ unsigned int count = 0; ++ ++ dirty_tx = ring->dirty_tx; ++ tx_left = READ_ONCE(ring->cur_tx) - dirty_tx; ++ ++ while (tx_left > 0) { ++ unsigned int entry = dirty_tx % ring->num_tx_desc; ++ struct ring_info *tx_skb = ring->tx_skb + entry; ++ ++ if (le32_to_cpu(READ_ONCE(ring->TxDescArray[entry].opts1)) & DescOwn) ++ break; ++ ++ rtl8125_unmap_tx_skb(tp->pci_dev, ++ tx_skb, ++ ring->TxDescArray + entry); ++ ++ if (tx_skb->skb != NULL) { ++ /* update the statistics for this packet */ ++ total_bytes += tx_skb->bytecount; ++ total_packets += tx_skb->gso_segs; ++ ++ RTL_NAPI_CONSUME_SKB_ANY(tx_skb->skb, budget); ++ tx_skb->skb = NULL; ++ } ++ dirty_tx++; ++ tx_left--; ++ } ++ ++ if (total_packets) { ++ netdev_tx_completed_queue(txring_txq(ring), ++ total_packets, total_bytes); ++ ++ RTLDEV->stats.tx_bytes += total_bytes; ++ RTLDEV->stats.tx_packets+= total_packets; ++ } ++ ++ if (ring->dirty_tx != dirty_tx) { ++ count = dirty_tx - ring->dirty_tx; ++ WRITE_ONCE(ring->dirty_tx, dirty_tx); ++ smp_wmb(); ++ if (__netif_subqueue_stopped(dev, ring->index) && ++ rtl8125_tx_slots_avail(tp, ring) && netif_carrier_ok(dev)) { ++ 
netif_start_subqueue(dev, ring->index); ++ } ++ ++ if (READ_ONCE(ring->cur_tx) != dirty_tx) ++ rtl8125_doorbell(tp, ring); ++ } ++ ++ return count; ++} ++ ++static int ++rtl8125_tx_interrupt(struct rtl8125_tx_ring *ring, int budget) ++{ ++ struct rtl8125_private *tp = ring->priv; ++ ++ if (tp->EnableTxNoClose) ++ return rtl8125_tx_interrupt_noclose(ring, budget); ++ else ++ return rtl8125_tx_interrupt_close(ring, budget); ++} ++ ++static int ++rtl8125_tx_interrupt_with_vector(struct rtl8125_private *tp, ++ const int message_id, ++ int budget) ++{ ++ int count = 0; ++ ++ switch (tp->HwCurrIsrVer) { ++ case 3: ++ case 4: ++ if (message_id < tp->num_tx_rings) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[message_id], budget); ++ break; ++ case 5: ++ if (message_id == 16) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[0], budget); ++#ifdef ENABLE_MULTIPLE_TX_QUEUE ++ else if (message_id == 17 && tp->num_tx_rings > 1) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[1], budget); ++#endif ++ break; ++ case 7: ++ if (message_id == 27) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[0], budget); ++#ifdef ENABLE_MULTIPLE_TX_QUEUE ++ else if (message_id == 28 && tp->num_tx_rings > 1) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[1], budget); ++#endif ++ break; ++ default: ++ if (message_id == 16) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[0], budget); ++#ifdef ENABLE_MULTIPLE_TX_QUEUE ++ else if (message_id == 18 && tp->num_tx_rings > 1) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[1], budget); ++#endif ++ break; ++ } ++ ++ return count; ++} ++ ++static inline int ++rtl8125_fragmented_frame(struct rtl8125_private *tp, u32 status) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ return (status & (FirstFrag_V3 | LastFrag_V3)) != (FirstFrag_V3 | LastFrag_V3); ++ case RX_DESC_RING_TYPE_4: ++ return (status & (FirstFrag_V4 | LastFrag_V4)) != (FirstFrag_V4 | LastFrag_V4); ++ default: ++ return (status & (FirstFrag | LastFrag)) != (FirstFrag | LastFrag); ++ } ++} ++ ++static inline int ++rtl8125_is_non_eop(struct rtl8125_private *tp, u32 status) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ return !(status & LastFrag_V3); ++ case RX_DESC_RING_TYPE_4: ++ return !(status & LastFrag_V4); ++ default: ++ return !(status & LastFrag); ++ } ++} ++ ++static inline int ++rtl8125_rx_desc_type(u32 status) ++{ ++ return ((status >> 26) & 0x0F); ++} ++ ++static inline void ++rtl8125_rx_v1_csum(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ struct RxDesc *desc) ++{ ++ u32 opts1 = le32_to_cpu(desc->opts1); ++ ++ if (((opts1 & RxTCPT) && !(opts1 & RxTCPF)) || ++ ((opts1 & RxUDPT) && !(opts1 & RxUDPF))) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++} ++ ++static inline void ++rtl8125_rx_v3_csum(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ struct RxDescV3 *descv3) ++{ ++ u32 opts2 = le32_to_cpu(descv3->RxDescNormalDDWord4.opts2); ++ ++ /* rx csum offload for RTL8125 */ ++ if (((opts2 & RxTCPT_v3) && !(opts2 & RxTCPF_v3)) || ++ ((opts2 & RxUDPT_v3) && !(opts2 & RxUDPF_v3))) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++} ++ ++static inline void ++rtl8125_rx_v4_csum(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ struct RxDescV4 *descv4) ++{ ++ u32 opts1 = le32_to_cpu(descv4->RxDescNormalDDWord2.opts1); ++ ++ /* rx csum offload for RTL8125 */ ++ if (((opts1 & RxTCPT_v4) && !(opts1 & RxTCPF_v4)) || ++ ((opts1 & RxUDPT_v4) && !(opts1 & RxUDPF_v4))) ++ skb->ip_summed = 
CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++} ++ ++static inline void ++rtl8125_rx_csum(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ rtl8125_rx_v3_csum(tp, skb, (struct RxDescV3 *)desc); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_rx_v4_csum(tp, skb, (struct RxDescV4 *)desc); ++ break; ++ default: ++ rtl8125_rx_v1_csum(tp, skb, desc); ++ break; ++ } ++} ++ ++/* ++static inline int ++rtl8125_try_rx_copy(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct sk_buff **sk_buff, ++ int pkt_size, ++ struct RxDesc *desc, ++ int rx_buf_sz) ++{ ++ int ret = -1; ++ ++ struct sk_buff *skb; ++ ++ skb = RTL_ALLOC_SKB_INTR(&tp->r8125napi[ring->index].napi, pkt_size + R8125_RX_ALIGN); ++ if (skb) { ++ u8 *data; ++ ++ data = sk_buff[0]->data; ++ if (!R8125_USE_NAPI_ALLOC_SKB) ++ skb_reserve(skb, R8125_RX_ALIGN); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37) ++ prefetch(data - R8125_RX_ALIGN); ++#endif ++ eth_copy_and_sum(skb, data, pkt_size, 0); ++ *sk_buff = skb; ++ rtl8125_mark_to_asic(tp, desc, rx_buf_sz); ++ ret = 0; ++ } ++ ++ return ret; ++} ++*/ ++ ++static inline void ++rtl8125_rx_skb(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ u32 ring_index) ++{ ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++ netif_receive_skb(skb); ++#else ++ napi_gro_receive(&tp->r8125napi[ring_index].napi, skb); ++#endif ++#else ++ netif_rx(skb); ++#endif ++} ++ ++static int ++rtl8125_check_rx_desc_error(struct net_device *dev, ++ struct rtl8125_private *tp, ++ u32 status) ++{ ++ int ret = 0; ++ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ if (unlikely(status & RxRES_V3)) { ++ if (status & (RxRWT_V3 | RxRUNT_V3)) ++ RTLDEV->stats.rx_length_errors++; ++ if (status & RxCRC_V3) ++ RTLDEV->stats.rx_crc_errors++; ++ ++ ret = -1; ++ } ++ break; ++ case RX_DESC_RING_TYPE_4: ++ if (unlikely(status & RxRES_V4)) { ++ if (status & RxRUNT_V4) ++ RTLDEV->stats.rx_length_errors++; ++ if (status & RxCRC_V4) ++ RTLDEV->stats.rx_crc_errors++; ++ ++ ret = -1; ++ } ++ break; ++ default: ++ if (unlikely(status & RxRES)) { ++ if (status & (RxRWT | RxRUNT)) ++ RTLDEV->stats.rx_length_errors++; ++ if (status & RxCRC) ++ RTLDEV->stats.rx_crc_errors++; ++ ++ ret = -1; ++ } ++ break; ++ } ++ ++ return ret; ++} ++ ++#ifdef ENABLE_PAGE_REUSE ++ ++static inline bool ++rtl8125_reuse_rx_ok(struct page *page) ++{ ++ /* avoid re-using remote pages */ ++ if (!dev_page_is_reusable(page)) { ++ //printk(KERN_INFO "r8125 page pfmemalloc, can't reuse!\n"); ++ return false; ++ } ++ /* if we are only owner of page we can reuse it */ ++ if (unlikely(page_ref_count(page) != 1)) { ++ //printk(KERN_INFO "r8125 page refcnt %d, can't reuse!\n", page_ref_count(page)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void ++rtl8125_reuse_rx_buffer(struct rtl8125_private *tp, struct rtl8125_rx_ring *ring, u32 cur_rx, struct rtl8125_rx_buffer *rxb) ++{ ++ struct page *page = rxb->page; ++ ++ u32 dirty_rx = ring->dirty_rx; ++ u32 entry = dirty_rx % ring->num_rx_desc; ++ struct rtl8125_rx_buffer *nrxb = &ring->rx_buffer[entry]; ++ ++ u32 noffset; ++ ++ //the page gonna be shared by us and kernel, keep page ref = 2 ++ page_ref_inc(page); ++ ++ //flip the buffer in page to use next ++ noffset = rxb->page_offset ^ (tp->rx_buf_page_size / 2); //one page, two buffer, ping-pong ++ ++ nrxb->dma = rxb->dma; ++ nrxb->page_offset = noffset; ++ nrxb->data = rxb->data; ++ 
++ if (cur_rx != dirty_rx) { ++ //move the buffer to other slot ++ nrxb->page = page; ++ rxb->page = NULL; ++ } ++} ++ ++static void rtl8125_put_rx_buffer(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ u32 cur_rx, ++ struct rtl8125_rx_buffer *rxb) ++{ ++ struct rtl8125_rx_buffer *nrxb; ++ struct page *page = rxb->page; ++ u32 entry; ++ ++ entry = ring->dirty_rx % ring->num_rx_desc; ++ nrxb = &ring->rx_buffer[entry]; ++ if (likely(rtl8125_reuse_rx_ok(page))) { ++ /* hand second half of page back to the ring */ ++ rtl8125_reuse_rx_buffer(tp, ring, cur_rx, rxb); ++ } else { ++ tp->page_reuse_fail_cnt++; ++ ++ dma_unmap_page_attrs(&tp->pci_dev->dev, rxb->dma, ++ tp->rx_buf_page_size, ++ DMA_FROM_DEVICE, ++ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); ++ //the page ref is kept 1, uniquely owned by kernel now ++ rxb->page = NULL; ++ ++ return; ++ } ++ ++ dma_sync_single_range_for_device(tp_to_dev(tp), ++ nrxb->dma, ++ nrxb->page_offset, ++ tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++ ++ rtl8125_map_to_asic(tp, ring, ++ rtl8125_get_rxdesc(tp, ring->RxDescArray, entry), ++ nrxb->dma + nrxb->page_offset, ++ tp->rx_buf_sz, entry); ++ ++ ring->dirty_rx++; ++} ++ ++#endif //ENABLE_PAGE_REUSE ++ ++static int ++rtl8125_rx_interrupt(struct net_device *dev, ++ struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ napi_budget budget) ++{ ++ unsigned int cur_rx, rx_left; ++ unsigned int delta, count = 0; ++ unsigned int entry; ++ struct RxDesc *desc; ++ struct sk_buff *skb; ++ u32 status; ++ u32 rx_quota; ++ u32 ring_index = ring->index; ++#ifdef ENABLE_PAGE_REUSE ++ struct rtl8125_rx_buffer *rxb; ++#else //ENABLE_PAGE_REUSE ++ u64 rx_buf_phy_addr; ++#endif //ENABLE_PAGE_REUSE ++ unsigned int total_rx_multicast_packets = 0; ++ unsigned int total_rx_bytes = 0, total_rx_packets = 0; ++ ++ assert(dev != NULL); ++ assert(tp != NULL); ++ ++ if (ring->RxDescArray == NULL) ++ goto rx_out; ++ ++ rx_quota = RTL_RX_QUOTA(budget); ++ cur_rx = ring->cur_rx; ++ rx_left = ring->num_rx_desc + ring->dirty_rx - cur_rx; ++ rx_left = rtl8125_rx_quota(rx_left, (u32)rx_quota); ++ ++ for (; rx_left > 0; rx_left--, cur_rx++) { ++#ifdef ENABLE_PTP_SUPPORT ++ u8 desc_type = RXDESC_TYPE_NORMAL; ++ struct RxDescV3 ptp_desc; ++#endif //ENABLE_PTP_SUPPORT ++#ifndef ENABLE_PAGE_REUSE ++ const void *rx_buf; ++#endif //!ENABLE_PAGE_REUSE ++ u32 pkt_size; ++ ++ entry = cur_rx % ring->num_rx_desc; ++ desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry); ++ status = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc)); ++ if (status & DescOwn) { ++ RTL_R8(tp, tp->imr_reg[0]); ++ status = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc)); ++ if (status & DescOwn) ++ break; ++ } ++ ++ rmb(); ++ ++ if (unlikely(rtl8125_check_rx_desc_error(dev, tp, status) < 0)) { ++ if (netif_msg_rx_err(tp)) { ++ printk(KERN_INFO ++ "%s: Rx ERROR. 
status = %08x\n", ++ dev->name, status); ++ } ++ ++ RTLDEV->stats.rx_errors++; ++ ++ if (!(dev->features & NETIF_F_RXALL)) ++ goto release_descriptor; ++ } ++ pkt_size = status & 0x00003fff; ++ if (likely(!(dev->features & NETIF_F_RXFCS))) { ++#ifdef ENABLE_RX_PACKET_FRAGMENT ++ if (rtl8125_is_non_eop(tp, status) && ++ pkt_size == tp->rx_buf_sz) { ++ struct RxDesc *desc_next; ++ unsigned int entry_next; ++ int pkt_size_next; ++ u32 status_next; ++ ++ entry_next = (cur_rx + 1) % ring->num_rx_desc; ++ desc_next = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry_next); ++ status_next = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc_next)); ++ if (!(status_next & DescOwn)) { ++ pkt_size_next = status_next & 0x00003fff; ++ if (pkt_size_next < ETH_FCS_LEN) ++ pkt_size -= (ETH_FCS_LEN - pkt_size_next); ++ } ++ } ++#endif //ENABLE_RX_PACKET_FRAGMENT ++ if (!rtl8125_is_non_eop(tp, status)) { ++ if (pkt_size < ETH_FCS_LEN) { ++#ifdef ENABLE_RX_PACKET_FRAGMENT ++ pkt_size = 0; ++#else ++ goto drop_packet; ++#endif //ENABLE_RX_PACKET_FRAGMENT ++ } else ++ pkt_size -= ETH_FCS_LEN; ++ } ++ } ++ ++ if (unlikely(pkt_size > tp->rx_buf_sz)) ++ goto drop_packet; ++ ++#if !defined(ENABLE_RX_PACKET_FRAGMENT) || !defined(ENABLE_PAGE_REUSE) ++ /* ++ * The driver does not support incoming fragmented ++ * frames. They are seen as a symptom of over-mtu ++ * sized frames. ++ */ ++ if (unlikely(rtl8125_fragmented_frame(tp, status))) ++ goto drop_packet; ++#endif //!ENABLE_RX_PACKET_FRAGMENT || !ENABLE_PAGE_REUSE ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->HwSuppPtpVer == 1) { ++ desc_type = rtl8125_rx_desc_type(status); ++ if (desc_type == RXDESC_TYPE_NEXT && rx_left > 0) { ++ u32 status_next; ++ struct RxDescV3 *desc_next; ++ unsigned int entry_next; ++ ++ cur_rx++; ++ rx_left--; ++ entry_next = cur_rx % ring->num_rx_desc; ++ desc_next = (struct RxDescV3 *)rtl8125_get_rxdesc(tp, ring->RxDescArray, entry_next); ++ status_next = le32_to_cpu(desc_next->RxDescNormalDDWord4.opts1); ++ if (unlikely(status_next & DescOwn)) { ++ udelay(1); ++ status_next = le32_to_cpu(desc_next->RxDescNormalDDWord4.opts1); ++ if (unlikely(status_next & DescOwn)) { ++ if (netif_msg_rx_err(tp)) { ++ printk(KERN_ERR ++ "%s: Rx Next Desc ERROR. 
status = %08x\n", ++ dev->name, status_next); ++ } ++ rtl8125_set_desc_dma_addr(tp, (struct RxDesc *)desc_next, ++ ring->RxDescPhyAddr[entry_next]); ++ wmb(); ++ rtl8125_mark_to_asic(tp, (struct RxDesc *)desc_next, tp->rx_buf_sz); ++ goto drop_packet; ++ } ++ } ++ ++ rmb(); ++ ++ desc_type = rtl8125_rx_desc_type(status_next); ++ if (desc_type == RXDESC_TYPE_PTP) { ++ ptp_desc = *desc_next; ++ rmb(); ++ rtl8125_set_desc_dma_addr(tp, (struct RxDesc *)desc_next, ++ ring->RxDescPhyAddr[entry_next]); ++ wmb(); ++ rtl8125_mark_to_asic(tp, (struct RxDesc *)desc_next, tp->rx_buf_sz); ++ } else { ++ WARN_ON(1); ++ rtl8125_set_desc_dma_addr(tp, (struct RxDesc *)desc_next, ++ ring->RxDescPhyAddr[entry_next]); ++ wmb(); ++ rtl8125_mark_to_asic(tp, (struct RxDesc *)desc_next, tp->rx_buf_sz); ++ goto drop_packet; ++ } ++ } else ++ WARN_ON(desc_type != RXDESC_TYPE_NORMAL); ++ } ++#endif ++#ifdef ENABLE_PAGE_REUSE ++ rxb = &ring->rx_buffer[entry]; ++ skb = rxb->skb; ++ rxb->skb = NULL; ++ if (!skb) { ++ skb = RTL_BUILD_SKB_INTR(rxb->data + rxb->page_offset - ring->rx_offset, tp->rx_buf_page_size / 2); ++ if (!skb) { ++ //netdev_err(tp->dev, "Failed to allocate RX skb!\n"); ++ goto drop_packet; ++ } ++ ++ skb->dev = dev; ++ if (!R8125_USE_NAPI_ALLOC_SKB) ++ skb_reserve(skb, R8125_RX_ALIGN); ++ skb_put(skb, pkt_size); ++#ifdef ENABLE_RSS_SUPPORT ++ rtl8125_rx_hash(tp, desc, skb); ++#endif ++ rtl8125_rx_csum(tp, skb, desc); ++ } else ++ skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rxb->page, ++ rxb->page_offset, pkt_size, tp->rx_buf_page_size / 2); ++ ++ //recycle desc ++ rtl8125_put_rx_buffer(tp, ring, cur_rx, rxb); ++ ++ dma_sync_single_range_for_cpu(tp_to_dev(tp), ++ rxb->dma, ++ rxb->page_offset, ++ tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++#else //ENABLE_PAGE_REUSE ++ skb = RTL_ALLOC_SKB_INTR(&tp->r8125napi[ring->index].napi, pkt_size + R8125_RX_ALIGN); ++ if (!skb) { ++ //netdev_err(tp->dev, "Failed to allocate RX skb!\n"); ++ goto drop_packet; ++ } ++ ++ skb->dev = dev; ++ if (!R8125_USE_NAPI_ALLOC_SKB) ++ skb_reserve(skb, R8125_RX_ALIGN); ++ skb_put(skb, pkt_size); ++ ++ rx_buf_phy_addr = ring->RxDescPhyAddr[entry]; ++ dma_sync_single_for_cpu(tp_to_dev(tp), ++ rx_buf_phy_addr, tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++ rx_buf = ring->Rx_skbuff[entry]->data; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37) ++ prefetch(rx_buf - R8125_RX_ALIGN); ++#endif ++ eth_copy_and_sum(skb, rx_buf, pkt_size, 0); ++ ++ dma_sync_single_for_device(tp_to_dev(tp), rx_buf_phy_addr, ++ tp->rx_buf_sz, DMA_FROM_DEVICE); ++#endif //ENABLE_PAGE_REUSE ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->HwSuppPtpVer == 1 && desc_type == RXDESC_TYPE_PTP) ++ rtl8125_rx_mac_ptp_pktstamp(tp, skb, &ptp_desc); ++ else if (tp->HwSuppPtpVer == 3 && (tp->flags & RTL_FLAG_RX_HWTSTAMP_ENABLED)) ++ rtl8125_rx_phy_ptp_timestamp(tp, skb); ++#endif // ENABLE_PTP_SUPPORT ++ ++#ifdef ENABLE_RX_PACKET_FRAGMENT ++ if (rtl8125_is_non_eop(tp, status)) { ++ unsigned int entry_next; ++ entry_next = (entry + 1) % ring->num_rx_desc; ++ rxb = &ring->rx_buffer[entry_next]; ++ rxb->skb = skb; ++ continue; ++ } ++#endif //ENABLE_RX_PACKET_FRAGMENT ++ ++#ifndef ENABLE_PAGE_REUSE ++#ifdef ENABLE_RSS_SUPPORT ++ rtl8125_rx_hash(tp, desc, skb); ++#endif ++ rtl8125_rx_csum(tp, skb, desc); ++#endif /* !ENABLE_PAGE_REUSE */ ++ ++ skb->protocol = eth_type_trans(skb, dev); ++ ++ total_rx_bytes += skb->len; ++ ++ if (skb->pkt_type == PACKET_MULTICAST) ++ total_rx_multicast_packets++; ++ ++ if (rtl8125_rx_vlan_skb(tp, desc, skb) < 0) ++ rtl8125_rx_skb(tp, skb, ring_index); ++ 
++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) ++ dev->last_rx = jiffies; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) ++ total_rx_packets++; ++ ++#ifdef ENABLE_PAGE_REUSE ++ rxb->skb = NULL; ++ continue; ++#endif ++ ++release_descriptor: ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_set_desc_dma_addr(tp, desc, ++ ring->RxDescPhyAddr[entry]); ++ wmb(); ++ break; ++ } ++ rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz); ++ continue; ++drop_packet: ++ RTLDEV->stats.rx_dropped++; ++ RTLDEV->stats.rx_length_errors++; ++ goto release_descriptor; ++ } ++ ++ count = cur_rx - ring->cur_rx; ++ ring->cur_rx = cur_rx; ++ ++ delta = rtl8125_rx_fill(tp, ring, dev, ring->dirty_rx, ring->cur_rx, 1); ++ if (!delta && count && netif_msg_intr(tp)) ++ printk(KERN_INFO "%s: no Rx buffer allocated\n", dev->name); ++ ring->dirty_rx += delta; ++ ++ RTLDEV->stats.rx_bytes += total_rx_bytes; ++ RTLDEV->stats.rx_packets += total_rx_packets; ++ RTLDEV->stats.multicast += total_rx_multicast_packets; ++ ++ /* ++ * FIXME: until there is periodic timer to try and refill the ring, ++ * a temporary shortage may definitely kill the Rx process. ++ * - disable the asic to try and avoid an overflow and kick it again ++ * after refill ? ++ * - how do others driver handle this condition (Uh oh...). ++ */ ++ if ((ring->dirty_rx + ring->num_rx_desc == ring->cur_rx) && netif_msg_intr(tp)) ++ printk(KERN_EMERG "%s: Rx buffers exhausted\n", dev->name); ++ ++rx_out: ++ return total_rx_packets; ++} ++ ++static bool ++rtl8125_linkchg_interrupt(struct rtl8125_private *tp, u32 status) ++{ ++ switch (tp->HwCurrIsrVer) { ++ case 2: ++ case 3: ++ return status & ISRIMR_V2_LINKCHG; ++ case 4: ++ return status & ISRIMR_V4_LINKCHG; ++ case 5: ++ return status & ISRIMR_V5_LINKCHG; ++ case 7: ++ return status & ISRIMR_V7_LINKCHG; ++ default: ++ return status & LinkChg; ++ } ++} ++ ++static u32 ++rtl8125_get_linkchg_message_id(struct rtl8125_private *tp) ++{ ++ switch (tp->HwCurrIsrVer) { ++ case 4: ++ case 7: ++ return 29; ++ case 5: ++ return 18; ++ default: ++ return 21; ++ } ++} ++ ++/* ++ *The interrupt handler does all of the Rx thread work and cleans up after ++ *the Tx thread. 
++ */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++static irqreturn_t rtl8125_interrupt(int irq, void *dev_instance, struct pt_regs *regs) ++#else ++static irqreturn_t rtl8125_interrupt(int irq, void *dev_instance) ++#endif ++{ ++ struct r8125_napi *r8125napi = dev_instance; ++ struct rtl8125_private *tp = r8125napi->priv; ++ struct net_device *dev = tp->dev; ++ u32 status; ++ int handled = 0; ++ ++ do { ++ status = RTL_R32(tp, tp->isr_reg[0]); ++ ++ if (!(tp->features & (RTL_FEATURE_MSI | RTL_FEATURE_MSIX))) { ++ /* hotplug/major error/no more work/shared irq */ ++ if (!status) ++ break; ++ ++ if (status == 0xFFFFFFFF) ++ break; ++ ++ if (!(status & (tp->intr_mask | tp->timer_intr_mask))) ++ break; ++ } ++ ++ handled = 1; ++ ++#if defined(RTL_USE_NEW_INTR_API) ++ if (!tp->irq_tbl[0].requested) ++ break; ++#endif ++ rtl8125_disable_hw_interrupt(tp); ++ ++ RTL_W32(tp, tp->isr_reg[0], status&~RxFIFOOver); ++ ++ if (rtl8125_linkchg_interrupt(tp, status)) ++ rtl8125_schedule_linkchg_work(tp); ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if ((status & ISRIMR_V4_LAYER2_INTR_STS) && ++ rtl8125_check_dash_interrupt(tp)) ++ rtl8125_schedule_dash_work(tp); ++#endif ++ ++#ifdef CONFIG_R8125_NAPI ++ if (status & tp->intr_mask || tp->keep_intr_cnt-- > 0) { ++ if (status & tp->intr_mask) ++ tp->keep_intr_cnt = RTK_KEEP_INTERRUPT_COUNT; ++ ++ if (likely(RTL_NETIF_RX_SCHEDULE_PREP(dev, &tp->r8125napi[0].napi))) ++ __RTL_NETIF_RX_SCHEDULE(dev, &tp->r8125napi[0].napi); ++ else if (netif_msg_intr(tp)) ++ printk(KERN_INFO "%s: interrupt %04x in poll\n", ++ dev->name, status); ++ } else { ++ tp->keep_intr_cnt = RTK_KEEP_INTERRUPT_COUNT; ++ rtl8125_switch_to_hw_interrupt(tp); ++ } ++#else ++ if (status & tp->intr_mask || tp->keep_intr_cnt-- > 0) { ++ u32 budget = ~(u32)0; ++ int i; ++ ++ if (status & tp->intr_mask) ++ tp->keep_intr_cnt = RTK_KEEP_INTERRUPT_COUNT; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) ++ rtl8125_tx_interrupt(&tp->tx_ring[i], ~(u32)0); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[0], &budget); ++#else ++ rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[0], budget); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if ((status & ISRIMR_V4_LAYER2_INTR_STS) && ++ rtl8125_check_dash_interrupt(tp)) ++ rtl8125_schedule_dash_work(tp); ++#endif ++ ++ rtl8125_switch_to_timer_interrupt(tp); ++ } else { ++ tp->keep_intr_cnt = RTK_KEEP_INTERRUPT_COUNT; ++ rtl8125_switch_to_hw_interrupt(tp); ++ } ++#endif ++ } while (false); ++ ++ return IRQ_RETVAL(handled); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++static irqreturn_t rtl8125_interrupt_msix(int irq, void *dev_instance, struct pt_regs *regs) ++#else ++static irqreturn_t rtl8125_interrupt_msix(int irq, void *dev_instance) ++#endif ++{ ++ struct r8125_napi *r8125napi = dev_instance; ++ struct rtl8125_private *tp = r8125napi->priv; ++ struct net_device *dev = tp->dev; ++ int message_id = r8125napi->index; ++#ifndef CONFIG_R8125_NAPI ++ u32 budget = ~(u32)0; ++#endif ++ ++ do { ++#if defined(RTL_USE_NEW_INTR_API) ++ if (!tp->irq_tbl[message_id].requested) ++ break; ++#endif ++ //link change ++ if (message_id == rtl8125_get_linkchg_message_id(tp)) { ++ rtl8125_disable_hw_interrupt_v2(tp, message_id); ++ rtl8125_clear_hw_isr_v2(tp, message_id); ++ rtl8125_schedule_linkchg_work(tp); ++ break; ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if (message_id == 31) { ++ if (rtl8125_check_dash_interrupt(tp)) ++ rtl8125_disable_hw_interrupt_v2(tp, message_id); ++ 
rtl8125_clear_hw_isr_v2(tp, message_id); ++ rtl8125_schedule_dash_work(tp); ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ break; ++ } ++#endif ++ ++#ifdef CONFIG_R8125_NAPI ++ if (likely(RTL_NETIF_RX_SCHEDULE_PREP(dev, &r8125napi->napi))) { ++ rtl8125_disable_hw_interrupt_v2(tp, message_id); ++ __RTL_NETIF_RX_SCHEDULE(dev, &r8125napi->napi); ++ } else if (netif_msg_intr(tp)) ++ printk(KERN_INFO "%s: interrupt message id %d in poll_msix\n", ++ dev->name, message_id); ++ rtl8125_clear_hw_isr_v2(tp, message_id); ++#else ++ rtl8125_disable_hw_interrupt_v2(tp, message_id); ++ ++ rtl8125_clear_hw_isr_v2(tp, message_id); ++ ++ rtl8125_tx_interrupt_with_vector(tp, message_id, ~(u32)0); ++ ++ if (message_id < tp->num_rx_rings) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], &budget); ++#else ++ rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ } ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++#endif ++ ++ } while (false); ++ ++ return IRQ_HANDLED; ++} ++ ++static void rtl8125_down(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ //rtl8125_delete_esd_timer(dev, &tp->esd_timer); ++ ++ //rtl8125_delete_link_timer(dev, &tp->link_timer); ++ ++ netif_carrier_off(dev); ++ ++ netif_tx_disable(dev); ++ ++ _rtl8125_wait_for_quiescence(dev); ++ ++ rtl8125_hw_reset(dev); ++ ++ rtl8125_tx_clear(tp); ++ ++ rtl8125_rx_clear(tp); ++} ++ ++static int rtl8125_resource_freed(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) ++ if (tp->tx_ring[i].TxDescArray) ++ return 0; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) ++ if (tp->rx_ring[i].RxDescArray) ++ return 0; ++ ++ return 1; ++} ++ ++int rtl8125_close(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!rtl8125_resource_freed(tp)) { ++ set_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ rtl8125_down(dev); ++ ++ pci_clear_master(tp->pci_dev); ++ ++#ifdef ENABLE_PTP_SUPPORT ++ rtl8125_ptp_stop(tp); ++#endif ++ rtl8125_hw_d3_para(dev); ++ ++ rtl8125_powerdown_pll(dev, 0); ++ ++ rtl8125_free_irq(tp); ++ ++ rtl8125_free_alloc_resources(tp); ++ } else { ++ rtl8125_hw_d3_para(dev); ++ ++ rtl8125_powerdown_pll(dev, 0); ++ } ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11) ++static void rtl8125_shutdown(struct pci_dev *pdev) ++{ ++ struct net_device *dev = pci_get_drvdata(pdev); ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtnl_lock(); ++ ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ rtl8125_driver_stop(tp); ++ ++ rtl8125_disable_pci_offset_180(tp); ++ ++ if (s5_keep_curr_mac == 0 && tp->random_mac == 0) ++ rtl8125_rar_set(tp, tp->org_mac_addr); ++ ++ if (s5wol == 0) ++ tp->wol_enabled = WOL_DISABLED; ++ ++ rtl8125_close(dev); ++ rtl8125_disable_msi(pdev, tp); ++ ++ rtnl_unlock(); ++ ++ if (system_state == SYSTEM_POWER_OFF) { ++ pci_clear_master(tp->pci_dev); ++ pci_wake_from_d3(pdev, tp->wol_enabled); ++ pci_set_power_state(pdev, PCI_D3hot); ++ } ++} ++#endif ++ ++#ifdef CONFIG_PM ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) ++static int ++rtl8125_suspend(struct pci_dev *pdev, u32 state) ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++static int ++rtl8125_suspend(struct device *device) ++#else ++static int ++rtl8125_suspend(struct pci_dev *pdev, pm_message_t state) ++#endif ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++ struct pci_dev *pdev = to_pci_dev(device); ++ 
struct net_device *dev = pci_get_drvdata(pdev); ++#else ++ struct net_device *dev = pci_get_drvdata(pdev); ++#endif ++ struct rtl8125_private *tp = netdev_priv(dev); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) ++ u32 pci_pm_state = pci_choose_state(pdev, state); ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev)) ++ goto out; ++ ++ set_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ netif_carrier_off(dev); ++ ++ netif_tx_disable(dev); ++ ++ netif_device_detach(dev); ++ ++#ifdef ENABLE_PTP_SUPPORT ++ rtl8125_ptp_suspend(tp); ++#endif ++ rtl8125_hw_reset(dev); ++ ++ pci_clear_master(pdev); ++ ++ rtl8125_hw_d3_para(dev); ++ ++ rtl8125_powerdown_pll(dev, 1); ++ ++out: ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ rtl8125_driver_stop(tp); ++ ++ rtnl_unlock(); ++ ++ pci_disable_device(pdev); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) ++ pci_save_state(pdev, &pci_pm_state); ++#else ++ pci_save_state(pdev); ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++ pci_enable_wake(pdev, pci_choose_state(pdev, state), tp->wol_enabled); ++#endif ++ ++ pci_prepare_to_sleep(pdev); ++ ++ return 0; ++} ++ ++static int ++rtl8125_hw_d3_not_power_off(struct net_device *dev) ++{ ++ return rtl8125_check_hw_phy_mcu_code_ver(dev); ++} ++ ++static int rtl8125_wait_phy_nway_complete_sleep(struct rtl8125_private *tp) ++{ ++ int i, val; ++ ++ for (i = 0; i < 30; i++) { ++ val = rtl8125_mdio_read(tp, MII_BMSR) & BMSR_ANEGCOMPLETE; ++ if (val) ++ return 0; ++ ++ mdelay(100); ++ } ++ ++ return -1; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++static int ++rtl8125_resume(struct pci_dev *pdev) ++#else ++static int ++rtl8125_resume(struct device *device) ++#endif ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++ struct pci_dev *pdev = to_pci_dev(device); ++ struct net_device *dev = pci_get_drvdata(pdev); ++#else ++ struct net_device *dev = pci_get_drvdata(pdev); ++#endif ++ struct rtl8125_private *tp = netdev_priv(dev); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) ++ u32 pci_pm_state = PCI_D0; ++#endif ++ unsigned long flags; ++ u32 err; ++ ++ rtnl_lock(); ++ ++ err = pci_enable_device(pdev); ++ if (err) { ++ dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n"); ++ goto out_unlock; ++ } ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) ++ pci_restore_state(pdev, &pci_pm_state); ++#else ++ pci_restore_state(pdev); ++#endif ++ pci_enable_wake(pdev, PCI_D0, 0); ++ ++ /* restore last modified mac address */ ++ rtl8125_rar_set(tp, dev->dev_addr); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_check_hw_phy_mcu_code_ver(dev); ++ ++ tp->resume_not_chg_speed = 0; ++ if (tp->check_keep_link_speed && ++ //tp->link_ok(dev) && ++ rtl8125_hw_d3_not_power_off(dev) && ++ rtl8125_wait_phy_nway_complete_sleep(tp) == 0) ++ tp->resume_not_chg_speed = 1; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ if (!netif_running(dev)) ++ goto out_unlock; ++ ++ pci_set_master(pdev); ++ ++ rtl8125_exit_oob(dev); ++ ++ rtl8125_up(dev); ++ ++ clear_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ rtl8125_schedule_reset_work(tp); ++ ++ rtl8125_schedule_esd_work(tp); ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ rtl8125_schedule_link_work(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ //mod_timer(&tp->esd_timer, jiffies + RTL8125_ESD_TIMEOUT); ++ //mod_timer(&tp->link_timer, jiffies + RTL8125_LINK_TIMEOUT); ++out_unlock: ++ netif_device_attach(dev); ++ ++ rtnl_unlock(); ++ ++ return err; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++ ++static struct dev_pm_ops 
rtl8125_pm_ops = { ++ .suspend = rtl8125_suspend, ++ .resume = rtl8125_resume, ++ .freeze = rtl8125_suspend, ++ .thaw = rtl8125_resume, ++ .poweroff = rtl8125_suspend, ++ .restore = rtl8125_resume, ++}; ++ ++#define RTL8125_PM_OPS (&rtl8125_pm_ops) ++ ++#endif ++ ++#else /* !CONFIG_PM */ ++ ++#define RTL8125_PM_OPS NULL ++ ++#endif /* CONFIG_PM */ ++ ++static struct pci_driver rtl8125_pci_driver = { ++ .name = MODULENAME, ++ .id_table = rtl8125_pci_tbl, ++ .probe = rtl8125_init_one, ++ .remove = __devexit_p(rtl8125_remove_one), ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11) ++ .shutdown = rtl8125_shutdown, ++#endif ++#ifdef CONFIG_PM ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++ .suspend = rtl8125_suspend, ++ .resume = rtl8125_resume, ++#else ++ .driver.pm = RTL8125_PM_OPS, ++#endif ++#endif ++}; ++ ++static int __init ++rtl8125_init_module(void) ++{ ++ int ret = 0; ++#ifdef ENABLE_R8125_PROCFS ++ rtl8125_proc_module_init(); ++#endif ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ ++ ret = pci_register_driver(&rtl8125_pci_driver); ++#else ++ ret = pci_module_init(&rtl8125_pci_driver); ++#endif ++ ++ return ret; ++} ++ ++static void __exit ++rtl8125_cleanup_module(void) ++{ ++ pci_unregister_driver(&rtl8125_pci_driver); ++ ++#ifdef ENABLE_R8125_PROCFS ++ if (rtl8125_proc) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ remove_proc_subtree(MODULENAME, init_net.proc_net); ++#else ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) ++ remove_proc_entry(MODULENAME, init_net.proc_net); ++#else ++ remove_proc_entry(MODULENAME, proc_net); ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ rtl8125_proc = NULL; ++ } ++#endif ++} ++ ++module_init(rtl8125_init_module); ++module_exit(rtl8125_cleanup_module); +diff --git a/drivers/net/ethernet/realtek/r8125_ptp.c b/drivers/net/ethernet/realtek/r8125_ptp.c +new file mode 100755 +index 000000000000..457fa6d395d6 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_ptp.c +@@ -0,0 +1,1472 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "r8125.h" ++#include "r8125_ptp.h" ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) ++static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) ++{ ++ return *(const struct timespec *)&ts64; ++} ++ ++static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) ++{ ++ return *(const struct timespec64 *)&ts; ++} ++#endif ++ ++static int _rtl8125_mac_phc_gettime(struct rtl8125_private *tp, struct timespec64 *ts64) ++{ ++ //get local time ++ RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_LATCHED_LOCAL_TIME | PTP_EXEC_CMD)); ++ ++ /* nanoseconds */ ++ //0x6808[29:0] ++ ts64->tv_nsec = (RTL_R32(tp, PTP_SOFT_CONFIG_Time_NS_8125) & 0x3fffffff); ++ ++ /* seconds */ ++ //0x680C[47:0] ++ ts64->tv_sec = RTL_R16(tp, PTP_SOFT_CONFIG_Time_S_8125 + 4); ++ ts64->tv_sec <<= 32; ++ ts64->tv_sec |= RTL_R32(tp, PTP_SOFT_CONFIG_Time_S_8125); ++ ++ return 0; ++} ++ ++static void rtl8125_wait_phy_clkadj_ready(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & CLKADJ_MODE_SET)) ++ break; ++} ++ ++static void rtl8125_phy_set_clkadj_mode(struct rtl8125_private *tp, u16 cmd) ++{ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ CLKADJ_MODE_SET | cmd); ++ ++ rtl8125_wait_phy_clkadj_ready(tp); ++} ++ ++static int _rtl8125_phy_phc_gettime(struct rtl8125_private *tp, struct timespec64 *ts64) ++{ ++ unsigned long flags; ++ int i; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ //Direct Read ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | DIRECT_READ)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & PTP_CLKADJ_MODE_SET)) ++ break; ++ } ++ ++ /* nanoseconds */ ++ //Ns[29:16] E414[13:0] ++ ts64->tv_nsec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_NS_HI_8126) & 0x3fff; ++ ts64->tv_nsec <<= 16; ++ //Ns[15:0] E412[15:0] ++ ts64->tv_nsec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_NS_LO_8126); ++ ++ ++ /* seconds */ ++ //S[47:32] E41A[15:0] ++ ts64->tv_sec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_HI_8126); ++ ts64->tv_sec <<= 16; ++ //S[31:16] E418[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_MI_8126); ++ ts64->tv_sec <<= 16; ++ //S[15:0] E416[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_LO_8126); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++static int _rtl8125_mac_phc_settime(struct rtl8125_private *tp, const struct timespec64 *ts64) ++{ ++ /* nanoseconds */ ++ //0x6808[29:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_NS_8125, (ts64->tv_nsec & 0x3fffffff)); ++ ++ /* seconds */ ++ //0x680C[47:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_S_8125, ts64->tv_sec); ++ RTL_W16(tp, PTP_SOFT_CONFIG_Time_S_8125 + 4, (ts64->tv_sec >> 32)); ++ ++ //set local time ++ RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_SET_LOCAL_TIME | PTP_EXEC_CMD)); ++ ++ return 0; ++} ++ ++static int _rtl8125_phy_phc_settime(struct rtl8125_private *tp, const struct timespec64 *ts64) ++{ ++ unsigned long flags; ++ int i; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* nanoseconds 
*/ ++ //Ns[15:0] E412[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_LO_8126, ts64->tv_nsec); ++ //Ns[29:16] E414[13:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_HI_8126, (ts64->tv_nsec & 0x3fff0000) >> 16); ++ ++ /* seconds */ ++ //S[15:0] E416[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_LO_8126, ts64->tv_sec); ++ //S[31:16] E418[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_MI_8126, (ts64->tv_sec >> 16)); ++ //S[47:32] E41A[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_HI_8126, (ts64->tv_sec >> 32)); ++ ++ //Direct Write ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | DIRECT_WRITE)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & PTP_CLKADJ_MODE_SET)) ++ break; ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++static int _rtl8125_mac_phc_adjtime(struct rtl8125_private *tp, s64 delta) ++{ ++ struct timespec64 d; ++ bool negative = false; ++ u64 tohw; ++ u32 nsec; ++ u64 sec; ++ ++ if (delta < 0) { ++ negative = true; ++ tohw = -delta; ++ } else { ++ tohw = delta; ++ } ++ ++ d = ns_to_timespec64(tohw); ++ ++ nsec = d.tv_nsec; ++ sec = d.tv_sec; ++ ++ if (negative) { ++ nsec = -nsec; ++ sec = -sec; ++ } ++ ++ nsec &= 0x3fffffff; ++ sec &= 0x0000ffffffffffff; ++ ++ if (negative) { ++ nsec |= PTP_SOFT_CONFIG_TIME_NS_NEGATIVE; ++ sec |= PTP_SOFT_CONFIG_TIME_S_NEGATIVE; ++ } ++ ++ /* nanoseconds */ ++ //0x6808[29:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_NS_8125, nsec); ++ ++ /* seconds */ ++ //0x680C[47:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_S_8125, sec); ++ RTL_W16(tp, PTP_SOFT_CONFIG_Time_S_8125 + 4, (sec >> 32)); ++ ++ //adjust local time ++ //RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_DRIFT_LOCAL_TIME | PTP_EXEC_CMD)); ++ RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_SET_LOCAL_TIME | PTP_EXEC_CMD)); ++ ++ return 0; ++} ++ ++static int rtl8125_mac_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc adjust time\n"); ++ ++ rtnl_lock(); ++ ret = _rtl8125_mac_phc_adjtime(tp, delta); ++ rtnl_unlock(); ++ ++ return ret; ++} ++ ++static int _rtl8125_phy_phc_adjtime(struct rtl8125_private *tp, s64 delta) ++{ ++ unsigned long flags; ++ struct timespec64 d; ++ bool negative = false; ++ int i; ++ u64 tohw; ++ u32 nsec; ++ u64 sec; ++ ++ if (delta < 0) { ++ negative = true; ++ tohw = -delta; ++ } else { ++ tohw = delta; ++ } ++ ++ d = ns_to_timespec64(tohw); ++ ++ nsec = d.tv_nsec; ++ sec = d.tv_sec; ++ ++ nsec &= 0x3fffffff; ++ sec &= 0x0000ffffffffffff; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* nanoseconds */ ++ //Ns[15:0] E412[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_LO_8126, nsec); ++ //Ns[29:16] E414[13:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_HI_8126, (nsec >> 16)); ++ ++ /* seconds */ ++ //S[15:0] E416[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_LO_8126, sec); ++ //S[31:16] E418[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_MI_8126, (sec >> 16)); ++ //S[47:32] E41A[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_HI_8126, (sec >> 32)); ++ ++ if (negative) ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | DECREMENT_STEP)); ++ else ++ 
rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | INCREMENT_STEP)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & PTP_CLKADJ_MODE_SET)) ++ break; ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++static int rtl8125_phy_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc adjust time\n"); ++ ++ ret = _rtl8125_phy_phc_adjtime(tp, delta); ++ ++ return ret; ++} ++ ++/* ++ * 1ppm means every 125MHz plus 125Hz. It also means every 8ns minus 8ns*10^(-6) ++ * 1ns=2^30 sub_ns ++ * 8ns*10^(-6) = 8 * 2^30 sub_ns * 10^(-6) = 2^33 sub_ns * 10^(-6) = 8590 = 0x218E sub_ns ++ * ++ * 1ppb means every 125MHz plus 0.125Hz. It also means every 8ns minus 8ns*10^(-9) ++ * 1ns=2^30 sub_ns ++ * 8ns*10^(-9) = 8 * 2^30 sub_ns * 10^(-9) = 2^33 sub_ns * 10^(-9) = 8.59 sub_ns = 9 sub_ns ++ */ ++static int _rtl8125_mac_phc_adjfreq(struct ptp_clock_info *ptp, s32 ppb) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ bool negative = false; ++ u32 sub_ns; ++ ++ if (ppb < 0) { ++ negative = true; ++ ppb = -ppb; ++ } ++ ++ sub_ns = ppb * 9; ++ if (negative) { ++ sub_ns = -sub_ns; ++ sub_ns &= 0x3fffffff; ++ sub_ns |= PTP_ADJUST_TIME_NS_NEGATIVE; ++ } else ++ sub_ns &= 0x3fffffff; ++ ++ /* nanoseconds */ ++ //0x6808[29:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_NS_8125, sub_ns); ++ ++ //adjust local time ++ RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_DRIFT_LOCAL_TIME | PTP_EXEC_CMD)); ++ //RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_SET_LOCAL_TIME | PTP_EXEC_CMD)); ++ ++ return 0; ++} ++ ++/* ++ * delta = delta * 10^6 ppm = delta * 10^9 ppb (in this equation ppm and ppb are not variable) ++ * ++ * in adjfreq ppb is a variable ++ * ppb = delta * 10^9 ++ * delta = ppb / 10^9 ++ * rate_value = |delta| * 2^32 = |ppb| / 10^9 * 2^32 = (|ppb| << 32) / 10^9 ++ */ ++static int _rtl8125_phy_phc_adjfreq(struct ptp_clock_info *ptp, s32 ppb) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ unsigned long flags; ++ u32 rate_value; ++ ++ if (ppb < 0) { ++ rate_value = ((u64)-ppb << 32) / 1000000000; ++ rate_value = ~rate_value + 1; ++ } else ++ rate_value = ((u64)ppb << 32) / 1000000000; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* nanoseconds */ ++ //Ns[15:0] E412[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_LO_8126, rate_value); ++ //Ns[22:16] E414[13:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_HI_8126, (rate_value & 0x003f0000) >> 16); ++ ++ rtl8125_phy_set_clkadj_mode(tp, RATE_WRITE); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) ++static int rtl8125_mac_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) ++{ ++ s32 ppb = scaled_ppm_to_ppb(scaled_ppm); ++ ++ if (ppb > ptp->max_adj || ppb < -ptp->max_adj) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ _rtl8125_mac_phc_adjfreq(ptp, ppb); ++ rtnl_unlock(); ++ ++ return 0; ++} ++#else ++static int rtl8125_mac_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta) ++{ ++ //struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ ++ //netif_info(tp, drv, tp->dev, "phc adjust freq\n"); ++ ++ if (delta > 
ptp->max_adj || delta < -ptp->max_adj) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ _rtl8125_mac_phc_adjfreq(ptp, delta); ++ rtnl_unlock(); ++ ++ return 0; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++static int rtl8125_mac_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts64, ++ struct ptp_system_timestamp *sts) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc get ts\n"); ++ ++ rtnl_lock(); ++ ptp_read_system_prets(sts); ++ ret = _rtl8125_mac_phc_gettime(tp, ts64); ++ ptp_read_system_postts(sts); ++ rtnl_unlock(); ++ ++ return ret; ++} ++#else ++static int rtl8125_mac_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts64) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc get ts\n"); ++ ++ rtnl_lock(); ++ ret = _rtl8125_mac_phc_gettime(tp, ts64); ++ rtnl_unlock(); ++ ++ return ret; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) ++static int rtl8125_phy_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) ++{ ++ s32 ppb = scaled_ppm_to_ppb(scaled_ppm); ++ ++ if (ppb > ptp->max_adj || ppb < -ptp->max_adj) ++ return -EINVAL; ++ ++ _rtl8125_phy_phc_adjfreq(ptp, ppb); ++ ++ return 0; ++} ++ ++#else ++static int rtl8125_phy_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta) ++{ ++ //netif_info(tp, drv, tp->dev, "phc adjust freq\n"); ++ ++ if (delta > ptp->max_adj || delta < -ptp->max_adj) ++ return -EINVAL; ++ ++ _rtl8125_phy_phc_adjfreq(ptp, delta); ++ ++ return 0; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++static int rtl8125_phy_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts64, ++ struct ptp_system_timestamp *sts) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc get ts\n"); ++ ++ ptp_read_system_prets(sts); ++ ret = _rtl8125_phy_phc_gettime(tp, ts64); ++ ptp_read_system_postts(sts); ++ ++ return ret; ++} ++#else ++static int rtl8125_phy_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts64) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc get ts\n"); ++ ++ ret = _rtl8125_phy_phc_gettime(tp, ts64); ++ ++ return ret; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) */ ++ ++static int rtl8125_mac_phc_settime(struct ptp_clock_info *ptp, ++ const struct timespec64 *ts64) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc set ts\n"); ++ ++ rtnl_lock(); ++ ret = _rtl8125_mac_phc_settime(tp, ts64); ++ rtnl_unlock(); ++ ++ return ret; ++} ++ ++static int rtl8125_phy_phc_settime(struct ptp_clock_info *ptp, ++ const struct timespec64 *ts64) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc set ts\n"); ++ ++ ret = _rtl8125_phy_phc_settime(tp, ts64); ++ ++ return ret; ++} ++ ++static int rtl8125_mac_phc_enable(struct ptp_clock_info *ptp, ++ struct ptp_clock_request *rq, int on) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct 
rtl8125_private, ptp_clock_info); ++ u16 ptp_ctrl; ++ ++ //netif_info(tp, drv, tp->dev, "phc enable type %x on %d\n", rq->type, on); ++ ++ switch (rq->type) { ++ case PTP_CLK_REQ_PPS: ++ rtnl_lock(); ++ ptp_ctrl = RTL_R16(tp, PTP_CTRL_8125); ++ ptp_ctrl &= ~BIT_15; ++ if (on) ++ ptp_ctrl |= BIT_14; ++ else ++ ptp_ctrl &= ~BIT_14; ++ RTL_W16(tp, PTP_CTRL_8125, ptp_ctrl); ++ rtnl_unlock(); ++ return 0; ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static int rtl8125_phy_phc_enable(struct ptp_clock_info *ptp, ++ struct ptp_clock_request *rq, int on) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ unsigned long flags; ++ u16 phy_ocp_data; ++ ++ switch (rq->type) { ++ case PTP_CLK_REQ_PPS: ++ rtnl_lock(); ++ if (on) { ++ tp->pps_enable = 1; ++ rtl8125_mac_ocp_write(tp, 0xDC00, rtl8125_mac_ocp_read(tp, 0xDC00) & ~BIT_6); ++ rtl8125_mac_ocp_write(tp, 0xDC60, rtl8125_mac_ocp_read(tp, 0xDC60) | BIT_6); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* Set periodic pulse 1pps */ ++ /* E432[8:0] = 0x017d */ ++ phy_ocp_data = rtl8125_mdio_direct_read_phy_ocp(tp, 0xE432); ++ phy_ocp_data &= 0xFE00; ++ phy_ocp_data |= 0x017d; ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xE432, phy_ocp_data); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xE434, 0x7840); ++ ++ /* E436[8:0] = 0xbe */ ++ phy_ocp_data = rtl8125_mdio_direct_read_phy_ocp(tp, 0xE436); ++ phy_ocp_data &= 0xFE00; ++ phy_ocp_data |= 0xbe; ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xE436, phy_ocp_data); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xE438, 0xbc20); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ /* start hrtimer */ ++ hrtimer_start(&tp->pps_timer, 1000000000, HRTIMER_MODE_REL); ++ } else ++ tp->pps_enable = 0; ++ rtnl_unlock(); ++ return 0; ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static void rtl8125_phy_ptp_enable_config(struct rtl8125_private *tp) ++{ ++ u16 ptp_ctrl; ++ ++ if (tp->syncE_en) ++ rtl8125_set_eth_phy_ocp_bit(tp, PTP_SYNCE_CTL, BIT_0); ++ else ++ rtl8125_clear_eth_phy_ocp_bit(tp, PTP_SYNCE_CTL, BIT_0); ++ ++ ptp_ctrl = BIT_0 | BIT_1 | BIT_2 | BIT_3 | BIT_4 | BIT_5 | BIT_6 | BIT_7 | BIT_12; ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CTL, ptp_ctrl); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA640, BIT_15); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,11,0) ++int rtl8125_get_ts_info(struct net_device *netdev, ++ struct ethtool_ts_info *info) ++#else ++int rtl8125_get_ts_info(struct net_device *netdev, ++ struct kernel_ethtool_ts_info *info) ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,11,0) */ ++{ ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ ++ /* we always support timestamping disabled */ ++ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE); ++ ++ if (tp->HwSuppPtpVer == 0) ++ return ethtool_op_get_ts_info(netdev, info); ++ ++ info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | ++ SOF_TIMESTAMPING_RX_SOFTWARE | ++ SOF_TIMESTAMPING_SOFTWARE | ++ SOF_TIMESTAMPING_TX_HARDWARE | ++ SOF_TIMESTAMPING_RX_HARDWARE | ++ SOF_TIMESTAMPING_RAW_HARDWARE; ++ ++ if (tp->ptp_clock) ++ info->phc_index = ptp_clock_index(tp->ptp_clock); ++ else ++ info->phc_index = -1; ++ ++ info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON); ++ ++ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_SYNC) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_L4_SYNC) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_DELAY_REQ) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ); ++ ++ return 0; ++} ++ 
++static const struct ptp_clock_info rtl8125_mac_ptp_clock_info = { ++ .owner = THIS_MODULE, ++ .n_alarm = 0, ++ .n_ext_ts = 0, ++ .n_per_out = 0, ++ .n_pins = 0, ++ .pps = 1, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) ++ .adjfine = rtl8125_mac_ptp_adjfine, ++#else ++ .adjfreq = rtl8125_mac_phc_adjfreq, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) */ ++ .adjtime = rtl8125_mac_phc_adjtime, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++ .gettimex64 = rtl8125_mac_phc_gettime, ++#else ++ .gettime64 = rtl8125_mac_phc_gettime, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) */ ++ .settime64 = rtl8125_mac_phc_settime, ++ .enable = rtl8125_mac_phc_enable, ++}; ++ ++static const struct ptp_clock_info rtl8125_phy_ptp_clock_info = { ++ .owner = THIS_MODULE, ++ .n_alarm = 0, ++ .n_ext_ts = 0, ++ .n_per_out = 0, ++ .n_pins = 0, ++ .pps = 1, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) ++ .adjfine = rtl8125_phy_ptp_adjfine, ++#else ++ .adjfreq = rtl8125_phy_phc_adjfreq, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) */ ++ .adjtime = rtl8125_phy_phc_adjtime, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++ .gettimex64 = rtl8125_phy_phc_gettime, ++#else ++ .gettime64 = rtl8125_phy_phc_gettime, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) */ ++ ++ .settime64 = rtl8125_phy_phc_settime, ++ .enable = rtl8125_phy_phc_enable, ++}; ++ ++static void rtl8125_mac_ptp_egresstime(struct rtl8125_private *tp, struct timespec64 *ts64, u32 regnum) ++{ ++ /* nanoseconds */ ++ //[29:0] ++ ts64->tv_nsec = rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_NS_8125 + regnum * 16 + 2); ++ ts64->tv_nsec <<= 16; ++ ts64->tv_nsec |= rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_NS_8125 + regnum * 16); ++ ts64->tv_nsec &= 0x3fffffff; ++ ++ /* seconds */ ++ //[47:0] ++ ts64->tv_sec = rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_S_8125 + regnum * 16 + 4); ++ ts64->tv_sec <<= 16; ++ ts64->tv_sec |= rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_S_8125 + regnum * 16 + 2); ++ ts64->tv_sec <<= 16; ++ ts64->tv_sec |= rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_S_8125 + regnum * 16); ++ ts64->tv_sec &= 0x0000ffffffffffff; ++} ++ ++static u16 rtl8125_phy_ptp_get_tx_msgtype(struct rtl8125_private *tp) ++{ ++ u16 tx_ts_ready = 0; ++ int i; ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ tx_ts_ready = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_STA) & 0xF000; ++ if (tx_ts_ready) ++ break; ++ } ++ ++ switch (tx_ts_ready) { ++ case TX_TS_PDLYRSP_RDY: ++ return PTP_MSGTYPE_PDELAY_RESP; ++ case TX_TS_PDLYREQ_RDY: ++ return PTP_MSGTYPE_PDELAY_REQ; ++ case TX_TS_DLYREQ_RDY: ++ return PTP_MSGTYPE_DELAY_REQ; ++ case TX_TS_SYNC_RDY: ++ default: ++ return PTP_MSGTYPE_SYNC; ++ } ++} ++ ++/* ++static u16 rtl8125_phy_ptp_get_rx_msgtype(struct rtl8125_private *tp) ++{ ++ u16 rx_ts_ready = 0; ++ int i; ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ rx_ts_ready = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_STA) & 0x0F00; ++ if (rx_ts_ready) ++ break; ++ } ++ ++ switch (rx_ts_ready) { ++ case RX_TS_PDLYRSP_RDY: ++ return PTP_MSGTYPE_PDELAY_RESP; ++ case RX_TS_PDLYREQ_RDY: ++ return PTP_MSGTYPE_PDELAY_REQ; ++ case RX_TS_DLYREQ_RDY: ++ return PTP_MSGTYPE_DELAY_REQ; ++ case RX_TS_SYNC_RDY: ++ default: ++ return PTP_MSGTYPE_SYNC; ++ } ++} ++*/ ++ ++static void rtl8125_wait_phy_trx_ts_ready(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_STA) & TRX_TS_RD)) ++ break; ++} ++ 
++static void rtl8125_set_phy_trx_ts_cmd(struct rtl8125_private *tp, u16 cmd) ++{ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_TRX_TS_STA, ++ TRXTS_SEL | BIT_3 | BIT_2, ++ TRX_TS_RD | cmd); ++ ++ rtl8125_wait_phy_trx_ts_ready(tp); ++} ++ ++static void rtl8125_phy_ptp_egresstime(struct rtl8125_private *tp, struct timespec64 *ts64) ++{ ++ u16 msgtype; ++ ++ msgtype = rtl8125_phy_ptp_get_tx_msgtype(tp); ++ ++ msgtype <<= 2; ++ ++ rtl8125_set_phy_trx_ts_cmd(tp, (msgtype | BIT_4)); ++ ++ /* nanoseconds */ ++ //Ns[29:16] E448[13:0] ++ ts64->tv_nsec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_NS_HI) & 0x3fff; ++ ts64->tv_nsec <<= 16; ++ //Ns[15:0] E446[15:0] ++ ts64->tv_nsec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_NS_LO); ++ ++ /* seconds */ ++ //S[47:32] E44E[15:0] ++ ts64->tv_sec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_HI); ++ ts64->tv_sec <<= 16; ++ //S[31:16] E44C[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_MI); ++ ts64->tv_sec <<= 16; ++ //S[15:0] E44A[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_LO); ++} ++static void rtl8125_phy_ptp_ingresstime(struct rtl8125_private *tp, struct timespec64 *ts64, u8 type) ++{ ++ u16 msgtype; ++ ++ switch (type) { ++ case PTP_MSGTYPE_PDELAY_RESP: ++ case PTP_MSGTYPE_PDELAY_REQ: ++ case PTP_MSGTYPE_DELAY_REQ: ++ case PTP_MSGTYPE_SYNC: ++ msgtype = type << 2; ++ break; ++ default: ++ return; ++ } ++ ++ rtl8125_set_phy_trx_ts_cmd(tp, (TRXTS_SEL | msgtype | BIT_4)); ++ ++ /* nanoseconds */ ++ //Ns[29:16] E448[13:0] ++ ts64->tv_nsec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_NS_HI) & 0x3fff; ++ ts64->tv_nsec <<= 16; ++ //Ns[15:0] E446[15:0] ++ ts64->tv_nsec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_NS_LO); ++ ++ /* seconds */ ++ //S[47:32] E44E[15:0] ++ ts64->tv_sec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_HI); ++ ts64->tv_sec <<= 16; ++ //S[31:16] E44C[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_MI); ++ ts64->tv_sec <<= 16; ++ //S[15:0] E44A[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_LO); ++} ++ ++static void rtl8125_mac_ptp_tx_hwtstamp(struct rtl8125_private *tp) ++{ ++ struct sk_buff *skb = tp->ptp_tx_skb; ++ struct skb_shared_hwtstamps shhwtstamps = {0}; ++ struct timespec64 ts64; ++ u32 regnum; ++ ++ RTL_W8(tp, PTP_ISR_8125, PTP_ISR_TOK | PTP_ISR_TER); ++ ++ //IO 0x2302 bit 10~11 WR_PTR ++ regnum = RTL_R16(tp, 0x2032) & 0x0C00; ++ regnum >>= 10; ++ regnum = (regnum + 3) % 4; ++ ++ rtnl_lock(); ++ rtl8125_mac_ptp_egresstime(tp, &ts64, regnum); ++ rtnl_unlock(); ++ ++ /* Upper 32 bits contain s, lower 32 bits contain ns. */ ++ shhwtstamps.hwtstamp = ktime_set(ts64.tv_sec, ++ ts64.tv_nsec); ++ ++ /* Clear the lock early before calling skb_tstamp_tx so that ++ * applications are not woken up before the lock bit is clear. We use ++ * a copy of the skb pointer to ensure other threads can't change it ++ * while we're notifying the stack. 
++ */ ++ tp->ptp_tx_skb = NULL; ++ clear_bit_unlock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state); ++ ++ /* Notify the stack and free the skb after we've unlocked */ ++ skb_tstamp_tx(skb, &shhwtstamps); ++ dev_kfree_skb_any(skb); ++} ++ ++static void rtl8125_phy_ptp_tx_hwtstamp(struct rtl8125_private *tp) ++{ ++ struct sk_buff *skb = tp->ptp_tx_skb; ++ struct skb_shared_hwtstamps shhwtstamps = { 0 }; ++ struct timespec64 ts64; ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_INSR, TX_TX_INTR); ++ ++ rtl8125_phy_ptp_egresstime(tp, &ts64); ++ ++ /* Upper 32 bits contain s, lower 32 bits contain ns. */ ++ shhwtstamps.hwtstamp = ktime_set(ts64.tv_sec, ++ ts64.tv_nsec); ++ ++ /* Clear the lock early before calling skb_tstamp_tx so that ++ * applications are not woken up before the lock bit is clear. We use ++ * a copy of the skb pointer to ensure other threads can't change it ++ * while we're notifying the stack. ++ */ ++ tp->ptp_tx_skb = NULL; ++ clear_bit_unlock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state); ++ ++ /* Notify the stack and free the skb after we've unlocked */ ++ skb_tstamp_tx(skb, &shhwtstamps); ++ dev_kfree_skb_any(skb); ++} ++ ++#define RTL8125_PTP_TX_TIMEOUT (HZ * 15) ++static void rtl8125_mac_ptp_tx_work(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = container_of(work, struct rtl8125_private, ++ ptp_tx_work); ++ ++ if (!tp->ptp_tx_skb) ++ return; ++ ++ if (time_is_before_jiffies(tp->ptp_tx_start + ++ RTL8125_PTP_TX_TIMEOUT)) { ++ dev_kfree_skb_any(tp->ptp_tx_skb); ++ tp->ptp_tx_skb = NULL; ++ clear_bit_unlock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state); ++ tp->tx_hwtstamp_timeouts++; ++ /* Clear the tx valid bit in TSYNCTXCTL register to enable ++ * interrupt ++ */ ++ RTL_W8(tp, PTP_ISR_8125, PTP_ISR_TOK | PTP_ISR_TER); ++ return; ++ } ++ ++ if (RTL_R8(tp, PTP_ISR_8125) & (PTP_ISR_TOK)) ++ rtl8125_mac_ptp_tx_hwtstamp(tp); ++ else ++ /* reschedule to check later */ ++ schedule_work(&tp->ptp_tx_work); ++} ++ ++static void rtl8125_phy_ptp_tx_work(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = container_of(work, struct rtl8125_private, ++ ptp_tx_work); ++ unsigned long flags; ++ bool tx_intr; ++ ++ if (!tp->ptp_tx_skb) ++ return; ++ ++ if (time_is_before_jiffies(tp->ptp_tx_start + ++ RTL8125_PTP_TX_TIMEOUT)) { ++ dev_kfree_skb_any(tp->ptp_tx_skb); ++ tp->ptp_tx_skb = NULL; ++ clear_bit_unlock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state); ++ tp->tx_hwtstamp_timeouts++; ++ /* Clear the tx valid bit in TSYNCTXCTL register to enable ++ * interrupt ++ */ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_INSR, TX_TX_INTR); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ return; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ if (rtl8125_mdio_direct_read_phy_ocp(tp, PTP_INSR) & TX_TX_INTR) { ++ tx_intr = true; ++ rtl8125_phy_ptp_tx_hwtstamp(tp); ++ } else { ++ tx_intr = false; ++ } ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ if (!tx_intr) { ++ /* reschedule to check later */ ++ schedule_work(&tp->ptp_tx_work); ++ } ++} ++ ++static int rtl8125_mac_hwtstamp_enable(struct rtl8125_private *tp, bool enable) ++{ ++ RTL_W16(tp, PTP_CTRL_8125, 0); ++ if (enable) { ++ u16 ptp_ctrl; ++ struct timespec64 ts64; ++ ++ //clear ptp isr ++ RTL_W8(tp, PTP_ISR_8125, 0xff); ++ //ptp source 0:gphy 1:mac ++ rtl8125_mac_ocp_write(tp, 0xDC00, rtl8125_mac_ocp_read(tp, 0xDC00) | BIT_6); ++ //enable ptp ++ ptp_ctrl = (BIT_0 | BIT_3 | BIT_4 | BIT_6 | BIT_10 | BIT_12); ++ if (tp->ptp_master_mode) ++ ptp_ctrl |= BIT_1; ++ RTL_W16(tp, PTP_CTRL_8125, 
ptp_ctrl); ++ ++ //set system time ++ /* ++ if (ktime_to_timespec64_cond(ktime_get_real(), &ts64)) ++ _rtl8125_mac_phc_settime(tp, timespec64_to_timespec(ts64)); ++ */ ++ ktime_get_real_ts64(&ts64); ++ _rtl8125_mac_phc_settime(tp, &ts64); ++ } ++ ++ return 0; ++} ++ ++static int rtl8125_phy_hwtstamp_enable(struct rtl8125_private *tp, bool enable) ++{ ++ unsigned long flags; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (enable) { ++ //trx timestamp interrupt enable ++ rtl8125_set_eth_phy_ocp_bit(tp, PTP_INER, BIT_2 | BIT_3); ++ ++ //set isr clear mode ++ rtl8125_set_eth_phy_ocp_bit(tp, PTP_GEN_CFG, BIT_0); ++ ++ //clear ptp isr ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_INSR, 0xFFFF); ++ ++ //enable ptp ++ rtl8125_phy_ptp_enable_config(tp); ++ ++ //rtl8125_set_phy_local_time(tp); ++ } else { ++ /* trx timestamp interrupt disable */ ++ rtl8125_clear_eth_phy_ocp_bit(tp, PTP_INER, BIT_2 | BIT_3); ++ ++ /* disable ptp */ ++ rtl8125_clear_eth_phy_ocp_bit(tp, PTP_SYNCE_CTL, BIT_0); ++ rtl8125_clear_eth_phy_ocp_bit(tp, PTP_CTL, BIT_0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA640, BIT_15); ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++void rtl8125_set_phy_local_time(struct rtl8125_private *tp) ++{ ++ struct timespec64 ts64; ++ //set system time ++ ktime_get_real_ts64(&ts64); ++ _rtl8125_phy_phc_settime(tp, &ts64); ++} ++ ++static long rtl8125_ptp_create_clock(struct rtl8125_private *tp) ++{ ++ struct net_device *netdev = tp->dev; ++ long err; ++ ++ if (!IS_ERR_OR_NULL(tp->ptp_clock)) ++ return 0; ++ ++ if (tp->HwSuppPtpVer == 0) { ++ tp->ptp_clock = NULL; ++ return -EOPNOTSUPP; ++ } ++ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ tp->ptp_clock_info = rtl8125_mac_ptp_clock_info; ++ tp->ptp_clock_info.max_adj = 119304647; ++ break; ++ case 3: ++ tp->ptp_clock_info = rtl8125_phy_ptp_clock_info; ++ tp->ptp_clock_info.max_adj = 488281;//0x1FFFFF * 10^9 / 2^32 ++ break; ++ default: ++ break; ++ } ++ ++ snprintf(tp->ptp_clock_info.name, sizeof(tp->ptp_clock_info.name), ++ "%pm", tp->dev->dev_addr); ++ tp->ptp_clock = ptp_clock_register(&tp->ptp_clock_info, &tp->pci_dev->dev); ++ if (IS_ERR(tp->ptp_clock)) { ++ err = PTR_ERR(tp->ptp_clock); ++ tp->ptp_clock = NULL; ++ netif_err(tp, drv, tp->dev, "ptp_clock_register failed\n"); ++ return err; ++ } else ++ netif_info(tp, drv, tp->dev, "registered PHC device on %s\n", netdev->name); ++ ++ return 0; ++} ++ ++void rtl8125_ptp_reset(struct rtl8125_private *tp) ++{ ++ if (!tp->ptp_clock) ++ return; ++ ++ netif_info(tp, drv, tp->dev, "reset PHC clock\n"); ++ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ rtl8125_mac_hwtstamp_enable(tp, false); ++ break; ++ case 3: ++ rtl8125_phy_hwtstamp_enable(tp, false); ++ break; ++ default: ++ break; ++ } ++} ++ ++static enum hrtimer_restart ++rtl8125_phy_hrtimer_for_pps(struct hrtimer *timer) { ++ struct rtl8125_private *tp = container_of(timer, struct rtl8125_private, pps_timer); ++ s64 pps_sec; ++ u16 tai_cfg; ++ int i; ++ ++ if (tp->pps_enable) ++ { ++ switch (tp->HwSuppPtpVer) { ++ case 3: ++ tai_cfg = BIT_8 | BIT_5 | BIT_1 | BIT_0; ++ break; ++ default: ++ break; ++ } ++ ++ //Direct Read ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | DIRECT_READ)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & PTP_CLKADJ_MODE_SET)) ++ break; ++ } ++ ++ pps_sec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_HI_8126); 
++ pps_sec <<= 16; ++ pps_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_MI_8126); ++ pps_sec <<= 16; ++ pps_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_LO_8126); ++ pps_sec++; ++ ++ //E42A[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_TAI_TS_S_LO, pps_sec & 0xffff); ++ //E42C[31:16] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_TAI_TS_S_HI, (pps_sec & 0xffff0000) >> 16); ++ //Periodic Tai start ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_TAI_CFG, tai_cfg); ++ ++ hrtimer_forward_now(&tp->pps_timer, 1000000000); //rekick ++ return HRTIMER_RESTART; ++ } else ++ return HRTIMER_NORESTART; ++} ++ ++void rtl8125_ptp_init(struct rtl8125_private *tp) ++{ ++ /* obtain a PTP device, or re-use an existing device */ ++ if (rtl8125_ptp_create_clock(tp)) ++ return; ++ ++ /* we have a clock so we can initialize work now */ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ INIT_WORK(&tp->ptp_tx_work, rtl8125_mac_ptp_tx_work); ++ break; ++ case 3: ++ INIT_WORK(&tp->ptp_tx_work, rtl8125_phy_ptp_tx_work); ++ break; ++ default: ++ break; ++ } ++ ++ /* init a hrtimer for pps */ ++ switch (tp->HwSuppPtpVer) { ++ case 3: ++ tp->pps_enable = 0; ++ hrtimer_init(&tp->pps_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ tp->pps_timer.function = rtl8125_phy_hrtimer_for_pps; ++ break; ++ default: ++ break; ++ } ++ ++ /* reset the PTP related hardware bits */ ++ rtl8125_ptp_reset(tp); ++ ++ return; ++} ++ ++void rtl8125_ptp_suspend(struct rtl8125_private *tp) ++{ ++ if (!tp->ptp_clock) ++ return; ++ ++ netif_info(tp, drv, tp->dev, "suspend PHC clock\n"); ++ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ rtl8125_mac_hwtstamp_enable(tp, false); ++ break; ++ case 3: ++ rtl8125_phy_hwtstamp_enable(tp, false); ++ break; ++ default: ++ break; ++ } ++ ++ /* ensure that we cancel any pending PTP Tx work item in progress */ ++ cancel_work_sync(&tp->ptp_tx_work); ++ ++ switch (tp->HwSuppPtpVer) { ++ case 3: ++ hrtimer_cancel(&tp->pps_timer); ++ break; ++ default: ++ break; ++ } ++} ++ ++void rtl8125_ptp_stop(struct rtl8125_private *tp) ++{ ++ struct net_device *netdev = tp->dev; ++ ++ netif_info(tp, drv, tp->dev, "stop PHC clock\n"); ++ ++ /* first, suspend PTP activity */ ++ rtl8125_ptp_suspend(tp); ++ ++ /* disable the PTP clock device */ ++ if (tp->ptp_clock) { ++ ptp_clock_unregister(tp->ptp_clock); ++ tp->ptp_clock = NULL; ++ netif_info(tp, drv, tp->dev, "removed PHC on %s\n", ++ netdev->name); ++ } ++} ++ ++static int rtl8125_set_tstamp(struct net_device *netdev, struct ifreq *ifr) ++{ ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ struct hwtstamp_config config; ++ bool hwtstamp = 0; ++ ++ //netif_info(tp, drv, tp->dev, "ptp set ts\n"); ++ ++ if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) ++ return -EFAULT; ++ ++ if (config.flags) ++ return -EINVAL; ++ ++ switch (config.tx_type) { ++ case HWTSTAMP_TX_ON: ++ hwtstamp = 1; ++ break; ++ case HWTSTAMP_TX_OFF: ++ break; ++ case HWTSTAMP_TX_ONESTEP_SYNC: ++ default: ++ return -ERANGE; ++ } ++ ++ switch (config.rx_filter) { ++ case HWTSTAMP_FILTER_PTP_V2_EVENT: ++ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: ++ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: ++ case HWTSTAMP_FILTER_PTP_V2_SYNC: ++ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: ++ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: ++ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: ++ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: ++ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: ++ config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; ++ hwtstamp = 1; ++ tp->flags |= RTL_FLAG_RX_HWTSTAMP_ENABLED; ++ break; ++ case HWTSTAMP_FILTER_NONE: ++ 
tp->flags &= ~RTL_FLAG_RX_HWTSTAMP_ENABLED; ++ break; ++ default: ++ tp->flags &= ~RTL_FLAG_RX_HWTSTAMP_ENABLED; ++ return -ERANGE; ++ } ++ ++ if (tp->hwtstamp_config.tx_type != config.tx_type || ++ tp->hwtstamp_config.rx_filter != config.rx_filter) { ++ tp->hwtstamp_config = config; ++ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ rtl8125_mac_hwtstamp_enable(tp, hwtstamp); ++ break; ++ case 3: ++ rtl8125_phy_hwtstamp_enable(tp, hwtstamp); ++ break; ++ default: ++ break; ++ } ++ } ++ ++ return copy_to_user(ifr->ifr_data, &config, ++ sizeof(config)) ? -EFAULT : 0; ++} ++ ++static int rtl8125_get_tstamp(struct net_device *netdev, struct ifreq *ifr) ++{ ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ ++ //netif_info(tp, drv, tp->dev, "ptp get ts\n"); ++ ++ return copy_to_user(ifr->ifr_data, &tp->hwtstamp_config, ++ sizeof(tp->hwtstamp_config)) ? -EFAULT : 0; ++} ++ ++int rtl8125_ptp_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) ++{ ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "ptp ioctl\n"); ++ ++ switch (cmd) { ++#ifdef ENABLE_PTP_SUPPORT ++ case SIOCSHWTSTAMP: ++ ret = rtl8125_set_tstamp(netdev, ifr); ++ break; ++ case SIOCGHWTSTAMP: ++ ret = rtl8125_get_tstamp(netdev, ifr); ++ break; ++#endif ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ return ret; ++} ++ ++void rtl8125_rx_mac_ptp_pktstamp(struct rtl8125_private *tp, struct sk_buff *skb, ++ struct RxDescV3 *descv3) ++{ ++ time64_t tv_sec; ++ long tv_nsec; ++ ++ tv_sec = le32_to_cpu(descv3->RxDescTimeStamp.TimeStampHigh) + ++ ((u64)le32_to_cpu(descv3->RxDescPTPDDWord4.TimeStampHHigh) << 32); ++ tv_nsec = le32_to_cpu(descv3->RxDescTimeStamp.TimeStampLow); ++ ++ skb_hwtstamps(skb)->hwtstamp = ktime_set(tv_sec, tv_nsec); ++} ++ ++static void rtl8125_rx_phy_ptp_pktstamp(struct rtl8125_private *tp, struct sk_buff *skb, u8 type) ++{ ++ struct timespec64 ts64; ++ unsigned long flags; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_phy_ptp_ingresstime(tp, &ts64, type); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ skb_hwtstamps(skb)->hwtstamp = ktime_set(ts64.tv_sec, ts64.tv_nsec); ++ ++ return; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0) ++static struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) ++{ ++ u8 *ptr = skb_mac_header(skb); ++ ++ if (type & PTP_CLASS_VLAN) ++ //ptr += VLAN_HLEN; ++ ptr += 4; ++ ++ switch (type & PTP_CLASS_PMASK) { ++ case PTP_CLASS_IPV4: ++ ptr += IPV4_HLEN(ptr) + UDP_HLEN; ++ break; ++ case PTP_CLASS_IPV6: ++ ptr += IP6_HLEN + UDP_HLEN; ++ break; ++ case PTP_CLASS_L2: ++ break; ++ default: ++ return NULL; ++ } ++ ++ ptr += ETH_HLEN; ++ ++ /* Ensure that the entire header is present in this packet. 
*/ ++ if (ptr + sizeof(struct ptp_header) > skb->data + skb->len) ++ return NULL; ++ ++ return (struct ptp_header *)ptr; ++} ++ ++static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, ++ unsigned int type) ++{ ++ u8 msgtype; ++ ++ if (unlikely(type & PTP_CLASS_V1)) { ++ /* msg type is located at the control field for ptp v1 */ ++ msgtype = hdr->control; ++ } else { ++ msgtype = hdr->tsmt & 0x0f; ++ } ++ ++ return msgtype; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0) */ ++ ++void rtl8125_rx_phy_ptp_timestamp(struct rtl8125_private *tp, struct sk_buff *skb) ++{ ++ unsigned int ptp_class; ++ struct ptp_header *hdr; ++ u8 msgtype; ++ ++ ptp_class = ptp_classify_raw(skb); ++ if (ptp_class == PTP_CLASS_NONE) ++ return; ++ ++ skb_reset_mac_header(skb); ++ hdr = ptp_parse_header(skb, ptp_class); ++ if (unlikely(!hdr)) ++ return; ++ ++ msgtype = ptp_get_msgtype(hdr, ptp_class); ++ rtl8125_rx_phy_ptp_pktstamp(tp, skb, msgtype); ++ ++ return; ++} +diff --git a/drivers/net/ethernet/realtek/r8125_ptp.h b/drivers/net/ethernet/realtek/r8125_ptp.h +new file mode 100755 +index 000000000000..3cd8b677fd60 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_ptp.h +@@ -0,0 +1,159 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#ifndef _LINUX_rtl8125_PTP_H ++#define _LINUX_rtl8125_PTP_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,11,0) ++#define PTP_MSGTYPE_SYNC 0x0 ++#define PTP_MSGTYPE_DELAY_REQ 0x1 ++#define PTP_MSGTYPE_PDELAY_REQ 0x2 ++#define PTP_MSGTYPE_PDELAY_RESP 0x3 ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5,11,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0) ++struct clock_identity { ++ u8 id[8]; ++} __packed; ++ ++struct port_identity { ++ struct clock_identity clock_identity; ++ __be16 port_number; ++} __packed; ++ ++struct ptp_header { ++ u8 tsmt; /* transportSpecific | messageType */ ++ u8 ver; /* reserved | versionPTP */ ++ __be16 message_length; ++ u8 domain_number; ++ u8 reserved1; ++ u8 flag_field[2]; ++ __be64 correction; ++ __be32 reserved2; ++ struct port_identity source_port_identity; ++ __be16 sequence_id; ++ u8 control; ++ u8 log_message_interval; ++} __packed; ++ ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0) */ ++ ++struct rtl8125_ptp_info { ++ s64 time_sec; ++ u32 time_ns; ++ u16 ts_info; ++}; ++ ++#ifndef _STRUCT_TIMESPEC ++#define _STRUCT_TIMESPEC ++struct timespec { ++ __kernel_old_time_t tv_sec; /* seconds */ ++ long tv_nsec; /* nanoseconds */ ++}; ++#endif ++ ++enum PTP_CMD_TYPE { ++ PTP_CMD_SET_LOCAL_TIME = 0, ++ PTP_CMD_DRIFT_LOCAL_TIME, ++ PTP_CMD_LATCHED_LOCAL_TIME, ++}; ++ ++#define PTP_CLKADJ_MODE_SET BIT_0 ++ ++enum PTP_CLKADJ_MOD_TYPE { ++ NO_FUNCTION = 0, ++ CLKADJ_MODE_SET = 1, ++ RESERVED = 2, ++ DIRECT_READ = 4, ++ DIRECT_WRITE = 6, ++ INCREMENT_STEP = 8, ++ DECREMENT_STEP = 10, ++ RATE_READ = 12, ++ RATE_WRITE = 14, ++}; ++ ++enum PTP_INSR_TYPE { ++ EVENT_CAP_INTR = (1 << 0), ++ TRIG_GEN_INTR = (1 << 1), ++ RX_TS_INTR = (1 << 2), ++ TX_TX_INTR = (1 << 3), ++}; ++ ++enum PTP_TRX_TS_STA_REG { ++ TRX_TS_RD = (1 << 0), ++ TRXTS_SEL = (1 << 1), ++ RX_TS_PDLYRSP_RDY = (1 << 8), ++ RX_TS_PDLYREQ_RDY = (1 << 9), ++ RX_TS_DLYREQ_RDY = (1 << 10), ++ RX_TS_SYNC_RDY = (1 << 11), ++ TX_TS_PDLYRSP_RDY = (1 << 12), ++ TX_TS_PDLYREQ_RDY = (1 << 13), ++ TX_TS_DLYREQ_RDY = (1 << 14), ++ TX_TS_SYNC_RDY = (1 << 15), ++}; ++ ++#define RTL_FLAG_RX_HWTSTAMP_ENABLED BIT_0 ++ ++struct rtl8125_private; ++struct RxDescV3; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,11,0) ++int rtl8125_get_ts_info(struct net_device *netdev, ++ struct ethtool_ts_info *info); ++#else ++int rtl8125_get_ts_info(struct net_device *netdev, ++ struct kernel_ethtool_ts_info *info); ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,11,0) */ ++ ++void rtl8125_ptp_reset(struct rtl8125_private *tp); ++void rtl8125_ptp_init(struct rtl8125_private *tp); ++void rtl8125_ptp_suspend(struct rtl8125_private *tp); ++void rtl8125_ptp_stop(struct rtl8125_private *tp); ++ ++int rtl8125_ptp_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd); ++ ++void rtl8125_rx_mac_ptp_pktstamp(struct rtl8125_private *tp, struct sk_buff *skb, ++ struct RxDescV3 *descv3); ++ ++void rtl8125_set_phy_local_time(struct rtl8125_private *tp); ++ ++void rtl8125_rx_phy_ptp_timestamp(struct rtl8125_private *tp, struct sk_buff *skb); ++ ++#endif /* _LINUX_rtl8125_PTP_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_realwow.h b/drivers/net/ethernet/realtek/r8125_realwow.h +new file mode 100755 +index 000000000000..4b2315ebbb62 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_realwow.h +@@ -0,0 +1,118 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* 
++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_R8125_REALWOW_H ++#define _LINUX_R8125_REALWOW_H ++ ++#define SIOCDEVPRIVATE_RTLREALWOW SIOCDEVPRIVATE+3 ++ ++#define MAX_RealWoW_KCP_SIZE (100) ++#define MAX_RealWoW_Payload (64) ++ ++#define KA_TX_PACKET_SIZE (100) ++#define KA_WAKEUP_PATTERN_SIZE (120) ++ ++//HwSuppKeepAliveOffloadVer ++#define HW_SUPPORT_KCP_OFFLOAD(_M) ((_M)->HwSuppKCPOffloadVer > 0) ++ ++enum rtl_realwow_cmd { ++ ++ RTL_REALWOW_SET_KCP_DISABLE=0, ++ RTL_REALWOW_SET_KCP_INFO, ++ RTL_REALWOW_SET_KCP_CONTENT, ++ ++ RTL_REALWOW_SET_KCP_ACKPKTINFO, ++ RTL_REALWOW_SET_KCP_WPINFO, ++ RTL_REALWOW_SET_KCPDHCP_TIMEOUT, ++ ++ RTLT_REALWOW_COMMAND_INVALID ++}; ++ ++struct rtl_realwow_ioctl_struct { ++ __u32 cmd; ++ __u32 offset; ++ __u32 len; ++ union { ++ __u32 data; ++ void *data_buffer; ++ }; ++}; ++ ++typedef struct _MP_KCPInfo { ++ u8 DIPv4[4]; ++ u8 MacID[6]; ++ u16 UdpPort[2]; ++ u8 PKTLEN[2]; ++ ++ u16 ackLostCnt; ++ u8 KCP_WakePattern[MAX_RealWoW_Payload]; ++ u8 KCP_AckPacket[MAX_RealWoW_Payload]; ++ u32 KCP_interval; ++ u8 KCP_WakePattern_Len; ++ u8 KCP_AckPacket_Len; ++ u8 KCP_TxPacket[2][KA_TX_PACKET_SIZE]; ++} MP_KCP_INFO, *PMP_KCP_INFO; ++ ++typedef struct _KCPInfo { ++ u32 nId; // = id ++ u8 DIPv4[4]; ++ u8 MacID[6]; ++ u16 UdpPort; ++ u16 PKTLEN; ++} KCPInfo, *PKCPInfo; ++ ++typedef struct _KCPContent { ++ u32 id; // = id ++ u32 mSec; // = msec ++ u32 size; // =size ++ u8 bPacket[MAX_RealWoW_KCP_SIZE]; // put packet here ++} KCPContent, *PKCPContent; ++ ++typedef struct _RealWoWAckPktInfo { ++ u16 ackLostCnt; ++ u16 patterntSize; ++ u8 pattern[MAX_RealWoW_Payload]; ++} RealWoWAckPktInfo,*PRealWoWAckPktInfo; ++ ++typedef struct _RealWoWWPInfo { ++ u16 patterntSize; ++ u8 pattern[MAX_RealWoW_Payload]; ++} RealWoWWPInfo,*PRealWoWWPInfo; ++ ++int rtl8125_realwow_ioctl(struct net_device *dev, struct ifreq *ifr); ++void rtl8125_realwow_hw_init(struct net_device *dev); ++void rtl8125_get_realwow_hw_version(struct net_device *dev); ++void rtl8125_set_realwow_d3_para(struct net_device *dev); ++ ++#endif /* _LINUX_R8125_REALWOW_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_rss.c 
b/drivers/net/ethernet/realtek/r8125_rss.c +new file mode 100755 +index 000000000000..bcdcab01a6ab +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_rss.c +@@ -0,0 +1,583 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include "r8125.h" ++ ++enum rtl8125_rss_register_content { ++ /* RSS */ ++ RSS_CTRL_TCP_IPV4_SUPP = (1 << 0), ++ RSS_CTRL_IPV4_SUPP = (1 << 1), ++ RSS_CTRL_TCP_IPV6_SUPP = (1 << 2), ++ RSS_CTRL_IPV6_SUPP = (1 << 3), ++ RSS_CTRL_IPV6_EXT_SUPP = (1 << 4), ++ RSS_CTRL_TCP_IPV6_EXT_SUPP = (1 << 5), ++ RSS_HALF_SUPP = (1 << 7), ++ RSS_CTRL_UDP_IPV4_SUPP = (1 << 11), ++ RSS_CTRL_UDP_IPV6_SUPP = (1 << 12), ++ RSS_CTRL_UDP_IPV6_EXT_SUPP = (1 << 13), ++ RSS_QUAD_CPU_EN = (1 << 16), ++ RSS_HQ_Q_SUP_R = (1 << 31), ++}; ++ ++static int rtl8125_get_rss_hash_opts(struct rtl8125_private *tp, ++ struct ethtool_rxnfc *cmd) ++{ ++ cmd->data = 0; ++ ++ /* Report default options for RSS */ ++ switch (cmd->flow_type) { ++ case TCP_V4_FLOW: ++ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; ++ fallthrough; ++ case UDP_V4_FLOW: ++ if (tp->rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV4) ++ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; ++ fallthrough; ++ case IPV4_FLOW: ++ cmd->data |= RXH_IP_SRC | RXH_IP_DST; ++ break; ++ case TCP_V6_FLOW: ++ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; ++ fallthrough; ++ case UDP_V6_FLOW: ++ if (tp->rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV6) ++ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; ++ fallthrough; ++ case IPV6_FLOW: ++ cmd->data |= RXH_IP_SRC | RXH_IP_DST; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int rtl8125_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret = -EOPNOTSUPP; ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return ret; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_GRXRINGS: ++ cmd->data = rtl8125_tot_rx_rings(tp); ++ ret = 0; ++ break; ++ case ETHTOOL_GRXFH: ++ ret = rtl8125_get_rss_hash_opts(tp, cmd); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++u32 rtl8125_rss_indir_tbl_entries(struct rtl8125_private 
*tp) ++{ ++ return tp->HwSuppIndirTblEntries; ++} ++ ++#define RSS_MASK_BITS_OFFSET (8) ++#define RSS_CPU_NUM_OFFSET (16) ++#define RTL8125_UDP_RSS_FLAGS (RTL_8125_RSS_FLAG_HASH_UDP_IPV4 | \ ++ RTL_8125_RSS_FLAG_HASH_UDP_IPV6) ++static int _rtl8125_set_rss_hash_opt(struct rtl8125_private *tp) ++{ ++ u32 rss_flags = tp->rss_flags; ++ u32 hash_mask_len; ++ u32 rss_ctrl; ++ ++ rss_ctrl = ilog2(rtl8125_tot_rx_rings(tp)); ++ rss_ctrl &= (BIT_0 | BIT_1 | BIT_2); ++ rss_ctrl <<= RSS_CPU_NUM_OFFSET; ++ ++ /* Perform hash on these packet types */ ++ rss_ctrl |= RSS_CTRL_TCP_IPV4_SUPP ++ | RSS_CTRL_IPV4_SUPP ++ | RSS_CTRL_IPV6_SUPP ++ | RSS_CTRL_IPV6_EXT_SUPP ++ | RSS_CTRL_TCP_IPV6_SUPP ++ | RSS_CTRL_TCP_IPV6_EXT_SUPP; ++ ++ if (rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV4) ++ rss_ctrl |= RSS_CTRL_UDP_IPV4_SUPP; ++ ++ if (rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV6) ++ rss_ctrl |= RSS_CTRL_UDP_IPV6_SUPP | ++ RSS_CTRL_UDP_IPV6_EXT_SUPP; ++ ++ hash_mask_len = ilog2(rtl8125_rss_indir_tbl_entries(tp)); ++ hash_mask_len &= (BIT_0 | BIT_1 | BIT_2); ++ rss_ctrl |= hash_mask_len << RSS_MASK_BITS_OFFSET; ++ ++ RTL_W32(tp, RSS_CTRL_8125, rss_ctrl); ++ ++ return 0; ++} ++ ++static int rtl8125_set_rss_hash_opt(struct rtl8125_private *tp, ++ struct ethtool_rxnfc *nfc) ++{ ++ u32 rss_flags = tp->rss_flags; ++ ++ /* ++ * RSS does not support anything other than hashing ++ * to queues on src and dst IPs and ports ++ */ ++ if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST | ++ RXH_L4_B_0_1 | RXH_L4_B_2_3)) ++ return -EINVAL; ++ ++ switch (nfc->flow_type) { ++ case TCP_V4_FLOW: ++ case TCP_V6_FLOW: ++ if (!(nfc->data & RXH_IP_SRC) || ++ !(nfc->data & RXH_IP_DST) || ++ !(nfc->data & RXH_L4_B_0_1) || ++ !(nfc->data & RXH_L4_B_2_3)) ++ return -EINVAL; ++ break; ++ case UDP_V4_FLOW: ++ if (!(nfc->data & RXH_IP_SRC) || ++ !(nfc->data & RXH_IP_DST)) ++ return -EINVAL; ++ switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { ++ case 0: ++ rss_flags &= ~RTL_8125_RSS_FLAG_HASH_UDP_IPV4; ++ break; ++ case (RXH_L4_B_0_1 | RXH_L4_B_2_3): ++ rss_flags |= RTL_8125_RSS_FLAG_HASH_UDP_IPV4; ++ break; ++ default: ++ return -EINVAL; ++ } ++ break; ++ case UDP_V6_FLOW: ++ if (!(nfc->data & RXH_IP_SRC) || ++ !(nfc->data & RXH_IP_DST)) ++ return -EINVAL; ++ switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { ++ case 0: ++ rss_flags &= ~RTL_8125_RSS_FLAG_HASH_UDP_IPV6; ++ break; ++ case (RXH_L4_B_0_1 | RXH_L4_B_2_3): ++ rss_flags |= RTL_8125_RSS_FLAG_HASH_UDP_IPV6; ++ break; ++ default: ++ return -EINVAL; ++ } ++ break; ++ case SCTP_V4_FLOW: ++ case AH_ESP_V4_FLOW: ++ case AH_V4_FLOW: ++ case ESP_V4_FLOW: ++ case SCTP_V6_FLOW: ++ case AH_ESP_V6_FLOW: ++ case AH_V6_FLOW: ++ case ESP_V6_FLOW: ++ case IP_USER_FLOW: ++ case ETHER_FLOW: ++ /* RSS is not supported for these protocols */ ++ if (nfc->data) { ++ netif_err(tp, drv, tp->dev, "Command parameters not supported\n"); ++ return -EINVAL; ++ } ++ return 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ /* if we changed something we need to update flags */ ++ if (rss_flags != tp->rss_flags) { ++ u32 rss_ctrl = RTL_R32(tp, RSS_CTRL_8125); ++ ++ if ((rss_flags & RTL8125_UDP_RSS_FLAGS) && ++ !(tp->rss_flags & RTL8125_UDP_RSS_FLAGS)) ++ netdev_warn(tp->dev, ++ "enabling UDP RSS: fragmented packets may " ++ "arrive out of order to the stack above\n"); ++ ++ tp->rss_flags = rss_flags; ++ ++ /* Perform hash on these packet types */ ++ rss_ctrl |= RSS_CTRL_TCP_IPV4_SUPP ++ | RSS_CTRL_IPV4_SUPP ++ | RSS_CTRL_IPV6_SUPP ++ | RSS_CTRL_IPV6_EXT_SUPP ++ | RSS_CTRL_TCP_IPV6_SUPP ++ | 
RSS_CTRL_TCP_IPV6_EXT_SUPP; ++ ++ rss_ctrl &= ~(RSS_CTRL_UDP_IPV4_SUPP | ++ RSS_CTRL_UDP_IPV6_SUPP | ++ RSS_CTRL_UDP_IPV6_EXT_SUPP); ++ ++ if (rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV4) ++ rss_ctrl |= RSS_CTRL_UDP_IPV4_SUPP; ++ ++ if (rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV6) ++ rss_ctrl |= RSS_CTRL_UDP_IPV6_SUPP | ++ RSS_CTRL_UDP_IPV6_EXT_SUPP; ++ ++ RTL_W32(tp, RSS_CTRL_8125, rss_ctrl); ++ } ++ ++ return 0; ++} ++ ++int rtl8125_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret = -EOPNOTSUPP; ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return ret; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_SRXFH: ++ ret = rtl8125_set_rss_hash_opt(tp, cmd); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static u32 _rtl8125_get_rxfh_key_size(struct rtl8125_private *tp) ++{ ++ return sizeof(tp->rss_key); ++} ++ ++u32 rtl8125_get_rxfh_key_size(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return 0; ++ ++ return _rtl8125_get_rxfh_key_size(tp); ++} ++ ++u32 rtl8125_rss_indir_size(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return 0; ++ ++ return rtl8125_rss_indir_tbl_entries(tp); ++} ++ ++static void rtl8125_get_reta(struct rtl8125_private *tp, u32 *indir) ++{ ++ int i, reta_size = rtl8125_rss_indir_tbl_entries(tp); ++ ++ for (i = 0; i < reta_size; i++) ++ indir[i] = tp->rss_indir_tbl[i]; ++} ++ ++static u32 rtl8125_rss_key_reg(struct rtl8125_private *tp) ++{ ++ return RSS_KEY_8125; ++} ++ ++static u32 rtl8125_rss_indir_tbl_reg(struct rtl8125_private *tp) ++{ ++ return RSS_INDIRECTION_TBL_8125_V2; ++} ++ ++static void rtl8125_store_reta(struct rtl8125_private *tp) ++{ ++ u16 indir_tbl_reg = rtl8125_rss_indir_tbl_reg(tp); ++ u32 i, reta_entries = rtl8125_rss_indir_tbl_entries(tp); ++ u32 reta = 0; ++ u8 *indir_tbl = tp->rss_indir_tbl; ++ ++ /* Write redirection table to HW */ ++ for (i = 0; i < reta_entries; i++) { ++ reta |= indir_tbl[i] << (i & 0x3) * 8; ++ if ((i & 3) == 3) { ++ RTL_W32(tp, indir_tbl_reg, reta); ++ ++ indir_tbl_reg += 4; ++ reta = 0; ++ } ++ } ++} ++ ++static void rtl8125_store_rss_key(struct rtl8125_private *tp) ++{ ++ const u16 rss_key_reg = rtl8125_rss_key_reg(tp); ++ u32 i, rss_key_size = _rtl8125_get_rxfh_key_size(tp); ++ u32 *rss_key = (u32*)tp->rss_key; ++ ++ /* Write redirection table to HW */ ++ for (i = 0; i < rss_key_size; i+=4) ++ RTL_W32(tp, rss_key_reg + i, *rss_key++); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,0) ++int rtl8125_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return -EOPNOTSUPP; ++ ++ rxfh->hfunc = ETH_RSS_HASH_TOP; ++ ++ if (rxfh->indir) ++ rtl8125_get_reta(tp, rxfh->indir); ++ ++ if (rxfh->key) ++ memcpy(rxfh->key, tp->rss_key, RTL8125_RSS_KEY_SIZE); ++ ++ return 0; ++} ++ ++int rtl8125_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, ++ struct netlink_ext_ack *extack) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ u32 reta_entries = rtl8125_rss_indir_tbl_entries(tp); ++ ++ /* We require at least one supported parameter to be changed and no ++ * change in any of the unsupported parameters ++ */ ++ if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && rxfh->hfunc != ETH_RSS_HASH_TOP) ++ return -EOPNOTSUPP; ++ ++ /* Fill out the redirection table 
*/ ++ if (rxfh->indir) { ++ int max_queues = tp->num_rx_rings; ++ ++ /* Verify user input. */ ++ for (i = 0; i < reta_entries; i++) ++ if (rxfh->indir[i] >= max_queues) ++ return -EINVAL; ++ ++ for (i = 0; i < reta_entries; i++) ++ tp->rss_indir_tbl[i] = rxfh->indir[i]; ++ } ++ ++ /* Fill out the rss hash key */ ++ if (rxfh->key) ++ memcpy(tp->rss_key, rxfh->key, RTL8125_RSS_KEY_SIZE); ++ ++ rtl8125_store_reta(tp); ++ ++ rtl8125_store_rss_key(tp); ++ ++ return 0; ++} ++#else ++int rtl8125_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, ++ u8 *hfunc) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return -EOPNOTSUPP; ++ ++ if (hfunc) ++ *hfunc = ETH_RSS_HASH_TOP; ++ ++ if (indir) ++ rtl8125_get_reta(tp, indir); ++ ++ if (key) ++ memcpy(key, tp->rss_key, RTL8125_RSS_KEY_SIZE); ++ ++ return 0; ++} ++ ++int rtl8125_set_rxfh(struct net_device *dev, const u32 *indir, ++ const u8 *key, const u8 hfunc) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ u32 reta_entries = rtl8125_rss_indir_tbl_entries(tp); ++ ++ /* We require at least one supported parameter to be changed and no ++ * change in any of the unsupported parameters ++ */ ++ if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) ++ return -EOPNOTSUPP; ++ ++ /* Fill out the redirection table */ ++ if (indir) { ++ int max_queues = tp->num_rx_rings; ++ ++ /* Verify user input. */ ++ for (i = 0; i < reta_entries; i++) ++ if (indir[i] >= max_queues) ++ return -EINVAL; ++ ++ for (i = 0; i < reta_entries; i++) ++ tp->rss_indir_tbl[i] = indir[i]; ++ } ++ ++ /* Fill out the rss hash key */ ++ if (key) ++ memcpy(tp->rss_key, key, RTL8125_RSS_KEY_SIZE); ++ ++ rtl8125_store_reta(tp); ++ ++ rtl8125_store_rss_key(tp); ++ ++ return 0; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,0) */ ++ ++static u32 rtl8125_get_rx_desc_hash(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ return le32_to_cpu(((struct RxDescV3 *)desc)->RxDescNormalDDWord2.RSSResult); ++ case RX_DESC_RING_TYPE_4: ++ return le32_to_cpu(((struct RxDescV4 *)desc)->RxDescNormalDDWord1.RSSResult); ++ default: ++ return 0; ++ } ++} ++ ++#define RXS_8125B_RSS_UDP BIT(9) ++#define RXS_8125_RSS_IPV4 BIT(10) ++#define RXS_8125_RSS_IPV6 BIT(12) ++#define RXS_8125_RSS_TCP BIT(13) ++#define RTL8125_RXS_RSS_L3_TYPE_MASK (RXS_8125_RSS_IPV4 | RXS_8125_RSS_IPV6) ++#define RTL8125_RXS_RSS_L4_TYPE_MASK (RXS_8125_RSS_TCP | RXS_8125B_RSS_UDP) ++ ++#define RXS_8125B_RSS_UDP_V4 BIT(27) ++#define RXS_8125_RSS_IPV4_V4 BIT(28) ++#define RXS_8125_RSS_IPV6_V4 BIT(29) ++#define RXS_8125_RSS_TCP_V4 BIT(30) ++#define RTL8125_RXS_RSS_L3_TYPE_MASK_V4 (RXS_8125_RSS_IPV4_V4 | RXS_8125_RSS_IPV6_V4) ++#define RTL8125_RXS_RSS_L4_TYPE_MASK_V4 (RXS_8125_RSS_TCP_V4 | RXS_8125B_RSS_UDP_V4) ++static void rtl8125_rx_hash_v3(struct rtl8125_private *tp, ++ struct RxDescV3 *descv3, ++ struct sk_buff *skb) ++{ ++ u16 rss_header_info; ++ ++ if (!(tp->dev->features & NETIF_F_RXHASH)) ++ return; ++ ++ rss_header_info = le16_to_cpu(descv3->RxDescNormalDDWord2.HeaderInfo); ++ ++ if (!(rss_header_info & RTL8125_RXS_RSS_L3_TYPE_MASK)) ++ return; ++ ++ skb_set_hash(skb, rtl8125_get_rx_desc_hash(tp, (struct RxDesc *)descv3), ++ (RTL8125_RXS_RSS_L4_TYPE_MASK & rss_header_info) ? 
++ PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3); ++} ++ ++static void rtl8125_rx_hash_v4(struct rtl8125_private *tp, ++ struct RxDescV4 *descv4, ++ struct sk_buff *skb) ++{ ++ u32 rss_header_info; ++ ++ if (!(tp->dev->features & NETIF_F_RXHASH)) ++ return; ++ ++ rss_header_info = le32_to_cpu(descv4->RxDescNormalDDWord1.RSSInfo); ++ ++ if (!(rss_header_info & RTL8125_RXS_RSS_L3_TYPE_MASK_V4)) ++ return; ++ ++ skb_set_hash(skb, rtl8125_get_rx_desc_hash(tp, (struct RxDesc *)descv4), ++ (RTL8125_RXS_RSS_L4_TYPE_MASK_V4 & rss_header_info) ? ++ PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3); ++} ++ ++void rtl8125_rx_hash(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ struct sk_buff *skb) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ rtl8125_rx_hash_v3(tp, (struct RxDescV3 *)desc, skb); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_rx_hash_v4(tp, (struct RxDescV4 *)desc, skb); ++ break; ++ default: ++ return; ++ } ++} ++ ++void rtl8125_disable_rss(struct rtl8125_private *tp) ++{ ++ RTL_W32(tp, RSS_CTRL_8125, 0x00); ++} ++ ++void _rtl8125_config_rss(struct rtl8125_private *tp) ++{ ++ _rtl8125_set_rss_hash_opt(tp); ++ ++ rtl8125_store_reta(tp); ++ ++ rtl8125_store_rss_key(tp); ++} ++ ++void rtl8125_config_rss(struct rtl8125_private *tp) ++{ ++ if (!tp->EnableRss) { ++ rtl8125_disable_rss(tp); ++ return; ++ } ++ ++ _rtl8125_config_rss(tp); ++} ++ ++void rtl8125_init_rss(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < rtl8125_rss_indir_tbl_entries(tp); i++) ++ tp->rss_indir_tbl[i] = ethtool_rxfh_indir_default(i, tp->num_rx_rings); ++ ++ netdev_rss_key_fill(tp->rss_key, RTL8125_RSS_KEY_SIZE); ++} +diff --git a/drivers/net/ethernet/realtek/r8125_rss.h b/drivers/net/ethernet/realtek/r8125_rss.h +new file mode 100755 +index 000000000000..d2ec5f06c3f1 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_rss.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#ifndef _LINUX_rtl8125_RSS_H ++#define _LINUX_rtl8125_RSS_H ++ ++#include ++#include ++ ++#define RTL8125_RSS_KEY_SIZE 40 /* size of RSS Hash Key in bytes */ ++#define RTL8125_MAX_INDIRECTION_TABLE_ENTRIES 128 ++ ++enum rtl8125_rss_flag { ++ RTL_8125_RSS_FLAG_HASH_UDP_IPV4 = (1 << 0), ++ RTL_8125_RSS_FLAG_HASH_UDP_IPV6 = (1 << 1), ++}; ++ ++struct rtl8125_private; ++struct RxDesc; ++ ++int rtl8125_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, ++ u32 *rule_locs); ++int rtl8125_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd); ++u32 rtl8125_get_rxfh_key_size(struct net_device *netdev); ++u32 rtl8125_rss_indir_size(struct net_device *netdev); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,0) ++int rtl8125_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh); ++int rtl8125_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, ++ struct netlink_ext_ack *extack); ++#else ++int rtl8125_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, ++ u8 *hfunc); ++int rtl8125_set_rxfh(struct net_device *netdev, const u32 *indir, ++ const u8 *key, const u8 hfunc); ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,0) */ ++void rtl8125_rx_hash(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ struct sk_buff *skb); ++void _rtl8125_config_rss(struct rtl8125_private *tp); ++void rtl8125_config_rss(struct rtl8125_private *tp); ++void rtl8125_init_rss(struct rtl8125_private *tp); ++u32 rtl8125_rss_indir_tbl_entries(struct rtl8125_private *tp); ++void rtl8125_disable_rss(struct rtl8125_private *tp); ++ ++#endif /* _LINUX_rtl8125_RSS_H */ +diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c +index 755083852eef..32648463d169 100644 +--- a/drivers/net/ethernet/realtek/r8169_main.c ++++ b/drivers/net/ethernet/realtek/r8169_main.c +@@ -215,7 +215,6 @@ static const struct pci_device_id rtl8169_pci_tbl[] = { + { PCI_VDEVICE(REALTEK, 0x8129) }, + { PCI_VDEVICE(REALTEK, 0x8136), RTL_CFG_NO_GBIT }, + { PCI_VDEVICE(REALTEK, 0x8161) }, +- { PCI_VDEVICE(REALTEK, 0x8162) }, + { PCI_VDEVICE(REALTEK, 0x8167) }, + { PCI_VDEVICE(REALTEK, 0x8168) }, + { PCI_VDEVICE(NCUBE, 0x8168) }, +@@ -226,10 +225,13 @@ static const struct pci_device_id rtl8169_pci_tbl[] = { + { PCI_VDEVICE(USR, 0x0116) }, + { PCI_VENDOR_ID_LINKSYS, 0x1032, PCI_ANY_ID, 0x0024 }, + { 0x0001, 0x8168, PCI_ANY_ID, 0x2410 }, +- { PCI_VDEVICE(REALTEK, 0x8125) }, + { PCI_VDEVICE(REALTEK, 0x8126) }, + { PCI_VDEVICE(REALTEK, 0x8127) }, ++#if !defined(CONFIG_R8125) && !defined(CONFIG_R8125_MODULE) ++ { PCI_VDEVICE(REALTEK, 0x8125) }, ++ { PCI_VDEVICE(REALTEK, 0x8162) }, + { PCI_VDEVICE(REALTEK, 0x3000) }, ++#endif /* !CONFIG_R8125 */ + { PCI_VDEVICE(REALTEK, 0x5000) }, + { PCI_VDEVICE(REALTEK, 0x0e10) }, + {} +diff --git a/drivers/net/ethernet/realtek/rtl_eeprom.c b/drivers/net/ethernet/realtek/rtl_eeprom.c +new file mode 100755 +index 000000000000..f1c2a1d12e3c +--- /dev/null ++++ b/drivers/net/ethernet/realtek/rtl_eeprom.c +@@ -0,0 +1,284 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. 
++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "r8125.h" ++#include "rtl_eeprom.h" ++ ++//------------------------------------------------------------------- ++//rtl8125_eeprom_type(): ++// tell the eeprom type ++//return value: ++// 0: the eeprom type is 93C46 ++// 1: the eeprom type is 93C56 or 93C66 ++//------------------------------------------------------------------- ++void rtl8125_eeprom_type(struct rtl8125_private *tp) ++{ ++ u16 magic = 0; ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ goto out_no_eeprom; ++ ++ if(RTL_R8(tp, 0xD2)&0x04) { ++ //not support ++ //tp->eeprom_type = EEPROM_TWSI; ++ //tp->eeprom_len = 256; ++ goto out_no_eeprom; ++ } else if(RTL_R32(tp, RxConfig) & RxCfg_9356SEL) { ++ tp->eeprom_type = EEPROM_TYPE_93C56; ++ tp->eeprom_len = 256; ++ } else { ++ tp->eeprom_type = EEPROM_TYPE_93C46; ++ tp->eeprom_len = 128; ++ } ++ ++ magic = rtl8125_eeprom_read_sc(tp, 0); ++ ++out_no_eeprom: ++ if ((magic != 0x8129) && (magic != 0x8128)) { ++ tp->eeprom_type = EEPROM_TYPE_NONE; ++ tp->eeprom_len = 0; ++ } ++} ++ ++void rtl8125_eeprom_cleanup(struct rtl8125_private *tp) ++{ ++ u8 x; ++ ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~(Cfg9346_EEDI | Cfg9346_EECS); ++ ++ RTL_W8(tp, Cfg9346, x); ++ ++ rtl8125_raise_clock(tp, &x); ++ rtl8125_lower_clock(tp, &x); ++} ++ ++static int rtl8125_eeprom_cmd_done(struct rtl8125_private *tp) ++{ ++ u8 x; ++ int i; ++ ++ rtl8125_stand_by(tp); ++ ++ for (i = 0; i < 50000; i++) { ++ x = RTL_R8(tp, Cfg9346); ++ ++ if (x & Cfg9346_EEDO) { ++ udelay(RTL_CLOCK_RATE * 2 * 3); ++ return 0; ++ } ++ udelay(1); ++ } ++ ++ return -1; ++} ++ ++//------------------------------------------------------------------- ++//rtl8125_eeprom_read_sc(): ++// read one word from eeprom ++//------------------------------------------------------------------- ++u16 rtl8125_eeprom_read_sc(struct rtl8125_private *tp, u16 reg) ++{ ++ int addr_sz = 6; ++ u8 x; ++ u16 data; ++ ++ if(tp->eeprom_type == EEPROM_TYPE_NONE) ++ return -1; ++ ++ if (tp->eeprom_type==EEPROM_TYPE_93C46) ++ addr_sz = 6; ++ else if (tp->eeprom_type==EEPROM_TYPE_93C56) ++ addr_sz = 8; ++ ++ x = Cfg9346_EEM1 | Cfg9346_EECS; ++ RTL_W8(tp, Cfg9346, x); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_READ_OPCODE, 3); ++ rtl8125_shift_out_bits(tp, reg, addr_sz); ++ ++ data = rtl8125_shift_in_bits(tp); ++ ++ rtl8125_eeprom_cleanup(tp); ++ ++ RTL_W8(tp, 
Cfg9346, 0); ++ ++ return data; ++} ++ ++//------------------------------------------------------------------- ++//rtl8125_eeprom_write_sc(): ++// write one word to a specific address in the eeprom ++//------------------------------------------------------------------- ++void rtl8125_eeprom_write_sc(struct rtl8125_private *tp, u16 reg, u16 data) ++{ ++ u8 x; ++ int addr_sz = 6; ++ int w_dummy_addr = 4; ++ ++ if(tp->eeprom_type == EEPROM_TYPE_NONE) ++ return; ++ ++ if (tp->eeprom_type==EEPROM_TYPE_93C46) { ++ addr_sz = 6; ++ w_dummy_addr = 4; ++ } else if (tp->eeprom_type==EEPROM_TYPE_93C56) { ++ addr_sz = 8; ++ w_dummy_addr = 6; ++ } ++ ++ x = Cfg9346_EEM1 | Cfg9346_EECS; ++ RTL_W8(tp, Cfg9346, x); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_EWEN_OPCODE, 5); ++ rtl8125_shift_out_bits(tp, reg, w_dummy_addr); ++ rtl8125_stand_by(tp); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_ERASE_OPCODE, 3); ++ rtl8125_shift_out_bits(tp, reg, addr_sz); ++ if (rtl8125_eeprom_cmd_done(tp) < 0) ++ return; ++ rtl8125_stand_by(tp); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_WRITE_OPCODE, 3); ++ rtl8125_shift_out_bits(tp, reg, addr_sz); ++ rtl8125_shift_out_bits(tp, data, 16); ++ if (rtl8125_eeprom_cmd_done(tp) < 0) ++ return; ++ rtl8125_stand_by(tp); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_EWDS_OPCODE, 5); ++ rtl8125_shift_out_bits(tp, reg, w_dummy_addr); ++ ++ rtl8125_eeprom_cleanup(tp); ++ RTL_W8(tp, Cfg9346, 0); ++} ++ ++void rtl8125_raise_clock(struct rtl8125_private *tp, u8 *x) ++{ ++ *x = *x | Cfg9346_EESK; ++ RTL_W8(tp, Cfg9346, *x); ++ udelay(RTL_CLOCK_RATE); ++} ++ ++void rtl8125_lower_clock(struct rtl8125_private *tp, u8 *x) ++{ ++ *x = *x & ~Cfg9346_EESK; ++ RTL_W8(tp, Cfg9346, *x); ++ udelay(RTL_CLOCK_RATE); ++} ++ ++void rtl8125_shift_out_bits(struct rtl8125_private *tp, int data, int count) ++{ ++ u8 x; ++ int mask; ++ ++ mask = 0x01 << (count - 1); ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~(Cfg9346_EEDI | Cfg9346_EEDO); ++ ++ do { ++ if (data & mask) ++ x |= Cfg9346_EEDI; ++ else ++ x &= ~Cfg9346_EEDI; ++ ++ RTL_W8(tp, Cfg9346, x); ++ udelay(RTL_CLOCK_RATE); ++ rtl8125_raise_clock(tp, &x); ++ rtl8125_lower_clock(tp, &x); ++ mask = mask >> 1; ++ } while(mask); ++ ++ x &= ~Cfg9346_EEDI; ++ RTL_W8(tp, Cfg9346, x); ++} ++ ++u16 rtl8125_shift_in_bits(struct rtl8125_private *tp) ++{ ++ u8 x; ++ u16 d, i; ++ ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~(Cfg9346_EEDI | Cfg9346_EEDO); ++ ++ d = 0; ++ ++ for (i = 0; i < 16; i++) { ++ d = d << 1; ++ rtl8125_raise_clock(tp, &x); ++ ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~Cfg9346_EEDI; ++ ++ if (x & Cfg9346_EEDO) ++ d |= 1; ++ ++ rtl8125_lower_clock(tp, &x); ++ } ++ ++ return d; ++} ++ ++void rtl8125_stand_by(struct rtl8125_private *tp) ++{ ++ u8 x; ++ ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~(Cfg9346_EECS | Cfg9346_EESK); ++ RTL_W8(tp, Cfg9346, x); ++ udelay(RTL_CLOCK_RATE); ++ ++ x |= Cfg9346_EECS; ++ RTL_W8(tp, Cfg9346, x); ++} ++ ++void rtl8125_set_eeprom_sel_low(struct rtl8125_private *tp) ++{ ++ RTL_W8(tp, Cfg9346, Cfg9346_EEM1); ++ RTL_W8(tp, Cfg9346, Cfg9346_EEM1 | Cfg9346_EESK); ++ ++ udelay(20); ++ ++ RTL_W8(tp, Cfg9346, Cfg9346_EEM1); ++} +diff --git a/drivers/net/ethernet/realtek/rtl_eeprom.h b/drivers/net/ethernet/realtek/rtl_eeprom.h +new file mode 100755 +index 000000000000..7c154f2f4b48 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/rtl_eeprom.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for 
Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++//EEPROM opcodes ++#define RTL_EEPROM_READ_OPCODE 06 ++#define RTL_EEPROM_WRITE_OPCODE 05 ++#define RTL_EEPROM_ERASE_OPCODE 07 ++#define RTL_EEPROM_EWEN_OPCODE 19 ++#define RTL_EEPROM_EWDS_OPCODE 16 ++ ++#define RTL_CLOCK_RATE 3 ++ ++void rtl8125_eeprom_type(struct rtl8125_private *tp); ++void rtl8125_eeprom_cleanup(struct rtl8125_private *tp); ++u16 rtl8125_eeprom_read_sc(struct rtl8125_private *tp, u16 reg); ++void rtl8125_eeprom_write_sc(struct rtl8125_private *tp, u16 reg, u16 data); ++void rtl8125_shift_out_bits(struct rtl8125_private *tp, int data, int count); ++u16 rtl8125_shift_in_bits(struct rtl8125_private *tp); ++void rtl8125_raise_clock(struct rtl8125_private *tp, u8 *x); ++void rtl8125_lower_clock(struct rtl8125_private *tp, u8 *x); ++void rtl8125_stand_by(struct rtl8125_private *tp); ++void rtl8125_set_eeprom_sel_low(struct rtl8125_private *tp); +diff --git a/drivers/net/ethernet/realtek/rtltool.c b/drivers/net/ethernet/realtek/rtltool.c +new file mode 100755 +index 000000000000..8dd79e1800a5 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/rtltool.c +@@ -0,0 +1,312 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 
2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "r8125.h" ++#include "rtl_eeprom.h" ++#include "rtltool.h" ++ ++int rtl8125_tool_ioctl(struct rtl8125_private *tp, struct ifreq *ifr) ++{ ++ struct rtltool_cmd my_cmd; ++ unsigned long flags; ++ int ret; ++ ++ if (copy_from_user(&my_cmd, ifr->ifr_data, sizeof(my_cmd))) ++ return -EFAULT; ++ ++ ret = 0; ++ switch (my_cmd.cmd) { ++ case RTLTOOL_READ_MAC: ++ if ((my_cmd.offset + my_cmd.len) > pci_resource_len(tp->pci_dev, 2)) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (my_cmd.len==1) ++ my_cmd.data = readb(tp->mmio_addr+my_cmd.offset); ++ else if (my_cmd.len==2) ++ my_cmd.data = readw(tp->mmio_addr+(my_cmd.offset&~1)); ++ else if (my_cmd.len==4) ++ my_cmd.data = readl(tp->mmio_addr+(my_cmd.offset&~3)); ++ else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ break; ++ ++ case RTLTOOL_WRITE_MAC: ++ if ((my_cmd.offset + my_cmd.len) > pci_resource_len(tp->pci_dev, 2)) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (my_cmd.len==1) ++ writeb(my_cmd.data, tp->mmio_addr+my_cmd.offset); ++ else if (my_cmd.len==2) ++ writew(my_cmd.data, tp->mmio_addr+(my_cmd.offset&~1)); ++ else if (my_cmd.len==4) ++ writel(my_cmd.data, tp->mmio_addr+(my_cmd.offset&~3)); ++ else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_READ_PHY: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ my_cmd.data = rtl8125_mdio_prot_read(tp, my_cmd.offset); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_WRITE_PHY: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ rtl8125_mdio_prot_write(tp, my_cmd.offset, my_cmd.data); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ break; ++ ++ case RTLTOOL_READ_EPHY: ++ my_cmd.data = rtl8125_ephy_read(tp, my_cmd.offset); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_WRITE_EPHY: ++ rtl8125_ephy_write(tp, my_cmd.offset, my_cmd.data); ++ break; ++ ++ case RTLTOOL_READ_ERI: ++ my_cmd.data = 0; ++ if (my_cmd.len==1 || my_cmd.len==2 || my_cmd.len==4) { ++ my_cmd.data = rtl8125_eri_read(tp, my_cmd.offset, my_cmd.len, ERIAR_ExGMAC); ++ } else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_WRITE_ERI: ++ if (my_cmd.len==1 || my_cmd.len==2 || my_cmd.len==4) { ++ rtl8125_eri_write(tp, my_cmd.offset, my_cmd.len, my_cmd.data, ERIAR_ExGMAC); ++ } else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ break; ++ ++ case RTLTOOL_READ_PCI: ++ my_cmd.data = 0; ++ if (my_cmd.len==1) ++ pci_read_config_byte(tp->pci_dev, my_cmd.offset, ++ (u8 *)&my_cmd.data); ++ else if (my_cmd.len==2) ++ pci_read_config_word(tp->pci_dev, my_cmd.offset, ++ (u16 *)&my_cmd.data); ++ else if (my_cmd.len==4) ++ pci_read_config_dword(tp->pci_dev, my_cmd.offset, ++ &my_cmd.data); ++ 
else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ break; ++ ++ case RTLTOOL_WRITE_PCI: ++ if (my_cmd.len==1) ++ pci_write_config_byte(tp->pci_dev, my_cmd.offset, ++ my_cmd.data); ++ else if (my_cmd.len==2) ++ pci_write_config_word(tp->pci_dev, my_cmd.offset, ++ my_cmd.data); ++ else if (my_cmd.len==4) ++ pci_write_config_dword(tp->pci_dev, my_cmd.offset, ++ my_cmd.data); ++ else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_READ_EEPROM: ++ my_cmd.data = rtl8125_eeprom_read_sc(tp, my_cmd.offset); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_WRITE_EEPROM: ++ rtl8125_eeprom_write_sc(tp, my_cmd.offset, my_cmd.data); ++ break; ++ ++ case RTL_READ_OOB_MAC: ++ rtl8125_oob_mutex_lock(tp); ++ my_cmd.data = rtl8125_ocp_read(tp, my_cmd.offset, 4); ++ rtl8125_oob_mutex_unlock(tp); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ break; ++ ++ case RTL_WRITE_OOB_MAC: ++ if (my_cmd.len == 0 || my_cmd.len > 4) ++ return -EOPNOTSUPP; ++ ++ rtl8125_oob_mutex_lock(tp); ++ rtl8125_ocp_write(tp, my_cmd.offset, my_cmd.len, my_cmd.data); ++ rtl8125_oob_mutex_unlock(tp); ++ break; ++ ++ case RTL_ENABLE_PCI_DIAG: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ tp->rtk_enable_diag = 1; ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ dprintk("enable rtk diag\n"); ++ break; ++ ++ case RTL_DISABLE_PCI_DIAG: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ tp->rtk_enable_diag = 0; ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ dprintk("disable rtk diag\n"); ++ break; ++ ++ case RTL_READ_MAC_OCP: ++ if (my_cmd.offset % 2) ++ return -EOPNOTSUPP; ++ ++ my_cmd.data = rtl8125_mac_ocp_read(tp, my_cmd.offset); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ break; ++ ++ case RTL_WRITE_MAC_OCP: ++ if ((my_cmd.offset % 2) || (my_cmd.len != 2)) ++ return -EOPNOTSUPP; ++ ++ rtl8125_mac_ocp_write(tp, my_cmd.offset, (u16)my_cmd.data); ++ break; ++ ++ case RTL_DIRECT_READ_PHY_OCP: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ my_cmd.data = rtl8125_mdio_prot_direct_read_phy_ocp(tp, my_cmd.offset); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTL_DIRECT_WRITE_PHY_OCP: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ rtl8125_mdio_prot_direct_write_phy_ocp(tp, my_cmd.offset, my_cmd.data); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ break; ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ case RTL_READ_FIBER_PHY: ++ if (!HW_FIBER_STATUS_CONNECTED(tp)) { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ my_cmd.data = rtl8125_fiber_mdio_read(tp, my_cmd.offset); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTL_WRITE_FIBER_PHY: ++ if (!HW_FIBER_STATUS_CONNECTED(tp)) { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ rtl8125_fiber_mdio_write(tp, my_cmd.offset, my_cmd.data); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ break; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ return ret; ++} +diff --git a/drivers/net/ethernet/realtek/rtltool.h b/drivers/net/ethernet/realtek/rtltool.h +new file mode 100755 +index 
000000000000..940be4fe7606 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/rtltool.h +@@ -0,0 +1,89 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_RTLTOOL_H ++#define _LINUX_RTLTOOL_H ++ ++#define SIOCRTLTOOL SIOCDEVPRIVATE+1 ++ ++enum rtl_cmd { ++ RTLTOOL_READ_MAC=0, ++ RTLTOOL_WRITE_MAC, ++ RTLTOOL_READ_PHY, ++ RTLTOOL_WRITE_PHY, ++ RTLTOOL_READ_EPHY, ++ RTLTOOL_WRITE_EPHY, ++ RTLTOOL_READ_ERI, ++ RTLTOOL_WRITE_ERI, ++ RTLTOOL_READ_PCI, ++ RTLTOOL_WRITE_PCI, ++ RTLTOOL_READ_EEPROM, ++ RTLTOOL_WRITE_EEPROM, ++ ++ RTL_READ_OOB_MAC, ++ RTL_WRITE_OOB_MAC, ++ ++ RTL_ENABLE_PCI_DIAG, ++ RTL_DISABLE_PCI_DIAG, ++ ++ RTL_READ_MAC_OCP, ++ RTL_WRITE_MAC_OCP, ++ ++ RTL_DIRECT_READ_PHY_OCP, ++ RTL_DIRECT_WRITE_PHY_OCP, ++ ++ RTL_READ_FIBER_PHY, ++ RTL_WRITE_FIBER_PHY, ++ ++ RTLTOOL_INVALID ++}; ++ ++struct rtltool_cmd { ++ __u32 cmd; ++ __u32 offset; ++ __u32 len; ++ __u32 data; ++}; ++ ++enum mode_access { ++ MODE_NONE=0, ++ MODE_READ, ++ MODE_WRITE ++}; ++ ++#ifdef __KERNEL__ ++int rtl8125_tool_ioctl(struct rtl8125_private *tp, struct ifreq *ifr); ++#endif ++ ++#endif /* _LINUX_RTLTOOL_H */ +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.19/0007-vesa-dsc-bpp.patch b/sys-kernel/gentoo-sources-6.19/0007-vesa-dsc-bpp.patch new file mode 100644 index 0000000..2a11388 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0007-vesa-dsc-bpp.patch @@ -0,0 +1,392 @@ +From 6f7b751921f791358d7c89c6a0ffe66914ae8d0d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 27 Feb 2026 09:09:41 +0100 +Subject: [PATCH 7/8] vesa-dsc-bpp + +Signed-off-by: Peter Jung +--- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 16 +++ + .../drm/amd/display/dc/dml/dsc/qp_tables.h | 4 +- + .../drm/amd/display/dc/dml/dsc/rc_calc_fpu.c | 2 +- + drivers/gpu/drm/drm_displayid_internal.h | 11 ++ + drivers/gpu/drm/drm_edid.c | 102 +++++++++++------- + include/drm/drm_connector.h | 6 ++ + include/drm/drm_modes.h | 10 ++ + 7 files changed, 112 insertions(+), 39 deletions(-) + +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index 
bc9aca604aa0..47583196cfa8 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -6779,6 +6779,11 @@ static void fill_stream_properties_from_drm_display_mode( + + stream->output_color_space = get_output_color_space(timing_out, connector_state); + stream->content_type = get_output_content_type(connector_state); ++ ++ /* DisplayID Type VII pass-through timings. */ ++ if (mode_in->dsc_passthrough_timings_support && info->dp_dsc_bpp_x16 != 0) { ++ stream->timing.dsc_fixed_bits_per_pixel_x16 = info->dp_dsc_bpp_x16; ++ } + } + + static void fill_audio_info(struct audio_info *audio_info, +@@ -7237,6 +7242,7 @@ create_stream_for_sink(struct drm_connector *connector, + struct drm_display_mode mode; + struct drm_display_mode saved_mode; + struct drm_display_mode *freesync_mode = NULL; ++ struct drm_display_mode *dsc_passthru_mode = NULL; + bool native_mode_found = false; + bool recalculate_timing = false; + bool scale = dm_state->scaling != RMX_OFF; +@@ -7328,6 +7334,16 @@ create_stream_for_sink(struct drm_connector *connector, + } + } + ++ list_for_each_entry(dsc_passthru_mode, &connector->modes, head) { ++ if (dsc_passthru_mode->hdisplay == mode.hdisplay && ++ dsc_passthru_mode->vdisplay == mode.vdisplay && ++ drm_mode_vrefresh(dsc_passthru_mode) == mode_refresh) { ++ mode.dsc_passthrough_timings_support = ++ dsc_passthru_mode->dsc_passthrough_timings_support; ++ break; ++ } ++ } ++ + if (recalculate_timing) + drm_mode_set_crtcinfo(&saved_mode, 0); + +diff --git a/drivers/gpu/drm/amd/display/dc/dml/dsc/qp_tables.h b/drivers/gpu/drm/amd/display/dc/dml/dsc/qp_tables.h +index dcff0dd2b6a1..622abb69ea00 100644 +--- a/drivers/gpu/drm/amd/display/dc/dml/dsc/qp_tables.h ++++ b/drivers/gpu/drm/amd/display/dc/dml/dsc/qp_tables.h +@@ -63,7 +63,7 @@ static const qp_table qp_table_444_8bpc_max = { + { 6.5, { 4, 6, 7, 8, 8, 8, 9, 10, 11, 11, 12, 12, 12, 13, 15} }, + { 7, { 4, 5, 7, 7, 8, 8, 8, 9, 10, 11, 11, 12, 12, 13, 14} }, + { 7.5, { 4, 5, 6, 7, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 14} }, +- { 8, { 4, 4, 5, 6, 7, 7, 7, 8, 9, 10, 10, 11, 11, 12, 13} }, ++ { 8, { 4, 4, 5, 6, 7, 7, 7, 8, 9, 10, 11, 12, 13, 13, 15} }, + { 8.5, { 4, 4, 5, 6, 7, 7, 7, 8, 9, 10, 10, 11, 11, 12, 13} }, + { 9, { 3, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 10, 11, 11, 13} }, + { 9.5, { 3, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 10, 11, 11, 13} }, +@@ -211,7 +211,7 @@ static const qp_table qp_table_444_8bpc_min = { + { 6.5, { 0, 1, 2, 3, 4, 4, 5, 5, 5, 5, 6, 6, 6, 9, 14} }, + { 7, { 0, 0, 2, 2, 4, 4, 4, 4, 4, 5, 5, 6, 6, 9, 13} }, + { 7.5, { 0, 0, 2, 2, 3, 4, 4, 4, 4, 4, 5, 5, 6, 9, 13} }, +- { 8, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 4, 5, 5, 5, 8, 12} }, ++ { 8, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 3, 5, 5, 5, 7, 13} }, + { 8.5, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 4, 5, 5, 5, 8, 12} }, + { 9, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 3, 5, 5, 5, 7, 12} }, + { 9.5, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 3, 5, 5, 5, 7, 12} }, +diff --git a/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c +index ef75eb7d5adc..8804419871d0 100644 +--- a/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c ++++ b/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c +@@ -123,7 +123,7 @@ static void get_ofs_set(qp_set ofs, enum colour_mode mode, float bpp) + *p++ = (bpp <= 12) ? (-8) : ((bpp >= 15) ? (-6) : (-8 + dsc_roundf((bpp - 12) * (2 / 3.0)))); + *p++ = (bpp <= 12) ? (-10) : ((bpp >= 15) ? (-8) : (-10 + dsc_roundf((bpp - 12) * (2 / 3.0)))); + *p++ = -10; +- *p++ = (bpp <= 6) ? 
(-12) : ((bpp >= 8) ? (-10) : (-12 + dsc_roundf((bpp - 6) * (2 / 2.0)))); ++ *p++ = (bpp <= 6) ? (-12) : ((bpp >= 8) ? (-12) : (-12 + dsc_roundf((bpp - 6) * (2 / 2.0)))); + *p++ = -12; + *p++ = -12; + *p++ = -12; +diff --git a/drivers/gpu/drm/drm_displayid_internal.h b/drivers/gpu/drm/drm_displayid_internal.h +index 5b1b32f73516..8f1a2f33ca1a 100644 +--- a/drivers/gpu/drm/drm_displayid_internal.h ++++ b/drivers/gpu/drm/drm_displayid_internal.h +@@ -97,6 +97,7 @@ struct displayid_header { + u8 ext_count; + } __packed; + ++#define DISPLAYID_BLOCK_REV GENMASK(2, 0) + struct displayid_block { + u8 tag; + u8 rev; +@@ -125,6 +126,7 @@ struct displayid_detailed_timings_1 { + __le16 vsw; + } __packed; + ++#define DISPLAYID_BLOCK_PASSTHROUGH_TIMINGS_SUPPORT BIT(3) + struct displayid_detailed_timing_block { + struct displayid_block base; + struct displayid_detailed_timings_1 timings[]; +@@ -137,19 +139,28 @@ struct displayid_formula_timings_9 { + u8 vrefresh; + } __packed; + ++#define DISPLAYID_BLOCK_DESCRIPTOR_PAYLOAD_BYTES GENMASK(6, 4) + struct displayid_formula_timing_block { + struct displayid_block base; + struct displayid_formula_timings_9 timings[]; + } __packed; + ++#define DISPLAYID_VESA_DP_TYPE GENMASK(2, 0) + #define DISPLAYID_VESA_MSO_OVERLAP GENMASK(3, 0) + #define DISPLAYID_VESA_MSO_MODE GENMASK(6, 5) ++#define DISPLAYID_VESA_DSC_BPP_INT GENMASK(5, 0) ++#define DISPLAYID_VESA_DSC_BPP_FRACT GENMASK(3, 0) ++ ++#define DISPLAYID_VESA_DP_TYPE_EDP 0 ++#define DISPLAYID_VESA_DP_TYPE_DP 1 + + struct displayid_vesa_vendor_specific_block { + struct displayid_block base; + u8 oui[3]; + u8 data_structure_type; + u8 mso; ++ u8 dsc_bpp_int; ++ u8 dsc_bpp_fract; + } __packed; + + /* +diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c +index 056eff8cbd1a..26d53a548a27 100644 +--- a/drivers/gpu/drm/drm_edid.c ++++ b/drivers/gpu/drm/drm_edid.c +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + #include + + #include "drm_crtc_internal.h" +@@ -6566,12 +6567,13 @@ static void drm_get_monitor_range(struct drm_connector *connector, + info->monitor_range.min_vfreq, info->monitor_range.max_vfreq); + } + +-static void drm_parse_vesa_mso_data(struct drm_connector *connector, +- const struct displayid_block *block) ++static void drm_parse_vesa_specific_block(struct drm_connector *connector, ++ const struct displayid_block *block) + { + struct displayid_vesa_vendor_specific_block *vesa = + (struct displayid_vesa_vendor_specific_block *)block; + struct drm_display_info *info = &connector->display_info; ++ int dp_type; + + if (block->num_bytes < 3) { + drm_dbg_kms(connector->dev, +@@ -6583,51 +6585,73 @@ static void drm_parse_vesa_mso_data(struct drm_connector *connector, + if (oui(vesa->oui[0], vesa->oui[1], vesa->oui[2]) != VESA_IEEE_OUI) + return; + +- if (sizeof(*vesa) != sizeof(*block) + block->num_bytes) { ++ if (block->num_bytes < 5) { + drm_dbg_kms(connector->dev, + "[CONNECTOR:%d:%s] Unexpected VESA vendor block size\n", + connector->base.id, connector->name); + return; + } + +- switch (FIELD_GET(DISPLAYID_VESA_MSO_MODE, vesa->mso)) { +- default: +- drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Reserved MSO mode value\n", ++ dp_type = FIELD_GET(DISPLAYID_VESA_DP_TYPE, vesa->data_structure_type); ++ if (dp_type > 1) { ++ drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Reserved dp type value\n", + connector->base.id, connector->name); +- fallthrough; +- case 0: +- info->mso_stream_count = 0; +- break; +- case 1: +- info->mso_stream_count = 2; /* 2 or 4 links */ +- break; +- 
case 2: +- info->mso_stream_count = 4; /* 4 links */ +- break; + } + +- if (!info->mso_stream_count) { ++ /* MSO is only supported for eDP */ ++ if (dp_type == DISPLAYID_VESA_DP_TYPE_EDP) { ++ switch (FIELD_GET(DISPLAYID_VESA_MSO_MODE, vesa->mso)) { ++ default: ++ drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Reserved MSO mode value\n", ++ connector->base.id, connector->name); ++ fallthrough; ++ case 0: ++ info->mso_stream_count = 0; ++ break; ++ case 1: ++ info->mso_stream_count = 2; /* 2 or 4 links */ ++ break; ++ case 2: ++ info->mso_stream_count = 4; /* 4 links */ ++ break; ++ } ++ } ++ ++ if (info->mso_stream_count) { ++ info->mso_pixel_overlap = FIELD_GET(DISPLAYID_VESA_MSO_OVERLAP, vesa->mso); ++ if (info->mso_pixel_overlap > 8) { ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] Reserved MSO pixel overlap value %u\n", ++ connector->base.id, connector->name, ++ info->mso_pixel_overlap); ++ info->mso_pixel_overlap = 8; ++ } ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] MSO stream count %u, pixel overlap %u\n", ++ connector->base.id, connector->name, ++ info->mso_stream_count, info->mso_pixel_overlap); ++ } else { + info->mso_pixel_overlap = 0; ++ } ++ ++ if (block->num_bytes < 7) { ++ /* DSC bpp is optional */ + return; + } + +- info->mso_pixel_overlap = FIELD_GET(DISPLAYID_VESA_MSO_OVERLAP, vesa->mso); +- if (info->mso_pixel_overlap > 8) { ++ info->dp_dsc_bpp_x16 = FIELD_GET(DISPLAYID_VESA_DSC_BPP_INT, vesa->dsc_bpp_int) << 4 | ++ FIELD_GET(DISPLAYID_VESA_DSC_BPP_FRACT, vesa->dsc_bpp_fract); ++ ++ if (info->dp_dsc_bpp_x16 > 0) { + drm_dbg_kms(connector->dev, +- "[CONNECTOR:%d:%s] Reserved MSO pixel overlap value %u\n", ++ "[CONNECTOR:%d:%s] DSC bits per pixel " FXP_Q4_FMT "\n", + connector->base.id, connector->name, +- info->mso_pixel_overlap); +- info->mso_pixel_overlap = 8; ++ FXP_Q4_ARGS(info->dp_dsc_bpp_x16)); + } +- +- drm_dbg_kms(connector->dev, +- "[CONNECTOR:%d:%s] MSO stream count %u, pixel overlap %u\n", +- connector->base.id, connector->name, +- info->mso_stream_count, info->mso_pixel_overlap); + } + +-static void drm_update_mso(struct drm_connector *connector, +- const struct drm_edid *drm_edid) ++static void drm_update_vesa_specific_block(struct drm_connector *connector, ++ const struct drm_edid *drm_edid) + { + const struct displayid_block *block; + struct displayid_iter iter; +@@ -6635,7 +6659,7 @@ static void drm_update_mso(struct drm_connector *connector, + displayid_iter_edid_begin(drm_edid, &iter); + displayid_iter_for_each(block, &iter) { + if (block->tag == DATA_BLOCK_2_VENDOR_SPECIFIC) +- drm_parse_vesa_mso_data(connector, block); ++ drm_parse_vesa_specific_block(connector, block); + } + displayid_iter_end(&iter); + } +@@ -6672,6 +6696,7 @@ static void drm_reset_display_info(struct drm_connector *connector) + info->mso_stream_count = 0; + info->mso_pixel_overlap = 0; + info->max_dsc_bpp = 0; ++ info->dp_dsc_bpp_x16 = 0; + + kfree(info->vics); + info->vics = NULL; +@@ -6795,7 +6820,7 @@ static void update_display_info(struct drm_connector *connector, + if (edid->features & DRM_EDID_FEATURE_RGB_YCRCB422) + info->color_formats |= DRM_COLOR_FORMAT_YCBCR422; + +- drm_update_mso(connector, drm_edid); ++ drm_update_vesa_specific_block(connector, drm_edid); + + out: + if (drm_edid_has_internal_quirk(connector, EDID_QUIRK_NON_DESKTOP)) { +@@ -6825,8 +6850,8 @@ static void update_display_info(struct drm_connector *connector, + } + + static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *dev, +- const struct displayid_detailed_timings_1 
*timings, +- bool type_7) ++ const struct displayid_block *block, ++ const struct displayid_detailed_timings_1 *timings) + { + struct drm_display_mode *mode; + unsigned int pixel_clock = (timings->pixel_clock[0] | +@@ -6842,11 +6867,16 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d + unsigned int vsync_width = le16_to_cpu(timings->vsw) + 1; + bool hsync_positive = le16_to_cpu(timings->hsync) & (1 << 15); + bool vsync_positive = le16_to_cpu(timings->vsync) & (1 << 15); ++ bool type_7 = block->tag == DATA_BLOCK_2_TYPE_7_DETAILED_TIMING; + + mode = drm_mode_create(dev); + if (!mode) + return NULL; + ++ if (type_7 && FIELD_GET(DISPLAYID_BLOCK_REV, block->rev) >= 1) ++ mode->dsc_passthrough_timings_support = ++ block->rev & DISPLAYID_BLOCK_PASSTHROUGH_TIMINGS_SUPPORT; ++ + /* resolution is kHz for type VII, and 10 kHz for type I */ + mode->clock = type_7 ? pixel_clock : pixel_clock * 10; + mode->hdisplay = hactive; +@@ -6879,7 +6909,6 @@ static int add_displayid_detailed_1_modes(struct drm_connector *connector, + int num_timings; + struct drm_display_mode *newmode; + int num_modes = 0; +- bool type_7 = block->tag == DATA_BLOCK_2_TYPE_7_DETAILED_TIMING; + /* blocks must be multiple of 20 bytes length */ + if (block->num_bytes % 20) + return 0; +@@ -6888,7 +6917,7 @@ static int add_displayid_detailed_1_modes(struct drm_connector *connector, + for (i = 0; i < num_timings; i++) { + struct displayid_detailed_timings_1 *timings = &det->timings[i]; + +- newmode = drm_mode_displayid_detailed(connector->dev, timings, type_7); ++ newmode = drm_mode_displayid_detailed(connector->dev, block, timings); + if (!newmode) + continue; + +@@ -6935,7 +6964,8 @@ static int add_displayid_formula_modes(struct drm_connector *connector, + struct drm_display_mode *newmode; + int num_modes = 0; + bool type_10 = block->tag == DATA_BLOCK_2_TYPE_10_FORMULA_TIMING; +- int timing_size = 6 + ((formula_block->base.rev & 0x70) >> 4); ++ int timing_size = 6 + ++ FIELD_GET(DISPLAYID_BLOCK_DESCRIPTOR_PAYLOAD_BYTES, formula_block->base.rev); + + /* extended blocks are not supported yet */ + if (timing_size != 6) +diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h +index fa4abfe8971e..a0f86e48192e 100644 +--- a/include/drm/drm_connector.h ++++ b/include/drm/drm_connector.h +@@ -890,6 +890,12 @@ struct drm_display_info { + */ + u32 max_dsc_bpp; + ++ /** ++ * @dp_dsc_bpp: DP Display-Stream-Compression (DSC) timing's target ++ * DSC bits per pixel in 6.4 fixed point format. 0 means undefined. ++ */ ++ u16 dp_dsc_bpp_x16; ++ + /** + * @vics: Array of vics_len VICs. Internal to EDID parsing. + */ +diff --git a/include/drm/drm_modes.h b/include/drm/drm_modes.h +index b9bb92e4b029..312e5c03af9a 100644 +--- a/include/drm/drm_modes.h ++++ b/include/drm/drm_modes.h +@@ -417,6 +417,16 @@ struct drm_display_mode { + */ + enum hdmi_picture_aspect picture_aspect_ratio; + ++ /** ++ * @dsc_passthrough_timing_support: ++ * ++ * Indicates whether this mode timing descriptor is supported ++ * with specific target DSC bits per pixel only. ++ * ++ * VESA vendor-specific data block shall exist with the relevant ++ * DSC bits per pixel declaration when this flag is set to true. 
++ */ ++ bool dsc_passthrough_timings_support; + }; + + /** +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.19/0008-vmscape.patch b/sys-kernel/gentoo-sources-6.19/0008-vmscape.patch new file mode 100644 index 0000000..05282b3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0008-vmscape.patch @@ -0,0 +1,366 @@ +From 5692ec66ac9431cee8522a866cd4b80fdff4ca54 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 27 Feb 2026 09:09:52 +0100 +Subject: [PATCH 8/8] vmscape + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/hw-vuln/vmscape.rst | 8 +++ + .../admin-guide/kernel-parameters.txt | 4 +- + arch/x86/Kconfig | 1 + + arch/x86/entry/entry_64.S | 13 +++- + arch/x86/include/asm/cpufeatures.h | 2 +- + arch/x86/include/asm/entry-common.h | 9 ++- + arch/x86/include/asm/nospec-branch.h | 11 +++- + arch/x86/kernel/cpu/bugs.c | 65 +++++++++++++++---- + arch/x86/kvm/x86.c | 4 +- + arch/x86/net/bpf_jit_comp.c | 2 + + 10 files changed, 90 insertions(+), 29 deletions(-) + +diff --git a/Documentation/admin-guide/hw-vuln/vmscape.rst b/Documentation/admin-guide/hw-vuln/vmscape.rst +index d9b9a2b6c114..580f288ae8bf 100644 +--- a/Documentation/admin-guide/hw-vuln/vmscape.rst ++++ b/Documentation/admin-guide/hw-vuln/vmscape.rst +@@ -86,6 +86,10 @@ The possible values in this file are: + run a potentially malicious guest and issues an IBPB before the first + exit to userspace after VM-exit. + ++ * 'Mitigation: Clear BHB before exit to userspace': ++ ++ As above, conditional BHB clearing mitigation is enabled. ++ + * 'Mitigation: IBPB on VMEXIT': + + IBPB is issued on every VM-exit. This occurs when other mitigations like +@@ -108,3 +112,7 @@ The mitigation can be controlled via the ``vmscape=`` command line parameter: + + Force vulnerability detection and mitigation even on processors that are + not known to be affected. ++ ++ * ``vmscape=on``: ++ ++ Choose the mitigation based on the VMSCAPE variant the CPU is affected by. +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 28f14d664aa3..a3e9684f63c0 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -8281,9 +8281,11 @@ Kernel parameters + + off - disable the mitigation + ibpb - use Indirect Branch Prediction Barrier +- (IBPB) mitigation (default) ++ (IBPB) mitigation + force - force vulnerability detection even on + unaffected processors ++ on - (default) selects IBPB or BHB clear ++ mitigation based on CPU + + vsyscall= [X86-64,EARLY] + Controls the behavior of vsyscalls (i.e. calls to +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 80527299f859..e03e35a2a6ce 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2710,6 +2710,7 @@ config MITIGATION_TSA + config MITIGATION_VMSCAPE + bool "Mitigate VMSCAPE" + depends on KVM ++ depends on HAVE_STATIC_CALL + default y + help + Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index f9983a1907bf..6d93602dd309 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1529,12 +1529,19 @@ SYM_CODE_END(rewind_stack_and_make_dead) + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. 
++ * ++ * Note, callers should use a speculation barrier like LFENCE immediately after ++ * a call to this function to ensure BHB is cleared before indirect branches. + */ + SYM_FUNC_START(clear_bhb_loop) + ANNOTATE_NOENDBR + push %rbp + mov %rsp, %rbp +- movl $5, %ecx ++ ++ /* loop count differs based on BHI_CTRL, see Intel's BHI guidance */ ++ ALTERNATIVE "movl $5, %ecx; movl $5, %edx", \ ++ "movl $12, %ecx; movl $7, %edx", X86_FEATURE_BHI_CTRL ++ + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f +@@ -1555,7 +1562,7 @@ SYM_FUNC_START(clear_bhb_loop) + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc +-2: movl $5, %eax ++2: movl %edx, %eax + 3: jmp 4f + nop + 4: sub $1, %eax +@@ -1563,7 +1570,7 @@ SYM_FUNC_START(clear_bhb_loop) + sub $1, %ecx + jnz 1b + .Lret2: RET +-5: lfence ++5: + pop %rbp + RET + SYM_FUNC_END(clear_bhb_loop) +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index c3b53beb1300..aa39430476d6 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -501,7 +501,7 @@ + #define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ + #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ + #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ +-#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ ++/* Free */ + #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ + #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ + #define X86_FEATURE_SGX_EUPDATESVN (21*32+17) /* Support for ENCLS[EUPDATESVN] instruction */ +diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h +index ce3eb6d5fdf9..783e7cb50cae 100644 +--- a/arch/x86/include/asm/entry-common.h ++++ b/arch/x86/include/asm/entry-common.h +@@ -4,6 +4,7 @@ + + #include + #include ++#include + + #include + #include +@@ -94,11 +95,9 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + */ + choose_random_kstack_offset(rdtsc()); + +- /* Avoid unnecessary reads of 'x86_ibpb_exit_to_user' */ +- if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) && +- this_cpu_read(x86_ibpb_exit_to_user)) { +- indirect_branch_prediction_barrier(); +- this_cpu_write(x86_ibpb_exit_to_user, false); ++ if (unlikely(this_cpu_read(x86_predictor_flush_exit_to_user))) { ++ static_call_cond(vmscape_predictor_flush)(); ++ this_cpu_write(x86_predictor_flush_exit_to_user, false); + } + } + #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index 4f4b5e8a1574..80efdb6645ba 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -331,11 +331,11 @@ + + #ifdef CONFIG_X86_64 + .macro CLEAR_BRANCH_HISTORY +- ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP ++ ALTERNATIVE "", "call clear_bhb_loop; lfence", X86_FEATURE_CLEAR_BHB_LOOP + .endm + + .macro CLEAR_BRANCH_HISTORY_VMEXIT +- ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_VMEXIT ++ ALTERNATIVE "", "call clear_bhb_loop; lfence", X86_FEATURE_CLEAR_BHB_VMEXIT + .endm + #else + #define CLEAR_BRANCH_HISTORY +@@ -390,6 +390,8 @@ extern void write_ibpb(void); + + #ifdef CONFIG_X86_64 + extern void clear_bhb_loop(void); ++#else ++static inline void clear_bhb_loop(void) {} + #endif + + 
extern void (*x86_return_thunk)(void); +@@ -533,7 +535,7 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) + : "memory"); + } + +-DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user); ++DECLARE_PER_CPU(bool, x86_predictor_flush_exit_to_user); + + static inline void indirect_branch_prediction_barrier(void) + { +@@ -542,6 +544,9 @@ static inline void indirect_branch_prediction_barrier(void) + :: "rax", "rcx", "rdx", "memory"); + } + ++#include ++DECLARE_STATIC_CALL(vmscape_predictor_flush, write_ibpb); ++ + /* The Intel SPEC CTRL MSR base value cache */ + extern u64 x86_spec_ctrl_base; + DECLARE_PER_CPU(u64, x86_spec_ctrl_current); +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index d0a2847a4bb0..2818bfcb9f9f 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -62,12 +62,11 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); + EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); + + /* +- * Set when the CPU has run a potentially malicious guest. An IBPB will +- * be needed to before running userspace. That IBPB will flush the branch +- * predictor content. ++ * Set when the CPU has run a potentially malicious guest. Indicates that a ++ * branch predictor flush is needed before running userspace. + */ +-DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user); +-EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user); ++DEFINE_PER_CPU(bool, x86_predictor_flush_exit_to_user); ++EXPORT_PER_CPU_SYMBOL_GPL(x86_predictor_flush_exit_to_user); + + u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; + +@@ -230,6 +229,9 @@ static void x86_amd_ssb_disable(void) + wrmsrq(MSR_AMD64_LS_CFG, msrval); + } + ++DEFINE_STATIC_CALL_NULL(vmscape_predictor_flush, write_ibpb); ++EXPORT_STATIC_CALL_GPL(vmscape_predictor_flush); ++ + #undef pr_fmt + #define pr_fmt(fmt) "MDS: " fmt + +@@ -3049,15 +3051,19 @@ static void __init srso_apply_mitigation(void) + enum vmscape_mitigations { + VMSCAPE_MITIGATION_NONE, + VMSCAPE_MITIGATION_AUTO, ++ VMSCAPE_MITIGATION_ON, + VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER, + VMSCAPE_MITIGATION_IBPB_ON_VMEXIT, ++ VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER, + }; + + static const char * const vmscape_strings[] = { +- [VMSCAPE_MITIGATION_NONE] = "Vulnerable", ++ [VMSCAPE_MITIGATION_NONE] = "Vulnerable", + /* [VMSCAPE_MITIGATION_AUTO] */ +- [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", +- [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", ++ /* [VMSCAPE_MITIGATION_ON] */ ++ [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", ++ [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", ++ [VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER] = "Mitigation: Clear BHB before exit to userspace", + }; + + static enum vmscape_mitigations vmscape_mitigation __ro_after_init = +@@ -3074,7 +3080,9 @@ static int __init vmscape_parse_cmdline(char *str) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_VMSCAPE); +- vmscape_mitigation = VMSCAPE_MITIGATION_AUTO; ++ vmscape_mitigation = VMSCAPE_MITIGATION_ON; ++ } else if (!strcmp(str, "on")) { ++ vmscape_mitigation = VMSCAPE_MITIGATION_ON; + } else { + pr_err("Ignoring unknown vmscape=%s option.\n", str); + } +@@ -3085,17 +3093,42 @@ early_param("vmscape", vmscape_parse_cmdline); + + static void __init vmscape_select_mitigation(void) + { +- if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || +- !boot_cpu_has(X86_FEATURE_IBPB)) { ++ if 
(!boot_cpu_has_bug(X86_BUG_VMSCAPE)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + +- if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { +- if (should_mitigate_vuln(X86_BUG_VMSCAPE)) ++ if ((vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) && ++ !should_mitigate_vuln(X86_BUG_VMSCAPE)) ++ vmscape_mitigation = VMSCAPE_MITIGATION_NONE; ++ ++ switch (vmscape_mitigation) { ++ case VMSCAPE_MITIGATION_NONE: ++ break; ++ ++ case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: ++ if (!boot_cpu_has(X86_FEATURE_IBPB)) ++ vmscape_mitigation = VMSCAPE_MITIGATION_NONE; ++ break; ++ ++ case VMSCAPE_MITIGATION_AUTO: ++ case VMSCAPE_MITIGATION_ON: ++ /* ++ * CPUs with BHI_CTRL(ADL and newer) can avoid the IBPB and use BHB ++ * clear sequence. These CPUs are only vulnerable to the BHI variant ++ * of the VMSCAPE attack and does not require an IBPB flush. In ++ * 32-bit mode BHB clear sequence is not supported. ++ */ ++ if (boot_cpu_has(X86_FEATURE_BHI_CTRL) && IS_ENABLED(CONFIG_X86_64)) ++ vmscape_mitigation = VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER; ++ else if (boot_cpu_has(X86_FEATURE_IBPB)) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + else + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; ++ break; ++ ++ default: ++ break; + } + } + +@@ -3114,7 +3147,9 @@ static void __init vmscape_update_mitigation(void) + static void __init vmscape_apply_mitigation(void) + { + if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER) +- setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER); ++ static_call_update(vmscape_predictor_flush, write_ibpb); ++ else if (vmscape_mitigation == VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER) ++ static_call_update(vmscape_predictor_flush, clear_bhb_loop); + } + + #undef pr_fmt +@@ -3203,9 +3238,11 @@ void cpu_bugs_smt_update(void) + switch (vmscape_mitigation) { + case VMSCAPE_MITIGATION_NONE: + case VMSCAPE_MITIGATION_AUTO: ++ case VMSCAPE_MITIGATION_ON: + break; + case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT: + case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: ++ case VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER: + /* + * Hypervisors can be attacked across-threads, warn for SMT when + * STIBP is not already enabled system-wide. +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 72d37c8930ad..5b4d44a6b702 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -11437,8 +11437,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + * set for the CPU that actually ran the guest, and not the CPU that it + * may migrate to. 
+ */ +- if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER)) +- this_cpu_write(x86_ibpb_exit_to_user, true); ++ if (static_call_query(vmscape_predictor_flush)) ++ this_cpu_write(x86_predictor_flush_exit_to_user, true); + + /* + * Consume any pending interrupts, including the possible source of +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index b0bac2a66eff..c31508be0d72 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -1620,6 +1620,8 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, + + if (emit_call(&prog, func, ip)) + return -EINVAL; ++ /* Don't speculate past this until BHB is cleared */ ++ EMIT_LFENCE(); + EMIT1(0x59); /* pop rcx */ + EMIT1(0x58); /* pop rax */ + } +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.10.3/0100-glitched-additional-timer-tick-frequencies.patch b/sys-kernel/gentoo-sources-6.19/0101-glitched-additional-timer-tick-frequencies.patch similarity index 100% rename from sys-kernel/gentoo-sources-6.10.3/0100-glitched-additional-timer-tick-frequencies.patch rename to sys-kernel/gentoo-sources-6.19/0101-glitched-additional-timer-tick-frequencies.patch diff --git a/sys-kernel/gentoo-sources-6.6/0001-bbr3.patch b/sys-kernel/gentoo-sources-6.6/0001-bbr3.patch new file mode 100644 index 0000000..7d79ef2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.6/0001-bbr3.patch @@ -0,0 +1,3352 @@ +From 0588576f1ca7bc2757bb90e1fac439eccf10afc9 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 15 Mar 2024 20:30:45 +0100 +Subject: [PATCH 1/5] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2231 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 15 files changed, 1934 insertions(+), 551 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 3c5efeeb024f..a0d4afd221d8 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -257,7 +257,9 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? 
*/ ++ unused:2; + u32 chrono_start; /* Start time in jiffies of a TCP chrono */ + u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ + u8 chrono_type:2, /* current chronograph type */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index fee1e5650551..1d069d636117 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -135,8 +135,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 71af24410443..9c92be8fe029 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -372,6 +372,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -724,6 +726,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + tcp_fast_path_on(tp); + } + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + u32 tcp_delack_max(const struct sock *sk); + + /* Compute the actual rto_min value */ +@@ -822,6 +833,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) + { + return tcp_ns_to_ts(skb->skb_mstamp_ns); +@@ -897,9 +913,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1003,6 +1024,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1025,7 +1047,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1045,10 +1071,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1059,7 +1088,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1083,8 +1114,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1150,6 +1184,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1169,6 +1211,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1181,6 +1224,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2212,7 +2270,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 50655de04c9b..82f8bd8f0d16 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index 51c13cf9c5ae..de8dcba26bec 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -506,9 +506,11 @@ enum { + #define RTAX_FEATURE_SACK (1 << 1) + #define RTAX_FEATURE_TIMESTAMP (1 << 2) + #define RTAX_FEATURE_ALLFRAG (1 << 3) ++#define RTAX_FEATURE_ECN_LOW (1 << 4) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ +- RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) ++ RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG \ ++ | RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index 879eeb0a084b..77270053a5e3 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ ++#define TCPI_OPT_ECN_LOW 64 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 2dfb12230f08..2e14db3bee70 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -668,15 +668,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 91c3d8264059..4a5e0abfe8c1 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3099,6 +3099,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -3790,6 +3791,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 146792cd26fe..f4f477a69917 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. 
bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? 
*/ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -278,7 +455,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + } + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); +- sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); ++ sk->sk_pacing_rate = ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain)); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -294,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + sk->sk_pacing_rate = rate; + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. 
+- */ +- bytes = min_t(unsigned long, +- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -333,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -344,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -366,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -386,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. */ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. 
*/ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -457,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -468,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -536,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -613,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -811,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -819,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -861,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -913,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -941,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -966,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -989,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1012,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; + +- bbr_update_model(sk, rs); +- +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
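++ * (The downward adaptation is applied at most once per probe, in
++ * bbr_handle_inflight_too_high(); the upward adaptation is gradual, in
++ * bbr_probe_inflight_hi_upward() while in PROBE_UP.)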
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
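++ * For example, with the default 25% threshold a full_bw estimate of 1000
++ * needs a sample of at least 1250 (full_bw * 5/4 in BBR_UNIT fixed point)
++ * to reset the counter; otherwise full_bw_cnt advances once per round.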
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
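++ * (try_fast_path is cleared whenever the cycle phase changes, on exiting
++ * loss recovery, and on cwnd undo, so the fast path is only taken while
++ * the control state from the previous ACK is still valid.)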
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc void bbr_main(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; ++ bbr->undo_inflight_lo = 
0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1160,10 +2361,11 @@ BTF_SET8_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + #endif + #endif +@@ -1198,5 +2400,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index 1b34050a7538..66d40449b3f4 100644 +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -241,6 +241,7 
@@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index c2e4dac42453..62e765afcb2a 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -371,7 +371,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -382,7 +382,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1096,7 +1096,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1477,6 +1482,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3732,7 +3748,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3749,6 +3766,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3759,6 +3777,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
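++		 * Flag this in the rate sample so that congestion control
++		 * modules can treat the delivery signals from this ACK with
++		 * care (the sample may reflect the TLP probe itself).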
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3867,6 +3890,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -3941,7 +3965,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -3965,6 +3989,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -3984,7 +4009,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5596,13 +5621,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 260bfb9ada38..0381cbdb9a2c 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -435,6 +435,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 5631041ae12c..2125f3ab098e 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -332,10 +332,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -347,6 +346,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -384,7 +386,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1546,7 +1549,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1621,6 +1624,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -1996,13 +2023,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2701,6 +2727,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2914,6 +2941,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 64bcf384e9dd..e8b1adf17e3a 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -664,6 +664,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.46.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.6/0001-bore.patch b/sys-kernel/gentoo-sources-6.6/0001-bore.patch new file mode 100644 index 0000000..23b02f0 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.6/0001-bore.patch @@ -0,0 +1,825 @@ +From 90ecff92d4efa9ac452e3e199235f7b7d16a1d80 Mon Sep 17 00:00:00 2001 +From: Masahito S +Date: Thu, 1 Aug 2024 02:38:58 +0900 +Subject: [PATCH] linux6.6.30-bore5.1.8 + +--- + include/linux/sched.h | 10 ++ + init/Kconfig | 17 +++ + kernel/Kconfig.hz | 16 +++ + kernel/sched/core.c | 143 +++++++++++++++++++++ + kernel/sched/debug.c | 60 ++++++++- + kernel/sched/fair.c | 270 +++++++++++++++++++++++++++++++++++++++- + kernel/sched/features.h | 4 + + kernel/sched/sched.h | 7 ++ + 8 files changed, 524 insertions(+), 3 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 77f01ac385..20fe8ee925 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -559,6 +559,16 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; + u64 vruntime; ++#ifdef CONFIG_SCHED_BORE ++ u64 burst_time; ++ u8 prev_burst_penalty; ++ u8 curr_burst_penalty; ++ u8 burst_penalty; ++ u8 burst_score; ++ u8 child_burst; ++ u32 child_burst_cnt; ++ u64 child_burst_last_cached; ++#endif // CONFIG_SCHED_BORE + s64 vlag; + u64 slice; + +diff --git a/init/Kconfig b/init/Kconfig +index e403a29256..06c028a327 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1267,6 +1267,23 @@ 
config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ If unsure, say Y here. ++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d0688..5f6eecd1e6 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -55,5 +55,21 @@ config HZ + default 300 if HZ_300 + default 1000 if HZ_1000 + ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = max(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. ++ + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 8208809605..dad676f6fd 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4487,6 +4487,138 @@ int wake_up_state(struct task_struct *p, unsigned int state) + return try_to_wake_up(p, state, 0); + } + ++#ifdef CONFIG_SCHED_BORE ++extern u8 sched_burst_fork_atavistic; ++extern uint sched_burst_cache_lifetime; ++ ++static void __init sched_init_bore(void) { ++ init_task.se.burst_time = 0; ++ init_task.se.prev_burst_penalty = 0; ++ init_task.se.curr_burst_penalty = 0; ++ init_task.se.burst_penalty = 0; ++ init_task.se.burst_score = 0; ++ init_task.se.child_burst_last_cached = 0; ++} ++ ++inline void sched_fork_bore(struct task_struct *p) { ++ p->se.burst_time = 0; ++ p->se.curr_burst_penalty = 0; ++ p->se.burst_score = 0; ++ p->se.child_burst_last_cached = 0; ++} ++ ++static u32 count_child_tasks(struct task_struct *p) { ++ struct task_struct *child; ++ u32 cnt = 0; ++ list_for_each_entry(child, &p->children, sibling) {cnt++;} ++ return cnt; ++} ++ ++static inline bool task_is_inheritable(struct task_struct *p) { ++ return (p->sched_class == &fair_sched_class); ++} ++ ++static inline bool child_burst_cache_expired(struct task_struct *p, u64 now) { ++ u64 expiration_time = ++ p->se.child_burst_last_cached + sched_burst_cache_lifetime; ++ return ((s64)(expiration_time - now) < 0); ++} ++ ++static void __update_child_burst_cache( ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ u8 avg = 0; ++ if (cnt) avg = sum / cnt; ++ p->se.child_burst = max(avg, p->se.burst_penalty); ++ p->se.child_burst_cnt = cnt; ++ p->se.child_burst_last_cached = now; ++} ++ ++static inline void update_child_burst_direct(struct task_struct *p, u64 now) { ++ struct task_struct *child; ++ u32 cnt = 0; ++ u32 sum = 0; ++ ++ list_for_each_entry(child, 
&p->children, sibling) { ++ if (!task_is_inheritable(child)) continue; ++ cnt++; ++ sum += child->se.burst_penalty; ++ } ++ ++ __update_child_burst_cache(p, cnt, sum, now); ++} ++ ++static inline u8 __inherit_burst_direct(struct task_struct *p, u64 now) { ++ struct task_struct *parent = p->real_parent; ++ if (child_burst_cache_expired(parent, now)) ++ update_child_burst_direct(parent, now); ++ ++ return parent->se.child_burst; ++} ++ ++static void update_child_burst_topological( ++ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { ++ struct task_struct *child, *dec; ++ u32 cnt = 0, dcnt = 0; ++ u32 sum = 0; ++ ++ list_for_each_entry(child, &p->children, sibling) { ++ dec = child; ++ while ((dcnt = count_child_tasks(dec)) == 1) ++ dec = list_first_entry(&dec->children, struct task_struct, sibling); ++ ++ if (!dcnt || !depth) { ++ if (!task_is_inheritable(dec)) continue; ++ cnt++; ++ sum += dec->se.burst_penalty; ++ continue; ++ } ++ if (!child_burst_cache_expired(dec, now)) { ++ cnt += dec->se.child_burst_cnt; ++ sum += (u32)dec->se.child_burst * dec->se.child_burst_cnt; ++ continue; ++ } ++ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); ++ } ++ ++ __update_child_burst_cache(p, cnt, sum, now); ++ *acnt += cnt; ++ *asum += sum; ++} ++ ++static inline u8 __inherit_burst_topological(struct task_struct *p, u64 now) { ++ struct task_struct *anc = p->real_parent; ++ u32 cnt = 0, sum = 0; ++ ++ while (anc->real_parent != anc && count_child_tasks(anc) == 1) ++ anc = anc->real_parent; ++ ++ if (child_burst_cache_expired(anc, now)) ++ update_child_burst_topological( ++ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); ++ ++ return anc->se.child_burst; ++} ++ ++static inline void inherit_burst(struct task_struct *p) { ++ u8 burst_cache; ++ u64 now = ktime_get_ns(); ++ ++ read_lock(&tasklist_lock); ++ burst_cache = likely(sched_burst_fork_atavistic)? ++ __inherit_burst_topological(p, now): ++ __inherit_burst_direct(p, now); ++ read_unlock(&tasklist_lock); ++ ++ p->se.prev_burst_penalty = max(p->se.prev_burst_penalty, burst_cache); ++} ++ ++static void sched_post_fork_bore(struct task_struct *p) { ++ if (p->sched_class == &fair_sched_class) ++ inherit_burst(p); ++ p->se.burst_penalty = p->se.prev_burst_penalty; ++} ++#endif // CONFIG_SCHED_BORE ++ + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. 
+@@ -4503,6 +4635,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++#ifdef CONFIG_SCHED_BORE ++ sched_fork_bore(p); ++#endif // CONFIG_SCHED_BORE + p->se.vlag = 0; + p->se.slice = sysctl_sched_base_slice; + INIT_LIST_HEAD(&p->se.group_node); +@@ -4822,6 +4957,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) + + void sched_post_fork(struct task_struct *p) + { ++#ifdef CONFIG_SCHED_BORE ++ sched_post_fork_bore(p); ++#endif // CONFIG_SCHED_BORE + uclamp_post_fork(p); + } + +@@ -9925,6 +10063,11 @@ void __init sched_init(void) + BUG_ON(&dl_sched_class != &stop_sched_class + 1); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_init_bore(); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.1.8 by Masahito Suzuki"); ++#endif // CONFIG_SCHED_BORE ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 4c3d0d9f3d..02c8816c26 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { + }; + + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++static ssize_t sched_min_base_slice_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ char buf[16]; ++ unsigned int value; ++ ++ if (cnt > 15) ++ cnt = 15; ++ ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; ++ buf[cnt] = '\0'; ++ ++ if (kstrtouint(buf, 10, &value)) ++ return -EINVAL; + ++ if (!value) ++ return -EINVAL; ++ ++ sysctl_sched_min_base_slice = value; ++ sched_update_min_base_slice(); ++ ++ *ppos += cnt; ++ return cnt; ++} ++ ++static int sched_min_base_slice_show(struct seq_file *m, void *v) ++{ ++ seq_printf(m, "%d\n", sysctl_sched_min_base_slice); ++ return 0; ++} ++ ++static int sched_min_base_slice_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_min_base_slice_show, NULL); ++} ++ ++static const struct file_operations sched_min_base_slice_fops = { ++ .open = sched_min_base_slice_open, ++ .write = sched_min_base_slice_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++#else // !CONFIG_SCHED_BORE + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -213,7 +258,7 @@ static const struct file_operations sched_scaling_fops = { + .llseek = seq_lseek, + .release = single_release, + }; +- ++#endif // CONFIG_SCHED_BORE + #endif /* SMP */ + + #ifdef CONFIG_PREEMPT_DYNAMIC +@@ -347,13 +392,20 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); ++ debugfs_create_u32("base_slice_ns", 0400, debugfs_sched, &sysctl_sched_base_slice); ++#else // !CONFIG_SCHED_BORE + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++#endif // CONFIG_SCHED_BORE + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); ++#endif // CONFIG_SCHED_BORE + 
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + +@@ -595,6 +647,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.burst_score); ++#endif // CONFIG_SCHED_BORE + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +@@ -1068,6 +1123,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + + P(se.load.weight); + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++ P(se.burst_score); ++#endif // CONFIG_SCHED_BORE + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d3d0a1c933..caae4061a7 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -19,6 +19,9 @@ + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra ++ * ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2024 Masahito Suzuki + */ + #include + #include +@@ -66,17 +69,29 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * (BORE default SCHED_TUNABLESCALING_NONE = *1 constant) ++ * (EEVDF default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++#endif // CONFIG_SCHED_BORE + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * (BORE default: max(1 sec / HZ, min_base_slice) constant, units: nanoseconds) ++ * (EEVDF default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_base_slice = 1000000000ULL / HZ; ++static unsigned int configured_sched_base_slice = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_base_slice = 750000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; ++#endif // CONFIG_SCHED_BORE + + /* + * After fork, child runs first. 
If set to 0 (default) then +@@ -86,6 +101,120 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + ++#ifdef CONFIG_SCHED_BORE ++u8 __read_mostly sched_bore = 1; ++u8 __read_mostly sched_burst_exclude_kthreads = 1; ++u8 __read_mostly sched_burst_smoothness_long = 1; ++u8 __read_mostly sched_burst_smoothness_short = 0; ++u8 __read_mostly sched_burst_fork_atavistic = 2; ++u8 __read_mostly sched_burst_penalty_offset = 22; ++uint __read_mostly sched_burst_penalty_scale = 1280; ++uint __read_mostly sched_burst_cache_lifetime = 60000000; ++uint __read_mostly sched_deadline_boost_mask = ENQUEUE_INITIAL ++ | ENQUEUE_WAKEUP; ++uint __read_mostly sched_deadline_preserve_mask = ENQUEUE_RESTORE ++ | ENQUEUE_MIGRATED; ++static int __maybe_unused sixty_four = 64; ++static int __maybe_unused maxval_12_bits = 4095; ++ ++#define MAX_BURST_PENALTY (39U <<2) ++ ++static inline u32 log2plus1_u64_u32f8(u64 v) { ++ u32 msb = fls64(v); ++ s32 excess_bits = msb - 9; ++ u8 fractional = (0 <= excess_bits)? v >> excess_bits: v << -excess_bits; ++ return msb << 8 | fractional; ++} ++ ++static inline u32 calc_burst_penalty(u64 burst_time) { ++ u32 greed, tolerance, penalty, scaled_penalty; ++ ++ greed = log2plus1_u64_u32f8(burst_time); ++ tolerance = sched_burst_penalty_offset << 8; ++ penalty = max(0, (s32)greed - (s32)tolerance); ++ scaled_penalty = penalty * sched_burst_penalty_scale >> 16; ++ ++ return min(MAX_BURST_PENALTY, scaled_penalty); ++} ++ ++static inline u64 scale_slice(u64 delta, struct sched_entity *se) { ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); ++} ++ ++static inline u64 __unscale_slice(u64 delta, u8 score) { ++ return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10); ++} ++ ++static inline u64 unscale_slice(u64 delta, struct sched_entity *se) { ++ return __unscale_slice(delta, se->burst_score); ++} ++ ++static void reweight_entity( ++ struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); ++ ++static void renice_task(struct task_struct *p, int prio) ++{ ++ struct sched_entity *se = &p->se; ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ struct load_weight *load = &se->load; ++ unsigned long weight = scale_load(sched_prio_to_weight[prio]); ++ ++ reweight_entity(cfs_rq, se, weight); ++ load->inv_weight = sched_prio_to_wmult[prio]; ++} ++ ++static void update_burst_score(struct sched_entity *se) { ++ if (!entity_is_task(se)) return; ++ struct task_struct *p = task_of(se); ++ u8 prio = p->static_prio - MAX_RT_PRIO; ++ u8 prev_prio = min(39, prio + se->burst_score); ++ ++ u8 burst_score = 0; ++ if (!(sched_burst_exclude_kthreads && (p->flags & PF_KTHREAD))) ++ burst_score = se->burst_penalty >> 2; ++ ++ se->burst_score = burst_score; ++ ++ u8 new_prio = min(39, prio + se->burst_score); ++ if (new_prio != prev_prio) ++ renice_task(p, new_prio); ++} ++ ++static void update_burst_penalty(struct sched_entity *se) { ++ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); ++ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); ++ update_burst_score(se); ++} ++ ++static inline u32 binary_smooth(u32 new, u32 old) { ++ int increment = new - old; ++ return (0 <= increment)? 
++ old + ( increment >> (int)sched_burst_smoothness_long): ++ old - (-increment >> (int)sched_burst_smoothness_short); ++} ++ ++static void restart_burst(struct sched_entity *se) { ++ se->burst_penalty = se->prev_burst_penalty = ++ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); ++ se->curr_burst_penalty = 0; ++ se->burst_time = 0; ++ update_burst_score(se); ++} ++ ++static void restart_burst_rescale_deadline(struct sched_entity *se) { ++ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; ++ u8 prev_score = se->burst_score; ++ restart_burst(se); ++ if (prev_score > se->burst_score) { ++ wremain = __unscale_slice(abs(vremain), prev_score); ++ vscaled = scale_slice(wremain, se); ++ if (unlikely(vremain < 0)) ++ vscaled = -vscaled; ++ se->deadline = se->vruntime + vscaled; ++ } ++} ++#endif // CONFIG_SCHED_BORE ++ + int sched_thermal_decay_shift; + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -145,6 +274,92 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + + #ifdef CONFIG_SYSCTL + static struct ctl_table sched_fair_sysctls[] = { ++#ifdef CONFIG_SCHED_BORE ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_exclude_kthreads", ++ .data = &sched_burst_exclude_kthreads, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness_long", ++ .data = &sched_burst_smoothness_long, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness_short", ++ .data = &sched_burst_smoothness_short, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_fork_atavistic", ++ .data = &sched_burst_fork_atavistic, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_THREE, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_boost_mask", ++ .data = &sched_deadline_boost_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_preserve_mask", ++ .data = &sched_deadline_preserve_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++#endif // CONFIG_SCHED_BORE + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -210,6 +425,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) + * + * This idea comes from the SD scheduler of Con Kolivas: + */ ++#ifdef 
CONFIG_SCHED_BORE ++static void update_sysctl(void) { ++ sysctl_sched_base_slice = ++ max(sysctl_sched_min_base_slice, configured_sched_base_slice); ++} ++void sched_update_min_base_slice(void) { update_sysctl(); } ++#else // !CONFIG_SCHED_BORE + static unsigned int get_update_sysctl_factor(void) + { + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); +@@ -240,6 +462,7 @@ static void update_sysctl(void) + SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } ++#endif // CONFIG_SCHED_BORE + + void __init sched_init_granularity(void) + { +@@ -713,6 +936,9 @@ static s64 entity_lag(u64 avruntime, struct sched_entity *se) + + vlag = avruntime - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++#ifdef CONFIG_SCHED_BORE ++ limit >>= 1; ++#endif // CONFIG_SCHED_BORE + + return clamp(vlag, -limit, limit); + } +@@ -1002,6 +1228,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + * Scheduling class statistics methods: + */ + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); +@@ -1013,6 +1240,7 @@ int sched_update_scaling(void) + + return 0; + } ++#endif // CONFIG_SCHED_BORE + #endif + #endif + +@@ -1179,7 +1407,13 @@ static void update_curr(struct cfs_rq *cfs_rq) + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq->exec_clock, delta_exec); + ++#ifdef CONFIG_SCHED_BORE ++ curr->burst_time += delta_exec; ++ update_burst_penalty(curr); ++ curr->vruntime += max(1ULL, calc_delta_fair(delta_exec, curr)); ++#else // !CONFIG_SCHED_BORE + curr->vruntime += calc_delta_fair(delta_exec, curr); ++#endif // CONFIG_SCHED_BORE + update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); + +@@ -5072,6 +5306,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + s64 lag = 0; + + se->slice = sysctl_sched_base_slice; ++#ifdef CONFIG_SCHED_BORE ++ if (flags & ~sched_deadline_boost_mask & sched_deadline_preserve_mask) ++ vslice = se->deadline - se->vruntime; ++ else ++#endif // CONFIG_SCHED_BORE + vslice = calc_delta_fair(se->slice, se); + + /* +@@ -5082,6 +5321,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + * + * EEVDF: placement strategy #1 / #2 + */ ++#ifdef CONFIG_SCHED_BORE ++ if (se->vlag) ++#endif // CONFIG_SCHED_BORE + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; +@@ -5157,7 +5399,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + * on average, halfway through their slice, as such start tasks + * off with half a slice to ease into the competition. + */ ++#if !defined(CONFIG_SCHED_BORE) + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++#else // CONFIG_SCHED_BORE ++ if (flags & sched_deadline_boost_mask) ++#endif // CONFIG_SCHED_BORE + vslice /= 2; + + /* +@@ -6724,6 +6970,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + bool was_sched_idle = sched_idle_rq(rq); + + util_est_dequeue(&rq->cfs, p); ++#ifdef CONFIG_SCHED_BORE ++ if (task_sleep) { ++ cfs_rq = cfs_rq_of(se); ++ if (cfs_rq->curr == se) ++ update_curr(cfs_rq); ++ restart_burst(se); ++ } ++#endif // CONFIG_SCHED_BORE + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +@@ -8461,16 +8715,25 @@ static void yield_task_fair(struct rq *rq) + /* + * Are we the only task in the tree? 
+ */ ++#if !defined(CONFIG_SCHED_BORE) + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ restart_burst_rescale_deadline(se); ++ if (unlikely(rq->nr_running == 1)) ++ return; ++ ++ clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() +@@ -12536,6 +12799,9 @@ static void task_fork_fair(struct task_struct *p) + curr = cfs_rq->curr; + if (curr) + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ update_burst_score(se); ++#endif // CONFIG_SCHED_BORE + place_entity(cfs_rq, se, ENQUEUE_INITIAL); + rq_unlock(rq, &rf); + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index f770168230..3711c7700d 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -5,8 +5,12 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ + SCHED_FEAT(PLACE_LAG, true) ++#if !defined(CONFIG_SCHED_BORE) + SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) + SCHED_FEAT(RUN_TO_PARITY, true) ++#else // CONFIG_SCHED_BORE ++SCHED_FEAT(RUN_TO_PARITY, false) ++#endif // CONFIG_SCHED_BORE + + /* + * Prefer to schedule the task we woke last (assuming it failed +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 2e8f26a919..e8d5ce2027 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1948,7 +1948,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) + } + #endif + ++#ifdef CONFIG_SCHED_BORE ++extern void sched_update_min_base_slice(void); ++#else // !CONFIG_SCHED_BORE + extern int sched_update_scaling(void); ++#endif // CONFIG_SCHED_BORE + + static inline const struct cpumask *task_user_cpus(struct task_struct *p) + { +@@ -2528,6 +2532,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; + + extern unsigned int sysctl_sched_base_slice; ++#ifdef CONFIG_SCHED_BORE ++extern unsigned int sysctl_sched_min_base_slice; ++#endif // CONFIG_SCHED_BORE + + #ifdef CONFIG_SCHED_DEBUG + extern int sysctl_resched_latency_warn_ms; +-- +2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0005-zstd.patch b/sys-kernel/gentoo-sources-6.6/0005-zstd.patch new file mode 100644 index 0000000..e351b2a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.6/0005-zstd.patch @@ -0,0 +1,13833 @@ +From 2c4b02d9a1c3640cde85fcf17bf264f01975f0e0 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sat, 16 Dec 2023 09:03:26 +0100 +Subject: [PATCH 5/5] zstd + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 697 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 53 +- + lib/zstd/common/compiler.h | 14 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 5 +- + lib/zstd/common/debug.h | 3 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 3 +- + lib/zstd/common/fse.h | 89 +- + lib/zstd/common/fse_decompress.c | 94 +- + lib/zstd/common/huf.h | 222 +-- + lib/zstd/common/mem.h | 2 +- + lib/zstd/common/portability_macros.h | 26 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 99 +- + lib/zstd/compress/clevels.h | 3 
+- + lib/zstd/compress/fse_compress.c | 59 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 372 ++-- + lib/zstd/compress/zstd_compress.c | 1762 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 333 +++- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 47 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 149 +- + lib/zstd/compress/zstd_double_fast.c | 129 +- + lib/zstd/compress/zstd_double_fast.h | 6 +- + lib/zstd/compress/zstd_fast.c | 582 ++++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 518 ++--- + lib/zstd/compress/zstd_lazy.h | 7 +- + lib/zstd/compress/zstd_ldm.c | 11 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 187 +- + lib/zstd/compress/zstd_opt.h | 3 +- + lib/zstd/decompress/huf_decompress.c | 770 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 261 ++- + lib/zstd/decompress/zstd_decompress_block.c | 283 ++- + lib/zstd/decompress/zstd_decompress_block.h | 8 +- + .../decompress/zstd_decompress_internal.h | 7 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 4789 insertions(+), 2594 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index 113408eef6ec..f109d49f43f8 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..6d5cf55f0bf3 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..8b4ffe649df5 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 5 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). 
++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -412,6 +457,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -430,7 +478,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +545,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. 
+ * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +596,15 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004 + + } ZSTD_dParameter; + +@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. 
++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is re-used, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! 
ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
+- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1303,7 +1369,7 @@ typedef enum { + } ZSTD_paramSwitch_e; + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_frameHeader; ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header, or requires larger `srcSize`. ++ * @return : 0, `zfhPtr` is correctly filled, ++ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. ++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. 
++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ + } ZSTD_sequenceFormat_e; + ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); ++ + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * Generate sequences using ZSTD_compress2(), given a source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 ++ * @zc can be used to insert custom compression params. ++ * This function invokes ZSTD_compress2(). 
+ * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters + * @return : number of sequences generated + */ + +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences( ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * ++ * Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. 
+ */ + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ * Note 2 : only single-threaded compression is supported. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ */ + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. 
+ * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. 
+ * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. 
++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! 
+@@ -2330,8 +2595,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! +@@ -2340,17 +2605,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * + * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. 
++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). 
++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t ZSTD_sequenceProducer_F ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F* sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. 
+ ********************************************************************* */ + + /* +@@ -2362,7 +2795,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2384,18 +2816,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. 
If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) +@@ -2408,8 +2850,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2870,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2890,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2913,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. 
+- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2924,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +2932,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +2959,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. 
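The hunk above deprecates the raw block API and points users at the regular compression API with a trimmed-down frame header. As a minimal sketch of that replacement (the helper name and error handling are illustrative only, not part of the patch; it relies on the static-linking-only ZSTD_c_format parameter declared earlier in this header):

    /* Sketch: compress with a ~2-byte frame header instead of raw blocks,
     * using exactly the four settings listed in the deprecation note above.
     * Return-value checks on ZSTD_CCtx_setParameter() are omitted for brevity. */
    static size_t compress_minimal_header(ZSTD_CCtx *cctx,
                                          void *dst, size_t dstCapacity,
                                          const void *src, size_t srcSize,
                                          int level)
    {
        ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless); /* no magic number */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);             /* no content size field */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);                /* no checksum */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, 0);                  /* no dictID */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
        return ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
    }

The matching decoder then has to be told about the missing magic number as well, e.g. ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless) before decompressing.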
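Similarly, the Block-Level Sequence Producer API documented earlier in this header can be exercised with a trivial producer that covers each block with a single literals-only sequence, which the parse rules above explicitly allow (litLength sums to srcSize, and the final sequence has matchLength == 0 and offset == 0). This is a hedged sketch only; the function names are illustrative, and a real producer would of course emit actual matches:

    /* Sketch: degenerate producer emitting one literals-only sequence per block. */
    static size_t literalsOnlyProducer(void *state,
                                       ZSTD_Sequence *outSeqs, size_t outSeqsCapacity,
                                       const void *src, size_t srcSize,
                                       const void *dict, size_t dictSize,
                                       int compressionLevel, size_t windowSize)
    {
        (void)state; (void)src; (void)dict; (void)dictSize;
        (void)compressionLevel; (void)windowSize;
        if (outSeqsCapacity < 1)
            return ZSTD_SEQUENCE_PRODUCER_ERROR;   /* treated as an error by zstd */
        outSeqs[0].offset = 0;                     /* final sequence: offset must be 0 */
        outSeqs[0].litLength = (unsigned)srcSize;  /* all bytes emitted as literals */
        outSeqs[0].matchLength = 0;
        outSeqs[0].rep = 0;                        /* currently ignored by zstd */
        return 1;                                  /* number of sequences written */
    }

    static void register_producer(ZSTD_CCtx *cctx)
    {
        /* Stateless producer, so no sequenceProducerState is needed. Fallback is
         * enabled so the internal parser takes over if the producer errors out. */
        ZSTD_registerSequenceProducer(cctx, NULL, literalsOnlyProducer);
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
    }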
+@@ -2541,11 +2975,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..464c410b2768 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..05adbbeccaa9 +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */
++
++/* This file provides custom allocation primitives
++ */
++
++#define ZSTD_DEPS_NEED_MALLOC
++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
++
++#include "mem.h" /* MEM_STATIC */
++#define ZSTD_STATIC_LINKING_ONLY
++#include <linux/zstd.h> /* ZSTD_customMem */
++
++#ifndef ZSTD_ALLOCATIONS_H
++#define ZSTD_ALLOCATIONS_H
++
++/* custom memory allocation functions */
++
++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc)
++ return customMem.customAlloc(customMem.opaque, size);
++ return ZSTD_malloc(size);
++}
++
++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc) {
++ /* calloc implemented as malloc+memset;
++ * not as efficient as calloc, but next best guess for custom malloc */
++ void* const ptr = customMem.customAlloc(customMem.opaque, size);
++ ZSTD_memset(ptr, 0, size);
++ return ptr;
++ }
++ return ZSTD_calloc(1, size);
++}
++
++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
++{
++ if (ptr!=NULL) {
++ if (customMem.customFree)
++ customMem.customFree(customMem.opaque, ptr);
++ else
++ ZSTD_free(ptr);
++ }
++}
++
++#endif /* ZSTD_ALLOCATIONS_H */
+diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
+new file mode 100644
+index 000000000000..aa3487ec4b6a
+--- /dev/null
++++ b/lib/zstd/common/bits.h
+@@ -0,0 +1,149 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
++/*
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
++ * All rights reserved.
++ *
++ * This source code is licensed under both the BSD-style license (found in the
++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
++ * in the COPYING file in the root directory of this source tree).
++ * You may select, at your option, one of the above-listed licenses.
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..444dc4f85c64 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! 
*/ +@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; +@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. +@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + * This function is safe, it guarantees it will not read beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + return BIT_DStream_overflow; +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..c437e0975575 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -179,6 +180,17 @@ + * Sanitizer + *****************************************************************/ + ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. 
Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..d77926cbad14 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,6 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..da0dbfc614b8 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..a4062d30d170 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..9a4699a38a88 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..c4e25a219142 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! 
Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. 
a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..99ce8fa54d08 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -24,6 +25,7 @@ + #include "error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #include "zstd_deps.h" ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +57,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); +-} +- + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; + FSE_DTable dtable[]; /* Dynamically sized */ +@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + + CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); +@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..8e7943092ed1 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). 
+- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). +- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + + +-/* *** Advanced function *** */ +- +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. 
+- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. +- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. 
++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. ++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. 
+@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. + */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. 
+ * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); +- +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index 1d9cc03924ca..a7231822b6e3 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..7ede8cf1ffe5 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. + * +@@ -65,7 +66,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. 
+ *
+ * Only enable assembly for Linux / MacOS, other platforms may
+@@ -90,4 +91,23 @@
+ */
+ #define ZSTD_ENABLE_ASM_X86_64_BMI2 0
+
++/*
++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
++ * assembly sources when CET is enabled.
++ *
++ * Additionally, any function that may be called indirectly must begin
++ * with ZSTD_CET_ENDBRANCH.
++ */
++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
++ && defined(__has_include)
++# if __has_include(<cet.h>)
++# include <cet.h>
++# define ZSTD_CET_ENDBRANCH _CET_ENDBR
++# endif
++#endif
++
++#ifndef ZSTD_CET_ENDBRANCH
++# define ZSTD_CET_ENDBRANCH
++#endif
++
+ #endif /* ZSTD_PORTABILITY_MACROS_H */
+diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c
+index 3d7e35b309b5..44b95b25344a 100644
+--- a/lib/zstd/common/zstd_common.c
++++ b/lib/zstd/common/zstd_common.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -14,7 +15,6 @@
+ * Dependencies
+ ***************************************/
+ #define ZSTD_DEPS_NEED_MALLOC
+-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+ #include "error_private.h"
+ #include "zstd_internal.h"
+
+@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+ /*! ZSTD_getErrorString() :
+ * provides error code string from enum */
+ const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
+-
+-
+-
+-/*=**************************************************************
+-* Custom allocator
+-****************************************************************/
+-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+-{
+- if (customMem.customAlloc)
+- return customMem.customAlloc(customMem.opaque, size);
+- return ZSTD_malloc(size);
+-}
+-
+-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+-{
+- if (customMem.customAlloc) {
+- /* calloc implemented as malloc+memset;
+- * not as efficient as calloc, but next best guess for custom malloc */
+- void* const ptr = customMem.customAlloc(customMem.opaque, size);
+- ZSTD_memset(ptr, 0, size);
+- return ptr;
+- }
+- return ZSTD_calloc(1, size);
+-}
+-
+-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+-{
+- if (ptr!=NULL) {
+- if (customMem.customFree)
+- customMem.customFree(customMem.opaque, ptr);
+- else
+- ZSTD_free(ptr);
+- }
+-}
+diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
+index 2c34e8a33a1c..f931f7d0e294 100644
+--- a/lib/zstd/common/zstd_deps.h
++++ b/lib/zstd/common/zstd_deps.h
+@@ -1,6 +1,6 @@
+ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) {
+
+ #endif /* ZSTD_DEPS_IO */
+ #endif /* ZSTD_DEPS_NEED_IO */
++
++/*
++ * Only requested when MSAN is enabled.
++ * Need:
++ * intptr_t
++ */
++#ifdef ZSTD_DEPS_NEED_STDINT
++#ifndef ZSTD_DEPS_STDINT
++#define ZSTD_DEPS_STDINT
++
++/* intptr_t already provided by ZSTD_DEPS_COMMON */
++
++#endif /* ZSTD_DEPS_STDINT */
++#endif /* ZSTD_DEPS_NEED_STDINT */
+diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
+index 93305d9b41bb..7f023e4d4774 100644
+--- a/lib/zstd/common/zstd_internal.h
++++ b/lib/zstd/common/zstd_internal.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -28,7 +29,6 @@
+ #include
+ #define FSE_STATIC_LINKING_ONLY
+ #include "fse.h"
+-#define HUF_STATIC_LINKING_ONLY
+ #include "huf.h"
+ #include <linux/xxhash.h> /* XXH_reset, update, digest */
+ #define ZSTD_TRACE 0
+@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+ #define ZSTD_FRAMECHECKSUMSIZE 4
+
+ #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */
++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */
++#define MIN_LITERALS_FOR_4_STREAMS 6
+
+-#define HufLog 12
+ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+ #define LONGNBSEQ 0x7F00
+@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
+ #define MINMATCH 3
+
+ #define Litbits 8
++#define LitHufLog 11
+ #define MaxLit ((1<= length) return;
+ op += 16;
+@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
+ COPY16(op, ip);
+ }
+ while (op < oend);
+-#endif
+ }
+ }
+
+@@ -289,11 +285,11 @@ typedef enum {
+ typedef struct {
+ seqDef* sequencesStart;
+ seqDef* sequences; /* ptr to end of sequences */
+- BYTE* litStart;
+- BYTE* lit; /* ptr to end of literals */
+- BYTE* llCode;
+- BYTE* mlCode;
+- BYTE* ofCode;
++ BYTE* litStart;
++ BYTE* lit; /* ptr to end of literals */
++ BYTE* llCode;
++ BYTE* mlCode;
++ BYTE* ofCode;
+ size_t maxNbSeq;
+ size_t maxNbLit;
+
+@@ -301,8 +297,8 @@ typedef struct {
+ * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
+ * the existing value of the litLength or matchLength by 0x10000.
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. 
+- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + + /* ZSTD_invalidateRepCodes() : +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..e46ca6621b48 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -26,6 +27,7 @@ + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 + #include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) ++{ ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. +- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) + { +- const HUF_CElt* ct = CTable + 1; ++ const HUF_CElt* const ct = CTable + 1; + assert(symbolValue <= HUF_SYMBOLVALUE_MAX); + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. 
+ * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. */ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. 
count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). ++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +475,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +494,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +503,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? 
count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). +@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + CTable[0] = maxNbBits; + } + +-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) ++size_t ++HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, ++ void* workSpace, size_t wkspSize) + { +- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); ++ HUF_buildCTable_wksp_tables* const wksp_tables = ++ (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); + nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, 
maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, 
HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1216,79 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t maxBits, hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart = 
(BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { +@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new huffman table */ +@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t 
dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): +@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index f620cafca633..c1c316e9e289 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +28,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +57,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. 
++ * @param params Validated zstd parameters. + */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return 
(size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + CCtxParams->deterministicRefPrefix = !!value; + return CCtxParams->deterministicRefPrefix; + ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ 
CCtxParams->enableMatchFinderFallback = value; ++ return CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; ++ + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + } +@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. 
*/ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. */ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't reset parameters only when not in 
init stage."); ++ "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); ++ ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } + return 0; +@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_cParamMode_e mode, ++ ZSTD_paramSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); +@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. 
So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace +@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 
3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize + : 0; +@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + } + } + +@@ -1637,6 +1833,19 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t + ZSTD_reset_matchState(ZSTD_matchState_t* ms, +@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp_clean_tables(ws); + } + ++ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; ++ } ++ { /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); ++ assert(cParams->hashLog >= rowLog); ++ ms->rowHashLog = cParams->hashLog - rowLog; ++ } ++ } ++ + /* opt parser space */ + if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { + DEBUGLOG(4, "reserving optimal parser space"); +@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + } + +- if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); +- } +- { /* Switch to 32-entry rows if searchLog is 5 (or more) */ +- U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +- assert(cParams->hashLog >= rowLog); +- ms->rowHashLog = cParams->hashLog - rowLog; +- } +- } +- + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if 
(params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); ++ buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize); + int resizeWorkspace; + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); +@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { ++ /* TODO: avoid memset? */ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (params->useSequenceProducer) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq; ++ zc->externalMatchCtx.seqBuffer = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. 
+ */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? */ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. ++ * See docs on "short cache" in zstd_compress_internal.h for context. 
*/ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2602,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2613,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE*
const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ zc->appliedParams.useSequenceProducer, ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ zc->appliedParams.useSequenceProducer, ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ ++ } else if (zc->appliedParams.useSequenceProducer) { ++ assert( ++ zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->externalMatchCtx.mFinder != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)( ++ zc->externalMatchCtx.mState, ++ zc->externalMatchCtx.seqBuffer, ++ zc->externalMatchCtx.seqBufferCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->externalMatchCtx.seqBuffer, ++ nbExternalSeqs, ++ zc->externalMatchCtx.seqBufferCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); +@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode + so we provide seqStoreSeqs[i].offset - 1 */ + ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, ++ seqStoreSeqs[i].offBase, + seqStoreSeqs[i].litLength == 0); + literalsRead += outSeqs[i].litLength; + } +@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + zc->seqCollector.seqIndex += seqStoreSeqSize; + } + ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ return (srcSize / ZSTD_MINMATCH_MIN) + 1; ++} ++ + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) + { +@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. 
+ * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? 
HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- 
fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move 
longLengthPos into the correct position if necessary */ +@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). ++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3481,45 +3930,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. 
+ */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). ++ * @return: number of splits made (which equals the size of the partition table - 1). 
+ */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. 
We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! 
ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. */ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ } ++ ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. 
*/ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } + } + ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); + break; + + case ZSTD_greedy: +@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER 
(6ULL) + + /*! ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! 
ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the 
local dict if present. */ +@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. So, we have to split the sequence */ +@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. 
+ * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
+ * This is only an issue for zstd <= v1.4.3 +@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, void* mState, ++ ZSTD_sequenceProducer_F* mFinder ++) { ++ if (mFinder != NULL) { ++ ZSTD_externalMatchCtx emctx; ++ emctx.mState = mState; ++ emctx.mFinder = mFinder; ++ emctx.seqBuffer = NULL; ++ emctx.seqBufferCapacity = 0; ++ zc->externalMatchCtx = emctx; ++ zc->requestedParams.useSequenceProducer = 1; ++ } else { ++ ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx)); ++ zc->requestedParams.useSequenceProducer = 0; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..899f5e2de8e9 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,6 +145,12 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +@@ -212,8 +221,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for re-use of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +239,18 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { +@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Indicates whether an external matchfinder has been referenced. ++ * Users can't set this externally. ++ * It is set internally in ZSTD_registerSequenceProducer(). 
*/ ++ int useSequenceProducer; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -355,6 +396,14 @@ typedef struct { + ZSTD_entropyCTablesMetadata_t entropyMetadata; + } ZSTD_blockSplitCtx; + ++/* Context for block-level external matchfinder API */ ++typedef struct { ++ void* mState; ++ ZSTD_sequenceProducer_F* mFinder; ++ ZSTD_Sequence* seqBuffer; ++ size_t seqBufferCapacity; ++} ZSTD_externalMatchCtx; ++ + struct ZSTD_CCtx_s { + ZSTD_compressionStage_e stage; + int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ +@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Workspace for external matchfinder */ ++ ZSTD_externalMatchCtx externalMatchCtx; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +495,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. 
+ * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. 
+ */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; +@@ -673,11 +728,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t 
ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. 
+ */ +@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. 
++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. ++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. ++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..3e9ea46a670a 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. ++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 
6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? +- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? 
HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. */ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..5c028c78d889 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..7fe6f4ff5cf2 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..dbacbaf72733 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + return op-ostart; + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeq, ++ size_t litSize, int lastSequence) ++{ + const seqDef* const sstart = sequences; + const seqDef* const send = sequences + nbSeq; + const seqDef* sp = sstart; +@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* + * @return : compressed size of sequences section of a sub-block + * Or 0 if it is unable to compress + * Or error code. 
*/ +-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- const seqDef* sequences, size_t nbSeq, +- const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ const seqDef* sequences, size_t nbSeq, ++ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..65ea53b62844 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. 
These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. + */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. 
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + } + ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. */ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). 
+ */ +@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
+- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..ab9440a99603 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,43 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + +@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes) ++ PREFETCH_AREA(dictHashSmall, chainTableBytes) ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; 
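
The hunk above is where the dictMatchState double-fast matchfinder starts consulting the CDict's "short cache" hash tables: it loads a packed entry, compares its low 8 tag bits against an independent tag computed at the current position, and only dereferences the dictionary candidate when the tags agree, avoiding most of the L2/L3 misses described in the short-cache comment earlier in this patch. Below is a minimal self-contained sketch of that packing scheme; the demo_* names are illustrative stand-ins for this note, not identifiers from zstd itself.

    #include <stdint.h>
    #include <stddef.h>

    #define DEMO_TAG_BITS 8u
    #define DEMO_TAG_MASK ((1u << DEMO_TAG_BITS) - 1u)

    /* Pack (index, tag) into one 32-bit hash-table slot, mirroring
     * ZSTD_writeTaggedIndex: the caller passes hashAndTag, where the
     * table slot is hashAndTag >> 8 and the tag is its low 8 bits. */
    static void demo_write_tagged_index(uint32_t *table, size_t hashAndTag, uint32_t index)
    {
        size_t const slot = hashAndTag >> DEMO_TAG_BITS;
        uint32_t const tag = (uint32_t)(hashAndTag & DEMO_TAG_MASK);
        table[slot] = (index << DEMO_TAG_BITS) | tag;
    }

    /* Cheap pre-check, mirroring ZSTD_comparePackedTags: only when the two
     * 8-bit tags agree does the matchfinder pay for loading the candidate
     * match at base + (entry >> DEMO_TAG_BITS). */
    static int demo_tags_match(size_t packedA, size_t packedB)
    {
        return (packedA & DEMO_TAG_MASK) == (packedB & DEMO_TAG_MASK);
    }
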
+@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..0204f12e4cf7 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..3399b39c5dbc 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,42 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. 
+ */ +@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic( + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. 
++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic( + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
*/ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes) ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); 
/* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return 
ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. 
*/ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..e64d9e1b2d39 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..f6b4978ceba7 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,6 +11,9 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#define kLazySkippingStep 8 + + + /*-************************************* +@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch ( + static size_t + ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode 
== ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +365,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + FORCE_INLINE_TEMPLATE size_t + ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. 
*/ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ +@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. 
+ */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. +- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. 
+- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. 
++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = 
vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; +@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & 
ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. ++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. 
*/ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1541,11 @@ 
ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. 
++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( + size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..9505bed93c03 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,6 +23,8 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + +@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row( + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ + + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..b7da76b0db7c 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_greedy: +@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..c540731abde7 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
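Editor's note on the rename running through the lazy and LDM hunks above: STORE_OFFSET/STORE_REPCODE become OFFSET_TO_OFFBASE/REPCODE_TO_OFFBASE. The macros themselves live in zstd_compress_internal.h and are not shown in this patch; the standalone sketch below illustrates the assumed "offBase" convention (repcodes occupy 1..ZSTD_REP_NUM, real offsets are shifted up by ZSTD_REP_NUM), using lower-case helper names that are purely illustrative.

/* Editorial sketch (not part of the patch): the "offBase" sumtype handed to
 * ZSTD_storeSeq() folds repcodes and real offsets into one integer.
 * Assumed convention, mirroring zstd_compress_internal.h:
 *   repcode r in 1..3       -> offBase = r
 *   real offset o (o > 0)   -> offBase = o + ZSTD_REP_NUM
 */
#include <assert.h>
#include <stdio.h>

#define ZSTD_REP_NUM 3

static unsigned offset_to_offbase(unsigned offset)   { assert(offset > 0); return offset + ZSTD_REP_NUM; }
static unsigned repcode_to_offbase(unsigned repcode) { assert(repcode >= 1 && repcode <= ZSTD_REP_NUM); return repcode; }
static int      offbase_is_offset(unsigned offbase)  { return offbase > ZSTD_REP_NUM; }
static unsigned offbase_to_offset(unsigned offbase)  { assert(offbase_is_offset(offbase)); return offbase - ZSTD_REP_NUM; }

int main(void)
{
    unsigned const offBaseRep1  = repcode_to_offbase(1);   /* analogous to REPCODE1_TO_OFFBASE */
    unsigned const offBaseMatch = offset_to_offbase(1024); /* analogous to OFFSET_TO_OFFBASE(1024) */
    printf("rep1 -> %u, offset 1024 -> %u, decoded back -> %u\n",
           offBaseRep1, offBaseMatch, offbase_to_offset(offBaseMatch));
    return 0;
}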
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..1e41cb04f482 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,7 @@ + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +27,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s<lastEltIndex+1; s++) { +- table[s] = 1 + (table[s] >> shift); +- sum += table[s]; ++ unsigned const base = base1 ?
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- 
assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. +- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows 
storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); + } + +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; 
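The pricing hunks above now feed the raw offBase straight into ZSTD_highbit32() to obtain the offset code, and express every price in 1/256ths of a bit (BITCOST_ACCURACY = 8). A minimal sketch of that arithmetic, re-implementing the two weight helpers from the zstd_opt.c hunk earlier in this patch with a portable highbit stand-in, so it is an illustration rather than the library code:

/* Editorial sketch: fractional bit-cost as used by the optimal parser.
 * BITCOST_ACCURACY = 8, so prices are expressed in 1/256th of a bit. */
#include <stdio.h>

#define BITCOST_ACCURACY   8
#define BITCOST_MULTIPLIER (1u << BITCOST_ACCURACY)

static unsigned highbit32(unsigned v)        /* portable stand-in for ZSTD_highbit32 */
{
    unsigned hb = 0;
    while (v >>= 1) hb++;
    return hb;
}

static unsigned bitWeight(unsigned stat)     /* cost rounded to whole bits */
{
    return highbit32(stat + 1) * BITCOST_MULTIPLIER;
}

static unsigned fracWeight(unsigned rawStat) /* cost with a linearly interpolated fractional part */
{
    unsigned const stat    = rawStat + 1;
    unsigned const hb      = highbit32(stat);
    unsigned const BWeight = hb * BITCOST_MULTIPLIER;
    unsigned const FWeight = (stat << BITCOST_ACCURACY) >> hb; /* value in [256, 512) */
    return BWeight + FWeight;
}

int main(void)
{
    /* For a frequency of 1000: hb = 9, BWeight = 2304, FWeight = 500 -> 2804 (~10.95 "bits") */
    printf("bitWeight(1000)  = %u\n", bitWeight(1000));
    printf("fracWeight(1000) = %u\n", fracWeight(1000));
    return 0;
}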
+ bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + ZSTD_optimal_t lastSequence; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? 
*ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + /* large match -> immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { + lastSequence.litlen = litlen; + lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; ++ lastSequence.off = maxOffBase; + DEBUGLOG(6, "large match (%u>%u), immediate encoding", + maxML, sufficient_len); + cur = 0; +@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); ++ U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); + U32 const sequencePrice = literalsPrice + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", +- pos, ZSTD_fCost(sequencePrice)); ++ pos, ZSTD_fCost((int)sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; ++ opt[pos].off = offBase; + opt[pos].litlen = litlen; + opt[pos].price = (int)sequencePrice; + } } +@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ +@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } +@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt( + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. 
++ * this function cannot error out, its narrow contract must be respected. + */ + static void + ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), +- * the cost is 2x cpu time on first block. */ ++ ** the cost is 2x cpu time on first block. */ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..faa73ff4b03d 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..db670d71fdab 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
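The ZSTD_compressBlock_btultra2() hunk above only pays for the statistics-seeding pass when the block is genuinely the first one and large enough to benefit. Below is a condensed sketch of that gate, with the four conditions copied from the hunk; the struct and helper names are hypothetical.

/* Editorial sketch of the btultra2 first-pass gate (names are hypothetical).
 * All conditions come from the ZSTD_compressBlock_btultra2() hunk above. */
#include <stddef.h>

#define ZSTD_PREDEF_THRESHOLD 8   /* lowered from 1024 by this patch */

struct btultra2_state {           /* minimal stand-in for the fields the gate reads */
    unsigned litLengthSum;        /* ms->opt.litLengthSum */
    int      seqStoreEmpty;       /* seqStore->sequences == seqStore->sequencesStart */
    unsigned dictLimit, lowLimit; /* ms->window.dictLimit / lowLimit */
    unsigned curr;                /* (U32)(src - ms->window.base) */
};

static int should_run_seeding_pass(const struct btultra2_state* s, size_t srcSize)
{
    return (s->litLengthSum == 0)             /* first block: no stats collected yet   */
        && (s->seqStoreEmpty)                 /* no LDM sequences queued               */
        && (s->dictLimit == s->lowLimit)      /* no dictionary loaded                  */
        && (s->curr == s->dictLimit)          /* start of frame, nothing skipped       */
        && (srcSize > ZSTD_PREDEF_THRESHOLD); /* large enough to beat predefined stats */
}

int main(void)
{
    struct btultra2_state const firstBlock = { 0, 1, 0, 0, 0 };
    return should_run_seeding_pass(&firstBlock, 131072) ? 0 : 1;
}

When the gate passes, ZSTD_initStats_ultra() compresses the block once into a throw-away sequence store, keeps only the entropy statistics, and rewinds the window so the real pass starts clean, which is the ~2x CPU cost the comment mentions.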
+ * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. ++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,15 +144,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const 
bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilimit [in] - The input limit, stop when any input pointer is below ilimit. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilimit, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; +@@ -151,15 +174,17 @@ typedef struct { + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; +@@ -168,9 +193,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + BYTE* const oend = (BYTE*)dst + dstSize; + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,7 +208,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +@@ -195,13 +222,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >= 16 so that ip[0] >= ilimit before the loop + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. 
*/ +@@ -218,7 +245,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,10 +254,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); + + /* If ip[] >= ilimit, it is guaranteed to be safe to + * reload bits[]. It may be beyond its section, but is +@@ -241,10 +268,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +285,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); + bit->start = (const char*)args->iend[0]; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ { \ ++ X(0) \ ++ X(1) \ ++ X(2) \ ++ X(3) \ ++ } ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. 
*/ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ { \ ++ X(0, (var)) \ ++ X(1, (var)) \ ++ X(2, (var)) \ ++ X(3, (var)) \ ++ } + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +328,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +375,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +390,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +417,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +445,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
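HUF_DEltX1_set4() in the hunk above builds one 16-bit single-symbol entry (nbBits in the low byte of the value on little-endian hosts, the decoded byte above it) and replicates it into all four 16-bit lanes by multiplying with 0x0001000100010001. A self-contained sketch of that trick, assuming a 64-bit little-endian host as the fast paths in this file do:

/* Editorial sketch: packing 4 copies of a 16-bit X1 decode entry into a U64,
 * as HUF_DEltX1_set4() does on little-endian hosts. */
#include <stdint.h>
#include <stdio.h>

static uint64_t delt_x1_set4_le(uint8_t symbol, uint8_t nbBits)
{
    uint64_t d4 = (uint64_t)((symbol << 8) + nbBits); /* one 16-bit entry: byte | nbBits */
    d4 *= 0x0001000100010001ULL;                      /* replicate into all four lanes */
    return d4;
}

int main(void)
{
    /* symbol 'A' (0x41) coded on 5 bits -> each 16-bit lane holds 0x4105 */
    printf("%016llx\n", (unsigned long long)delt_x1_set4_le(0x41, 5));
    return 0;
}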
+ */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -519,7 +557,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -545,6 +583,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -588,6 +630,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,38 +693,156 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilimit = args->ilimit; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilimit); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilimit) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we get close to the end. 
*/ ++ if (op[3] + 20 > olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4) ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM) ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } ++ ++_out: + +-static HUF_ASM_X86_64_BMI2_ATTRS ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + + assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ loopFn(&args); + + /* Our loop guarantees that ip[] >= ilimit and that we haven't + * overwritten any op[]. +@@ -694,8 +855,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + (void)iend; + + /* finish bit streams one by one. 
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +872,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, 
++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1107,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1162,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1184,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1240,6 +1355,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1280,8 +1400,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,36 +1487,178 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilimit = args->ilimit; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilimit); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilimit) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. 
++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop if we are too close to the end. */ ++ if (op[3] + 10 > olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1) \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1) ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM) ++ } while (op[3] < olimit); ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + + assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= iend); +@@ -1426,91 +1689,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, 
size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1762,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1816,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1831,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1905,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..03dbdf39109f 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -52,17 +53,18 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ + #include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + +@@ -72,11 +74,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these 
bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { + unsigned long long totalDstSize = 0; +@@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (totalDstSize + fcs < totalDstSize) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip 
to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } +@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); +@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. 
+ * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; +@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. 
*/ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1664,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1675,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1747,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1792,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1828,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1918,7 +2007,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2020,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1949,8 +2042,9 @@ 
size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -2034,6 +2128,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2143,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2152,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2166,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? 
op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2190,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2203,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2241,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..9f5577e5bc19 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; + } + else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } +@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) +@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } +@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread 
those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; +@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; ustateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. 
++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + (void)frame; + + /* Regen sequences */ +@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. 
++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. ++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, const int frame, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); + +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX. ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. 
++ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ +@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..5888e6cc788b 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. 
*/ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..32f79fb2873d 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; + + /* streaming */ + ZSTD_dStreamStage streamStage; +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index 04e1b5c01d9b..8ecf43226af2 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index f4ed952ed485..7d31518e9d5a 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.46.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.6/0010-sched-ext.patch b/sys-kernel/gentoo-sources-6.6/0010-sched-ext.patch deleted file mode 100644 index 4094dce..0000000 --- a/sys-kernel/gentoo-sources-6.6/0010-sched-ext.patch +++ /dev/null @@ -1,19747 +0,0 @@ -From 1d5eefab83823197c1de81da58ba61bef161635b Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Thu, 7 Dec 2023 20:43:19 +0100 -Subject: [PATCH] sched-ext - -Signed-off-by: Piotr Gorski ---- - Documentation/scheduler/index.rst | 1 + - Documentation/scheduler/sched-ext.rst | 229 + - MAINTAINERS | 3 + - Makefile | 8 +- - drivers/tty/sysrq.c | 1 + - include/asm-generic/vmlinux.lds.h | 1 + - include/linux/cgroup-defs.h | 8 + - include/linux/cgroup.h | 5 +- - include/linux/sched.h | 5 + - include/linux/sched/ext.h | 716 +++ - include/linux/sched/task.h | 3 +- - include/uapi/linux/sched.h | 1 + - init/Kconfig | 5 + - init/init_task.c | 12 + - kernel/Kconfig.preempt | 24 +- - kernel/bpf/bpf_struct_ops_types.h | 4 + - kernel/cgroup/cgroup.c | 97 +- - kernel/fork.c | 17 +- - kernel/sched/build_policy.c | 5 + - kernel/sched/core.c | 316 +- - kernel/sched/deadline.c | 4 +- - kernel/sched/debug.c | 6 + - kernel/sched/ext.c | 4497 +++++++++++++++++ - kernel/sched/ext.h | 266 + - kernel/sched/fair.c | 9 +- - kernel/sched/idle.c | 2 + - kernel/sched/rt.c | 4 +- - kernel/sched/sched.h | 117 +- - kernel/sched/topology.c | 4 +- - lib/dump_stack.c | 1 + - tools/Makefile | 10 +- - tools/sched_ext/.gitignore | 10 + - tools/sched_ext/Kconfig | 9 + - tools/sched_ext/Makefile | 301 ++ - tools/sched_ext/README.md | 403 ++ - tools/sched_ext/gnu/stubs.h | 1 + - tools/sched_ext/ravg.bpf.h | 42 + - tools/sched_ext/ravg_impl.bpf.h | 358 ++ - tools/sched_ext/ravg_read.rs.h | 82 + - tools/sched_ext/scx_central.bpf.c | 346 ++ - tools/sched_ext/scx_central.c | 123 + - tools/sched_ext/scx_common.bpf.h | 244 + - 
tools/sched_ext/scx_common.h | 59 + - tools/sched_ext/scx_flatcg.bpf.c | 912 ++++ - tools/sched_ext/scx_flatcg.c | 221 + - tools/sched_ext/scx_flatcg.h | 49 + - tools/sched_ext/scx_layered/.gitignore | 3 + - tools/sched_ext/scx_layered/Cargo.toml | 30 + - tools/sched_ext/scx_layered/build.rs | 77 + - tools/sched_ext/scx_layered/rustfmt.toml | 8 + - .../scx_layered/src/bpf/layered.bpf.c | 974 ++++ - tools/sched_ext/scx_layered/src/bpf/layered.h | 100 + - .../sched_ext/scx_layered/src/bpf/util.bpf.c | 68 + - .../sched_ext/scx_layered/src/layered_sys.rs | 10 + - tools/sched_ext/scx_layered/src/main.rs | 1641 ++++++ - tools/sched_ext/scx_nest.bpf.c | 681 +++ - tools/sched_ext/scx_nest.c | 227 + - tools/sched_ext/scx_nest.h | 18 + - tools/sched_ext/scx_nest_stats_table.h | 19 + - tools/sched_ext/scx_pair.bpf.c | 626 +++ - tools/sched_ext/scx_pair.c | 168 + - tools/sched_ext/scx_pair.h | 9 + - tools/sched_ext/scx_qmap.bpf.c | 401 ++ - tools/sched_ext/scx_qmap.c | 105 + - tools/sched_ext/scx_rusty/.gitignore | 3 + - tools/sched_ext/scx_rusty/Cargo.toml | 28 + - tools/sched_ext/scx_rusty/build.rs | 72 + - tools/sched_ext/scx_rusty/rustfmt.toml | 8 + - tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 1153 +++++ - tools/sched_ext/scx_rusty/src/bpf/rusty.h | 97 + - tools/sched_ext/scx_rusty/src/main.rs | 1265 +++++ - tools/sched_ext/scx_rusty/src/rusty_sys.rs | 10 + - tools/sched_ext/scx_simple.bpf.c | 143 + - tools/sched_ext/scx_simple.c | 99 + - tools/sched_ext/scx_userland.bpf.c | 262 + - tools/sched_ext/scx_userland.c | 366 ++ - tools/sched_ext/scx_userland.h | 19 + - tools/sched_ext/user_exit_info.h | 50 + - 78 files changed, 18176 insertions(+), 105 deletions(-) - create mode 100644 Documentation/scheduler/sched-ext.rst - create mode 100644 include/linux/sched/ext.h - create mode 100644 kernel/sched/ext.c - create mode 100644 kernel/sched/ext.h - create mode 100644 tools/sched_ext/.gitignore - create mode 100644 tools/sched_ext/Kconfig - create mode 100644 tools/sched_ext/Makefile - create mode 100644 tools/sched_ext/README.md - create mode 100644 tools/sched_ext/gnu/stubs.h - create mode 100644 tools/sched_ext/ravg.bpf.h - create mode 100644 tools/sched_ext/ravg_impl.bpf.h - create mode 100644 tools/sched_ext/ravg_read.rs.h - create mode 100644 tools/sched_ext/scx_central.bpf.c - create mode 100644 tools/sched_ext/scx_central.c - create mode 100644 tools/sched_ext/scx_common.bpf.h - create mode 100644 tools/sched_ext/scx_common.h - create mode 100644 tools/sched_ext/scx_flatcg.bpf.c - create mode 100644 tools/sched_ext/scx_flatcg.c - create mode 100644 tools/sched_ext/scx_flatcg.h - create mode 100644 tools/sched_ext/scx_layered/.gitignore - create mode 100644 tools/sched_ext/scx_layered/Cargo.toml - create mode 100644 tools/sched_ext/scx_layered/build.rs - create mode 100644 tools/sched_ext/scx_layered/rustfmt.toml - create mode 100644 tools/sched_ext/scx_layered/src/bpf/layered.bpf.c - create mode 100644 tools/sched_ext/scx_layered/src/bpf/layered.h - create mode 100644 tools/sched_ext/scx_layered/src/bpf/util.bpf.c - create mode 100644 tools/sched_ext/scx_layered/src/layered_sys.rs - create mode 100644 tools/sched_ext/scx_layered/src/main.rs - create mode 100644 tools/sched_ext/scx_nest.bpf.c - create mode 100644 tools/sched_ext/scx_nest.c - create mode 100644 tools/sched_ext/scx_nest.h - create mode 100644 tools/sched_ext/scx_nest_stats_table.h - create mode 100644 tools/sched_ext/scx_pair.bpf.c - create mode 100644 tools/sched_ext/scx_pair.c - create mode 100644 tools/sched_ext/scx_pair.h 
- create mode 100644 tools/sched_ext/scx_qmap.bpf.c - create mode 100644 tools/sched_ext/scx_qmap.c - create mode 100644 tools/sched_ext/scx_rusty/.gitignore - create mode 100644 tools/sched_ext/scx_rusty/Cargo.toml - create mode 100644 tools/sched_ext/scx_rusty/build.rs - create mode 100644 tools/sched_ext/scx_rusty/rustfmt.toml - create mode 100644 tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c - create mode 100644 tools/sched_ext/scx_rusty/src/bpf/rusty.h - create mode 100644 tools/sched_ext/scx_rusty/src/main.rs - create mode 100644 tools/sched_ext/scx_rusty/src/rusty_sys.rs - create mode 100644 tools/sched_ext/scx_simple.bpf.c - create mode 100644 tools/sched_ext/scx_simple.c - create mode 100644 tools/sched_ext/scx_userland.bpf.c - create mode 100644 tools/sched_ext/scx_userland.c - create mode 100644 tools/sched_ext/scx_userland.h - create mode 100644 tools/sched_ext/user_exit_info.h - -diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst -index 317074722..0b650bb55 100644 ---- a/Documentation/scheduler/index.rst -+++ b/Documentation/scheduler/index.rst -@@ -19,6 +19,7 @@ Scheduler - sched-nice-design - sched-rt-group - sched-stats -+ sched-ext - sched-debug - - text_files -diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst -new file mode 100644 -index 000000000..25ddb535c ---- /dev/null -+++ b/Documentation/scheduler/sched-ext.rst -@@ -0,0 +1,229 @@ -+========================== -+Extensible Scheduler Class -+========================== -+ -+sched_ext is a scheduler class whose behavior can be defined by a set of BPF -+programs - the BPF scheduler. -+ -+* sched_ext exports a full scheduling interface so that any scheduling -+ algorithm can be implemented on top. -+ -+* The BPF scheduler can group CPUs however it sees fit and schedule them -+ together, as tasks aren't tied to specific CPUs at the time of wakeup. -+ -+* The BPF scheduler can be turned on and off dynamically anytime. -+ -+* The system integrity is maintained no matter what the BPF scheduler does. -+ The default scheduling behavior is restored anytime an error is detected, -+ a runnable task stalls, or on invoking the SysRq key sequence -+ :kbd:`SysRq-S`. -+ -+Switching to and from sched_ext -+=============================== -+ -+``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and -+``tools/sched_ext`` contains the example schedulers. -+ -+sched_ext is used only when the BPF scheduler is loaded and running. -+ -+If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be -+treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is -+loaded. On load, such tasks will be switched to and scheduled by sched_ext. -+ -+The BPF scheduler can choose to schedule all normal and lower class tasks by -+calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this -+case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and -+``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers, -+this mode can be selected with the ``-a`` option. -+ -+Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or -+detection of any internal error including stalled runnable tasks aborts the -+BPF scheduler and reverts all tasks back to CFS. -+ -+.. 
code-block:: none -+ -+ # make -j16 -C tools/sched_ext -+ # tools/sched_ext/scx_simple -+ local=0 global=3 -+ local=5 global=24 -+ local=9 global=44 -+ local=13 global=56 -+ local=17 global=72 -+ ^CEXIT: BPF scheduler unregistered -+ -+If ``CONFIG_SCHED_DEBUG`` is set, the current status of the BPF scheduler -+and whether a given task is on sched_ext can be determined as follows: -+ -+.. code-block:: none -+ -+ # cat /sys/kernel/debug/sched/ext -+ ops : simple -+ enabled : 1 -+ switching_all : 1 -+ switched_all : 1 -+ enable_state : enabled -+ -+ # grep ext /proc/self/sched -+ ext.enabled : 1 -+ -+The Basics -+========== -+ -+Userspace can implement an arbitrary BPF scheduler by loading a set of BPF -+programs that implement ``struct sched_ext_ops``. The only mandatory field -+is ``ops.name`` which must be a valid BPF object name. All operations are -+optional. The following modified excerpt is from -+``tools/sched/scx_simple.bpf.c`` showing a minimal global FIFO scheduler. -+ -+.. code-block:: c -+ -+ s32 BPF_STRUCT_OPS(simple_init) -+ { -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+ } -+ -+ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+ { -+ if (enq_flags & SCX_ENQ_LOCAL) -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ else -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ -+ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+ { -+ exit_type = ei->type; -+ } -+ -+ SEC(".struct_ops") -+ struct sched_ext_ops simple_ops = { -+ .enqueue = (void *)simple_enqueue, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple", -+ }; -+ -+Dispatch Queues -+--------------- -+ -+To match the impedance between the scheduler core and the BPF scheduler, -+sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a -+priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), -+and one local dsq per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage -+an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and -+``scx_bpf_destroy_dsq()``. -+ -+A CPU always executes a task from its local DSQ. A task is "dispatched" to a -+DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's -+local DSQ. -+ -+When a CPU is looking for the next task to run, if the local DSQ is not -+empty, the first task is picked. Otherwise, the CPU tries to consume the -+global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` -+is invoked. -+ -+Scheduling Cycle -+---------------- -+ -+The following briefly shows how a waking task is scheduled and executed. -+ -+1. When a task is waking up, ``ops.select_cpu()`` is the first operation -+ invoked. This serves two purposes. First, CPU selection optimization -+ hint. Second, waking up the selected CPU if idle. -+ -+ The CPU selected by ``ops.select_cpu()`` is an optimization hint and not -+ binding. The actual decision is made at the last step of scheduling. -+ However, there is a small performance gain if the CPU -+ ``ops.select_cpu()`` returns matches the CPU the task eventually runs on. -+ -+ A side-effect of selecting a CPU is waking it up from idle. While a BPF -+ scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, -+ using ``ops.select_cpu()`` judiciously can be simpler and more efficient. -+ -+ Note that the scheduler core will ignore an invalid CPU selection, for -+ example, if it's outside the allowed cpumask of the task. -+ -+2. 
Once the target CPU is selected, ``ops.enqueue()`` is invoked. It can -+ make one of the following decisions: -+ -+ * Immediately dispatch the task to either the global or local DSQ by -+ calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or -+ ``SCX_DSQ_LOCAL``, respectively. -+ -+ * Immediately dispatch the task to a custom DSQ by calling -+ ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. -+ -+ * Queue the task on the BPF side. -+ -+3. When a CPU is ready to schedule, it first looks at its local DSQ. If -+ empty, it then looks at the global DSQ. If there still isn't a task to -+ run, ``ops.dispatch()`` is invoked which can use the following two -+ functions to populate the local DSQ. -+ -+ * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can -+ be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, -+ ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` -+ currently can't be called with BPF locks held, this is being worked on -+ and will be supported. ``scx_bpf_dispatch()`` schedules dispatching -+ rather than performing them immediately. There can be up to -+ ``ops.dispatch_max_batch`` pending tasks. -+ -+ * ``scx_bpf_consume()`` tranfers a task from the specified non-local DSQ -+ to the dispatching DSQ. This function cannot be called with any BPF -+ locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks -+ before trying to consume the specified DSQ. -+ -+4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, -+ the CPU runs the first one. If empty, the following steps are taken: -+ -+ * Try to consume the global DSQ. If successful, run the task. -+ -+ * If ``ops.dispatch()`` has dispatched any tasks, retry #3. -+ -+ * If the previous task is an SCX task and still runnable, keep executing -+ it (see ``SCX_OPS_ENQ_LAST``). -+ -+ * Go idle. -+ -+Note that the BPF scheduler can always choose to dispatch tasks immediately -+in ``ops.enqueue()`` as illustrated in the above simple example. If only the -+built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as -+a task is never queued on the BPF scheduler and both the local and global -+DSQs are consumed automatically. -+ -+``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use -+``scx_bpf_dispatch_vtime()`` for the priority queue. See the function -+documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for more -+information. -+ -+Where to Look -+============= -+ -+* ``include/linux/sched/ext.h`` defines the core data structures, ops table -+ and constants. -+ -+* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. -+ The functions prefixed with ``scx_bpf_`` can be called from the BPF -+ scheduler. -+ -+* ``tools/sched_ext/`` hosts example BPF scheduler implementations. -+ -+ * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a -+ custom DSQ. -+ -+ * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five -+ levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. -+ -+ABI Instability -+=============== -+ -+The APIs provided by sched_ext to BPF schedulers programs have no stability -+guarantees. This includes the ops table callbacks and constants defined in -+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in -+``kernel/sched/ext.c``. -+ -+While we will attempt to provide a relatively stable API surface when -+possible, they are subject to change without warning between kernel -+versions. 
-diff --git a/MAINTAINERS b/MAINTAINERS -index dd5de540e..286abb83c 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -19080,6 +19080,8 @@ R: Ben Segall (CONFIG_CFS_BANDWIDTH) - R: Mel Gorman (CONFIG_NUMA_BALANCING) - R: Daniel Bristot de Oliveira (SCHED_DEADLINE) - R: Valentin Schneider (TOPOLOGY) -+R: Tejun Heo (SCHED_EXT) -+R: David Vernet (SCHED_EXT) - L: linux-kernel@vger.kernel.org - S: Maintained - T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core -@@ -19088,6 +19090,7 @@ F: include/linux/sched.h - F: include/linux/wait.h - F: include/uapi/linux/sched.h - F: kernel/sched/ -+F: tools/sched_ext/ - - SCSI LIBSAS SUBSYSTEM - R: John Garry -diff --git a/Makefile b/Makefile -index cbe63ba91..8f2fc39a0 100644 ---- a/Makefile -+++ b/Makefile -@@ -1341,6 +1341,12 @@ ifneq ($(wildcard $(resolve_btfids_O)),) - $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean - endif - -+tools-clean-targets := sched_ext -+PHONY += $(tools-clean-targets) -+$(tools-clean-targets): -+ $(Q)$(MAKE) -sC tools $@_clean -+tools_clean: $(tools-clean-targets) -+ - # Clear a bunch of variables before executing the submake - ifeq ($(quiet),silent_) - tools_silent=s -@@ -1510,7 +1516,7 @@ PHONY += $(mrproper-dirs) mrproper - $(mrproper-dirs): - $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) - --mrproper: clean $(mrproper-dirs) -+mrproper: clean $(mrproper-dirs) tools_clean - $(call cmd,rmfiles) - @find . $(RCS_FIND_IGNORE) \ - \( -name '*.rmeta' \) \ -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index 6b4a28bcf..6ec15c131 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -520,6 +520,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { - NULL, /* P */ - NULL, /* Q */ - NULL, /* R */ -+ /* S: May be registered by sched_ext for resetting */ - NULL, /* S */ - NULL, /* T */ - NULL, /* U */ -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 67d8dd2f1..575322902 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -131,6 +131,7 @@ - *(__dl_sched_class) \ - *(__rt_sched_class) \ - *(__fair_sched_class) \ -+ *(__ext_sched_class) \ - *(__idle_sched_class) \ - __sched_class_lowest = .; - -diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h -index 265da00a1..6194d7c13 100644 ---- a/include/linux/cgroup-defs.h -+++ b/include/linux/cgroup-defs.h -@@ -127,12 +127,18 @@ enum { - CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ - CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ - -+ CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftypes() */ -+ - /* internal flags, do not use outside cgroup core proper */ - __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ - __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ - __CFTYPE_ADDED = (1 << 18), - }; - -+enum cfile_flags { -+ CFILE_HIDDEN = (1 << 0), /* file instance hidden */ -+}; -+ - /* - * cgroup_file is the handle for a file instance created in a cgroup which - * is used, for example, to generate file changed notifications. 
This can -@@ -140,7 +146,9 @@ enum { - */ - struct cgroup_file { - /* do not access any fields from outside cgroup core */ -+ struct cftype *cft; - struct kernfs_node *kn; -+ unsigned int flags; - unsigned long notified_at; - struct timer_list notify_timer; - }; -diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index b307013b9..08b54094b 100644 ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -29,8 +29,6 @@ - - struct kernel_clone_args; - --#ifdef CONFIG_CGROUPS -- - /* - * All weight knobs on the default hierarchy should use the following min, - * default and max values. The default value is the logarithmic center of -@@ -40,6 +38,8 @@ struct kernel_clone_args; - #define CGROUP_WEIGHT_DFL 100 - #define CGROUP_WEIGHT_MAX 10000 - -+#ifdef CONFIG_CGROUPS -+ - /* walk only threadgroup leaders */ - #define CSS_TASK_ITER_PROCS (1U << 0) - /* walk all threaded css_sets in the domain */ -@@ -115,6 +115,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); - int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); - int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); - int cgroup_rm_cftypes(struct cftype *cfts); -+void cgroup_show_cftype(struct cftype *cft, bool show); - void cgroup_file_notify(struct cgroup_file *cfile); - void cgroup_file_show(struct cgroup_file *cfile, bool show); - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 77f01ac38..f81ff964c 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -71,6 +71,8 @@ struct task_delay_info; - struct task_group; - struct user_event_mm; - -+#include -+ - /* - * Task state bitmask. NOTE! These bits are also - * encoded in fs/proc/array.c: get_task_state(). -@@ -794,6 +796,9 @@ struct task_struct { - struct sched_entity se; - struct sched_rt_entity rt; - struct sched_dl_entity dl; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct sched_ext_entity scx; -+#endif - const struct sched_class *sched_class; - - #ifdef CONFIG_SCHED_CORE -diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h -new file mode 100644 -index 000000000..b20a7620b ---- /dev/null -+++ b/include/linux/sched/ext.h -@@ -0,0 +1,716 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef _LINUX_SCHED_EXT_H -+#define _LINUX_SCHED_EXT_H -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+#include -+#include -+ -+struct cgroup; -+ -+enum scx_consts { -+ SCX_OPS_NAME_LEN = 128, -+ SCX_EXIT_REASON_LEN = 128, -+ SCX_EXIT_BT_LEN = 64, -+ SCX_EXIT_MSG_LEN = 1024, -+ -+ SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, -+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ -+}; -+ -+/* -+ * DSQ (dispatch queue) IDs are 64bit of the format: -+ * -+ * Bits: [63] [62 .. 0] -+ * [ B] [ ID ] -+ * -+ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs -+ * ID: 63 bit ID -+ * -+ * Built-in IDs: -+ * -+ * Bits: [63] [62] [61..32] [31 .. 0] -+ * [ 1] [ L] [ R ] [ V ] -+ * -+ * 1: 1 for built-in DSQs. -+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others -+ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. 
-+ */ -+enum scx_dsq_id_flags { -+ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, -+ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, -+ -+ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, -+ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, -+ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, -+ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, -+ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, -+}; -+ -+enum scx_exit_kind { -+ SCX_EXIT_NONE, -+ SCX_EXIT_DONE, -+ -+ SCX_EXIT_UNREG = 64, /* BPF unregistration */ -+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ -+ -+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ -+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ -+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -+}; -+ -+/* -+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is -+ * being disabled. -+ */ -+struct scx_exit_info { -+ /* %SCX_EXIT_* - broad category of the exit reason */ -+ enum scx_exit_kind kind; -+ /* textual representation of the above */ -+ char reason[SCX_EXIT_REASON_LEN]; -+ /* number of entries in the backtrace */ -+ u32 bt_len; -+ /* backtrace if exiting due to an error */ -+ unsigned long bt[SCX_EXIT_BT_LEN]; -+ /* extra message */ -+ char msg[SCX_EXIT_MSG_LEN]; -+}; -+ -+/* sched_ext_ops.flags */ -+enum scx_ops_flags { -+ /* -+ * Keep built-in idle tracking even if ops.update_idle() is implemented. -+ */ -+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, -+ -+ /* -+ * By default, if there are no other task to run on the CPU, ext core -+ * keeps running the current task even after its slice expires. If this -+ * flag is specified, such tasks are passed to ops.enqueue() with -+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. -+ */ -+ SCX_OPS_ENQ_LAST = 1LLU << 1, -+ -+ /* -+ * An exiting task may schedule after PF_EXITING is set. In such cases, -+ * bpf_task_from_pid() may not be able to find the task and if the BPF -+ * scheduler depends on pid lookup for dispatching, the task will be -+ * lost leading to various issues including RCU grace period stalls. -+ * -+ * To mask this problem, by default, unhashed tasks are automatically -+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't -+ * depend on pid lookups and wants to handle these tasks directly, the -+ * following flag can be used. -+ */ -+ SCX_OPS_ENQ_EXITING = 1LLU << 2, -+ -+ /* -+ * CPU cgroup knob enable flags -+ */ -+ SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */ -+ -+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | -+ SCX_OPS_ENQ_LAST | -+ SCX_OPS_ENQ_EXITING | -+ SCX_OPS_CGROUP_KNOB_WEIGHT, -+}; -+ -+/* argument container for ops.enable() and friends */ -+struct scx_enable_args { -+#ifdef CONFIG_EXT_GROUP_SCHED -+ /* the cgroup the task is joining */ -+ struct cgroup *cgroup; -+#endif -+}; -+ -+/* argument container for ops->cgroup_init() */ -+struct scx_cgroup_init_args { -+ /* the weight of the cgroup [1..10000] */ -+ u32 weight; -+}; -+ -+enum scx_cpu_preempt_reason { -+ /* next task is being scheduled by &sched_class_rt */ -+ SCX_CPU_PREEMPT_RT, -+ /* next task is being scheduled by &sched_class_dl */ -+ SCX_CPU_PREEMPT_DL, -+ /* next task is being scheduled by &sched_class_stop */ -+ SCX_CPU_PREEMPT_STOP, -+ /* unknown reason for SCX being preempted */ -+ SCX_CPU_PREEMPT_UNKNOWN, -+}; -+ -+/* -+ * Argument container for ops->cpu_acquire(). Currently empty, but may be -+ * expanded in the future. 
-+ */ -+struct scx_cpu_acquire_args {}; -+ -+/* argument container for ops->cpu_release() */ -+struct scx_cpu_release_args { -+ /* the reason the CPU was preempted */ -+ enum scx_cpu_preempt_reason reason; -+ -+ /* the task that's going to be scheduled on the CPU */ -+ struct task_struct *task; -+}; -+ -+/** -+ * struct sched_ext_ops - Operation table for BPF scheduler implementation -+ * -+ * Userland can implement an arbitrary scheduling policy by implementing and -+ * loading operations in this table. -+ */ -+struct sched_ext_ops { -+ /** -+ * select_cpu - Pick the target CPU for a task which is being woken up -+ * @p: task being woken up -+ * @prev_cpu: the cpu @p was on before sleeping -+ * @wake_flags: SCX_WAKE_* -+ * -+ * Decision made here isn't final. @p may be moved to any CPU while it -+ * is getting dispatched for execution later. However, as @p is not on -+ * the rq at this point, getting the eventual execution CPU right here -+ * saves a small bit of overhead down the line. -+ * -+ * If an idle CPU is returned, the CPU is kicked and will try to -+ * dispatch. While an explicit custom mechanism can be added, -+ * select_cpu() serves as the default way to wake up idle CPUs. -+ */ -+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); -+ -+ /** -+ * enqueue - Enqueue a task on the BPF scheduler -+ * @p: task being enqueued -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() -+ * or enqueue on the BPF scheduler. If not directly dispatched, the bpf -+ * scheduler owns @p and if it fails to dispatch @p, the task will -+ * stall. -+ */ -+ void (*enqueue)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * dequeue - Remove a task from the BPF scheduler -+ * @p: task being dequeued -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * Remove @p from the BPF scheduler. This is usually called to isolate -+ * the task while updating its scheduling properties (e.g. priority). -+ * -+ * The ext core keeps track of whether the BPF side owns a given task or -+ * not and can gracefully ignore spurious dispatches from BPF side, -+ * which makes it safe to not implement this method. However, depending -+ * on the scheduling logic, this can lead to confusing behaviors - e.g. -+ * scheduling position not being updated across a priority change. -+ */ -+ void (*dequeue)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs -+ * @cpu: CPU to dispatch tasks for -+ * @prev: previous task being switched out -+ * -+ * Called when a CPU's local dsq is empty. The operation should dispatch -+ * one or more tasks from the BPF scheduler into the DSQs using -+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using -+ * scx_bpf_consume(). -+ * -+ * The maximum number of times scx_bpf_dispatch() can be called without -+ * an intervening scx_bpf_consume() is specified by -+ * ops.dispatch_max_batch. See the comments on top of the two functions -+ * for more details. -+ * -+ * When not %NULL, @prev is an SCX task with its slice depleted. If -+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in -+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after -+ * ops.dispatch() returns. To keep executing @prev, return without -+ * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. 
-+ */ -+ void (*dispatch)(s32 cpu, struct task_struct *prev); -+ -+ /** -+ * runnable - A task is becoming runnable on its associated CPU -+ * @p: task becoming runnable -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * This and the following three functions can be used to track a task's -+ * execution state transitions. A task becomes ->runnable() on a CPU, -+ * and then goes through one or more ->running() and ->stopping() pairs -+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's -+ * done running on the CPU. -+ * -+ * @p is becoming runnable on the CPU because it's -+ * -+ * - waking up (%SCX_ENQ_WAKEUP) -+ * - being moved from another CPU -+ * - being restored after temporarily taken off the queue for an -+ * attribute change. -+ * -+ * This and ->enqueue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be followed by ->enqueue() -+ * e.g. when @p is being dispatched to a remote CPU. Likewise, a task -+ * may be ->enqueue()'d without being preceded by this operation e.g. -+ * after exhausting its slice. -+ */ -+ void (*runnable)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * running - A task is starting to run on its associated CPU -+ * @p: task starting to run -+ * -+ * See ->runnable() for explanation on the task state notifiers. -+ */ -+ void (*running)(struct task_struct *p); -+ -+ /** -+ * stopping - A task is stopping execution -+ * @p: task stopping to run -+ * @runnable: is task @p still runnable? -+ * -+ * See ->runnable() for explanation on the task state notifiers. If -+ * !@runnable, ->quiescent() will be invoked after this operation -+ * returns. -+ */ -+ void (*stopping)(struct task_struct *p, bool runnable); -+ -+ /** -+ * quiescent - A task is becoming not runnable on its associated CPU -+ * @p: task becoming not runnable -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * See ->runnable() for explanation on the task state notifiers. -+ * -+ * @p is becoming quiescent on the CPU because it's -+ * -+ * - sleeping (%SCX_DEQ_SLEEP) -+ * - being moved to another CPU -+ * - being temporarily taken off the queue for an attribute change -+ * (%SCX_DEQ_SAVE) -+ * -+ * This and ->dequeue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be preceded by ->dequeue() -+ * e.g. when @p is being dispatched to a remote CPU. -+ */ -+ void (*quiescent)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * yield - Yield CPU -+ * @from: yielding task -+ * @to: optional yield target task -+ * -+ * If @to is NULL, @from is yielding the CPU to other runnable tasks. -+ * The BPF scheduler should ensure that other available tasks are -+ * dispatched before the yielding task. Return value is ignored in this -+ * case. -+ * -+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf -+ * scheduler can implement the request, return %true; otherwise, %false. -+ */ -+ bool (*yield)(struct task_struct *from, struct task_struct *to); -+ -+ /** -+ * core_sched_before - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Used by core-sched to determine the ordering between two tasks. See -+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on -+ * core-sched. -+ * -+ * Both @a and @b are runnable and may or may not currently be queued on -+ * the BPF scheduler. Should return %true if @a should run before @b. -+ * %false if there's no required ordering or @b should run before @a. 
-+ *
-+ * If not specified, the default is ordering them according to when they
-+ * became runnable.
-+ */
-+ bool (*core_sched_before)(struct task_struct *a,struct task_struct *b);
-+
-+ /**
-+ * set_weight - Set task weight
-+ * @p: task to set weight for
-+ * @weight: new weight [1..10000]
-+ *
-+ * Update @p's weight to @weight.
-+ */
-+ void (*set_weight)(struct task_struct *p, u32 weight);
-+
-+ /**
-+ * set_cpumask - Set CPU affinity
-+ * @p: task to set CPU affinity for
-+ * @cpumask: cpumask of cpus that @p can run on
-+ *
-+ * Update @p's CPU affinity to @cpumask.
-+ */
-+ void (*set_cpumask)(struct task_struct *p,
-+ const struct cpumask *cpumask);
-+
-+ /**
-+ * update_idle - Update the idle state of a CPU
-+ * @cpu: CPU to update the idle state for
-+ * @idle: whether entering or exiting the idle state
-+ *
-+ * This operation is called when @rq's CPU goes or leaves the idle
-+ * state. By default, implementing this operation disables the built-in
-+ * idle CPU tracking and the following helpers become unavailable:
-+ *
-+ * - scx_bpf_select_cpu_dfl()
-+ * - scx_bpf_test_and_clear_cpu_idle()
-+ * - scx_bpf_pick_idle_cpu()
-+ *
-+ * The user also must implement ops.select_cpu() as the default
-+ * implementation relies on scx_bpf_select_cpu_dfl().
-+ *
-+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
-+ * tracking.
-+ */
-+ void (*update_idle)(s32 cpu, bool idle);
-+
-+ /**
-+ * cpu_acquire - A CPU is becoming available to the BPF scheduler
-+ * @cpu: The CPU being acquired by the BPF scheduler.
-+ * @args: Acquire arguments, see the struct definition.
-+ *
-+ * A CPU that was previously released from the BPF scheduler is now once
-+ * again under its control.
-+ */
-+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-+
-+ /**
-+ * cpu_release - A CPU is taken away from the BPF scheduler
-+ * @cpu: The CPU being released by the BPF scheduler.
-+ * @args: Release arguments, see the struct definition.
-+ *
-+ * The specified CPU is no longer under the control of the BPF
-+ * scheduler. This could be because it was preempted by a higher
-+ * priority sched_class, though there may be other reasons as well. The
-+ * caller should consult @args->reason to determine the cause.
-+ */
-+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-+
-+ /**
-+ * cpu_online - A CPU became online
-+ * @cpu: CPU which just came up
-+ *
-+ * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks
-+ * associated with other CPUs beforehand.
-+ */
-+ void (*cpu_online)(s32 cpu);
-+
-+ /**
-+ * cpu_offline - A CPU is going offline
-+ * @cpu: CPU which is going offline
-+ *
-+ * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks
-+ * associated with other CPUs afterwards.
-+ */
-+ void (*cpu_offline)(s32 cpu);
-+
-+ /**
-+ * prep_enable - Prepare to enable BPF scheduling for a task
-+ * @p: task to prepare BPF scheduling for
-+ * @args: enable arguments, see the struct definition
-+ *
-+ * Either we're loading a BPF scheduler or a new task is being forked.
-+ * Prepare BPF scheduling for @p. This operation may block and can be
-+ * used for allocations.
-+ *
-+ * Return 0 for success, -errno for failure. An error return while
-+ * loading will abort loading of the BPF scheduler. During a fork, will
-+ * abort the specific fork.
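Because implementing ops.update_idle() disables the built-in idle tracking unless %SCX_OPS_KEEP_BUILTIN_IDLE is set, the cheapest useful implementation just observes transitions. A minimal sketch, assuming the scheduler also sets that flag in ops.flags so the idle helpers listed above stay available:

	u64 nr_idle_entries;	/* readable from userspace via the BPF skeleton */

	void BPF_STRUCT_OPS(sketch_update_idle, s32 cpu, bool idle)
	{
		if (idle)
			__sync_fetch_and_add(&nr_idle_entries, 1);
	}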
-+ */
-+ s32 (*prep_enable)(struct task_struct *p, struct scx_enable_args *args);
-+
-+ /**
-+ * enable - Enable BPF scheduling for a task
-+ * @p: task to enable BPF scheduling for
-+ * @args: enable arguments, see the struct definition
-+ *
-+ * Enable @p for BPF scheduling. @p is now in the cgroup specified for
-+ * the preceding prep_enable() and will start running soon.
-+ */
-+ void (*enable)(struct task_struct *p, struct scx_enable_args *args);
-+
-+ /**
-+ * cancel_enable - Cancel prep_enable()
-+ * @p: task being canceled
-+ * @args: enable arguments, see the struct definition
-+ *
-+ * @p was prep_enable()'d but failed before reaching enable(). Undo the
-+ * preparation.
-+ */
-+ void (*cancel_enable)(struct task_struct *p,
-+ struct scx_enable_args *args);
-+
-+ /**
-+ * disable - Disable BPF scheduling for a task
-+ * @p: task to disable BPF scheduling for
-+ *
-+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
-+ * Disable BPF scheduling for @p.
-+ */
-+ void (*disable)(struct task_struct *p);
-+
-+#ifdef CONFIG_EXT_GROUP_SCHED
-+ /**
-+ * cgroup_init - Initialize a cgroup
-+ * @cgrp: cgroup being initialized
-+ * @args: init arguments, see the struct definition
-+ *
-+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
-+ * @cgrp for sched_ext. This operation may block.
-+ *
-+ * Return 0 for success, -errno for failure. An error return while
-+ * loading will abort loading of the BPF scheduler. During cgroup
-+ * creation, it will abort the specific cgroup creation.
-+ */
-+ s32 (*cgroup_init)(struct cgroup *cgrp,
-+ struct scx_cgroup_init_args *args);
-+
-+ /**
-+ * cgroup_exit - Exit a cgroup
-+ * @cgrp: cgroup being exited
-+ *
-+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
-+ * @cgrp for sched_ext. This operation may block.
-+ */
-+ void (*cgroup_exit)(struct cgroup *cgrp);
-+
-+ /**
-+ * cgroup_prep_move - Prepare a task to be moved to a different cgroup
-+ * @p: task being moved
-+ * @from: cgroup @p is being moved from
-+ * @to: cgroup @p is being moved to
-+ *
-+ * Prepare @p for move from cgroup @from to @to. This operation may
-+ * block and can be used for allocations.
-+ *
-+ * Return 0 for success, -errno for failure. An error return aborts the
-+ * migration.
-+ */
-+ s32 (*cgroup_prep_move)(struct task_struct *p,
-+ struct cgroup *from, struct cgroup *to);
-+
-+ /**
-+ * cgroup_move - Commit cgroup move
-+ * @p: task being moved
-+ * @from: cgroup @p is being moved from
-+ * @to: cgroup @p is being moved to
-+ *
-+ * Commit the move. @p is dequeued during this operation.
-+ */
-+ void (*cgroup_move)(struct task_struct *p,
-+ struct cgroup *from, struct cgroup *to);
-+
-+ /**
-+ * cgroup_cancel_move - Cancel cgroup move
-+ * @p: task whose cgroup move is being canceled
-+ * @from: cgroup @p was being moved from
-+ * @to: cgroup @p was being moved to
-+ *
-+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
-+ * Undo the preparation.
-+ */
-+ void (*cgroup_cancel_move)(struct task_struct *p,
-+ struct cgroup *from, struct cgroup *to);
-+
-+ /**
-+ * cgroup_set_weight - A cgroup's weight is being changed
-+ * @cgrp: cgroup whose weight is being updated
-+ * @weight: new weight [1..10000]
-+ *
-+ * Update @cgrp's weight to @weight.
-+ */
-+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-+#endif /* CONFIG_EXT_GROUP_SCHED */
-+
-+ /*
-+ * All online ops must come before ops.init().
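ops.prep_enable() being allowed to block makes it the natural place to allocate per-task state. A minimal sketch using a BPF task-storage map; the map, struct and callback names are illustrative and not part of this patch:

	struct task_ctx {
		u64 nr_enqueues;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, struct task_ctx);
	} task_ctxs SEC(".maps");

	s32 BPF_STRUCT_OPS(sketch_prep_enable, struct task_struct *p,
			   struct scx_enable_args *args)
	{
		/* an error here aborts the fork or the scheduler load */
		if (!bpf_task_storage_get(&task_ctxs, p, NULL,
					  BPF_LOCAL_STORAGE_GET_F_CREATE))
			return -ENOMEM;
		return 0;
	}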
-+ */ -+ -+ /** -+ * init - Initialize the BPF scheduler -+ */ -+ s32 (*init)(void); -+ -+ /** -+ * exit - Clean up after the BPF scheduler -+ * @info: Exit info -+ */ -+ void (*exit)(struct scx_exit_info *info); -+ -+ /** -+ * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch -+ */ -+ u32 dispatch_max_batch; -+ -+ /** -+ * flags - %SCX_OPS_* flags -+ */ -+ u64 flags; -+ -+ /** -+ * timeout_ms - The maximum amount of time, in milliseconds, that a -+ * runnable task should be able to wait before being scheduled. The -+ * maximum timeout may not exceed the default timeout of 30 seconds. -+ * -+ * Defaults to the maximum allowed timeout value of 30 seconds. -+ */ -+ u32 timeout_ms; -+ -+ /** -+ * name - BPF scheduler's name -+ * -+ * Must be a non-zero valid BPF object name including only isalnum(), -+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the -+ * BPF scheduler is enabled. -+ */ -+ char name[SCX_OPS_NAME_LEN]; -+}; -+ -+/* -+ * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the -+ * scheduler core and the BPF scheduler. See the documentation for more details. -+ */ -+struct scx_dispatch_q { -+ raw_spinlock_t lock; -+ struct list_head fifo; /* processed in dispatching order */ -+ struct rb_root_cached priq; /* processed in p->scx.dsq_vtime order */ -+ u32 nr; -+ u64 id; -+ struct rhash_head hash_node; -+ struct llist_node free_node; -+ struct rcu_head rcu; -+}; -+ -+/* scx_entity.flags */ -+enum scx_ent_flags { -+ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ -+ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ -+ SCX_TASK_ENQ_LOCAL = 1 << 2, /* used by scx_select_cpu_dfl() to set SCX_ENQ_LOCAL */ -+ -+ SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ -+ SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ -+ -+ SCX_TASK_WATCHDOG_RESET = 1 << 16, /* task watchdog counter should be reset */ -+ SCX_TASK_DEQD_FOR_SLEEP = 1 << 17, /* last dequeue was for SLEEP */ -+ -+ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -+}; -+ -+/* scx_entity.dsq_flags */ -+enum scx_ent_dsq_flags { -+ SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ -+}; -+ -+/* -+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from -+ * everywhere and the following bits track which kfunc sets are currently -+ * allowed for %current. This simple per-task tracking works because SCX ops -+ * nest in a limited way. BPF will likely implement a way to allow and disallow -+ * kfuncs depending on the calling context which will replace this manual -+ * mechanism. See scx_kf_allow(). 
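Tying the callbacks together, a BPF scheduler publishes one instance of this table as a struct_ops map. A minimal sketch in the style of the example schedulers, assuming libbpf's SEC() macro and the callback sketches shown earlier; the batch and timeout values simply respect the limits documented above:

	SEC(".struct_ops")
	struct sched_ext_ops sketch_ops = {
		.enqueue		= (void *)sketch_enqueue,
		.dispatch		= (void *)sketch_dispatch,
		.prep_enable		= (void *)sketch_prep_enable,
		.dispatch_max_batch	= 32,
		.timeout_ms		= 5000,	/* must stay <= 30000 */
		.name			= "sketch",
	};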
-+ */
-+enum scx_kf_mask {
-+ SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */
-+ /* all non-sleepables may be nested inside INIT and SLEEPABLE */
-+ SCX_KF_INIT = 1 << 0, /* running ops.init() */
-+ SCX_KF_SLEEPABLE = 1 << 1, /* other sleepable init operations */
-+ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
-+ SCX_KF_CPU_RELEASE = 1 << 2, /* ops.cpu_release() */
-+ /* ops.dequeue (in REST) may be nested inside DISPATCH */
-+ SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */
-+ SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() */
-+ SCX_KF_REST = 1 << 5, /* other rq-locked operations */
-+
-+ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
-+ SCX_KF_ENQUEUE | SCX_KF_REST,
-+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_REST,
-+};
-+
-+/*
-+ * The following is embedded in task_struct and contains all fields necessary
-+ * for a task to be scheduled by SCX.
-+ */
-+struct sched_ext_entity {
-+ struct scx_dispatch_q *dsq;
-+ struct {
-+ struct list_head fifo; /* dispatch order */
-+ struct rb_node priq; /* p->scx.dsq_vtime order */
-+ } dsq_node;
-+ struct list_head watchdog_node;
-+ u32 flags; /* protected by rq lock */
-+ u32 dsq_flags; /* protected by dsq lock */
-+ u32 weight;
-+ s32 sticky_cpu;
-+ s32 holding_cpu;
-+ u32 kf_mask; /* see scx_kf_mask above */
-+ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
-+ atomic_long_t ops_state;
-+ unsigned long runnable_at;
-+#ifdef CONFIG_SCHED_CORE
-+ u64 core_sched_at; /* see scx_prio_less() */
-+#endif
-+
-+ /* BPF scheduler modifiable fields */
-+
-+ /*
-+ * Runtime budget in nsecs. This is usually set through
-+ * scx_bpf_dispatch() but can also be modified directly by the BPF
-+ * scheduler. Automatically decreased by SCX as the task executes. On
-+ * depletion, a scheduling event is triggered.
-+ *
-+ * This value is cleared to zero if the task is preempted by
-+ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
-+ * task ran. Use p->se.sum_exec_runtime instead.
-+ */
-+ u64 slice;
-+
-+ /*
-+ * Used to order tasks when dispatching to the vtime-ordered priority
-+ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
-+ * but can also be modified directly by the BPF scheduler. Modifying it
-+ * while a task is queued on a dsq may mangle the ordering and is not
-+ * recommended.
-+ */
-+ u64 dsq_vtime;
-+
-+ /*
-+ * If set, reject future sched_setscheduler(2) calls updating the policy
-+ * to %SCHED_EXT with -%EACCES.
-+ *
-+ * If set from ops.prep_enable() and the task's policy is already
-+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded
-+ * or by inheriting the parent's policy during fork, the task's policy is
-+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of such
-+ * events is reported through /sys/kernel/debug/sched_ext::nr_rejected.
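The two BPF-modifiable fields above are what weight-aware policies manipulate. A minimal sketch of vtime-ordered enqueueing, assuming an illustrative SHARED_DSQ id and a global vtime_now clock advanced elsewhere (e.g. in ops.stopping()); scx_bpf_dispatch_vtime() is the helper the dsq_vtime comment refers to:

	static u64 vtime_now;	/* advanced as tasks stop running; omitted here */

	static inline bool vtime_before(u64 a, u64 b)
	{
		return (s64)(a - b) < 0;
	}

	void BPF_STRUCT_OPS(sketch_vtime_enqueue, struct task_struct *p, u64 enq_flags)
	{
		u64 vtime = p->scx.dsq_vtime;

		/* cap the credit a long sleeper can accumulate to one default slice */
		if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
			vtime = vtime_now - SCX_SLICE_DFL;

		scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, enq_flags);
	}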
-+ */ -+ bool disallow; /* reject switching into SCX */ -+ -+ /* cold fields */ -+ struct list_head tasks_node; -+#ifdef CONFIG_EXT_GROUP_SCHED -+ struct cgroup *cgrp_moving_from; -+#endif -+}; -+ -+void sched_ext_free(struct task_struct *p); -+void print_scx_info(const char *log_lvl, struct task_struct *p); -+ -+#else /* !CONFIG_SCHED_CLASS_EXT */ -+ -+static inline void sched_ext_free(struct task_struct *p) {} -+static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+#endif /* _LINUX_SCHED_EXT_H */ -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index a23af225c..03d35e3ed 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -61,7 +61,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); - extern void init_idle(struct task_struct *idle, int cpu); - - extern int sched_fork(unsigned long clone_flags, struct task_struct *p); --extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern void sched_cancel_fork(struct task_struct *p); - extern void sched_post_fork(struct task_struct *p); - extern void sched_dead(struct task_struct *p); - -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ce..359a14cc7 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -118,6 +118,7 @@ struct clone_args { - /* SCHED_ISO: reserved but not implemented yet */ - #define SCHED_IDLE 5 - #define SCHED_DEADLINE 6 -+#define SCHED_EXT 7 - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index 6d35728b9..6a247f11c 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1012,6 +1012,11 @@ config RT_GROUP_SCHED - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.rst for more information. - -+config EXT_GROUP_SCHED -+ bool -+ depends on SCHED_CLASS_EXT && CGROUP_SCHED -+ default y -+ - endif #CGROUP_SCHED - - config SCHED_MM_CID -diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bf..7ea89ccd0 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -6,6 +6,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -101,6 +102,17 @@ struct task_struct init_task - #endif - #ifdef CONFIG_CGROUP_SCHED - .sched_task_group = &root_task_group, -+#endif -+#ifdef CONFIG_SCHED_CLASS_EXT -+ .scx = { -+ .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), -+ .watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node), -+ .sticky_cpu = -1, -+ .holding_cpu = -1, -+ .ops_state = ATOMIC_INIT(0), -+ .runnable_at = INITIAL_JIFFIES, -+ .slice = SCX_SLICE_DFL, -+ }, - #endif - .ptraced = LIST_HEAD_INIT(init_task.ptraced), - .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a..bae49b743 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -133,4 +133,26 @@ config SCHED_CORE - which is the likely usage by Linux distributions, there should - be no measurable impact on performance. 
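From userspace, the new SCHED_EXT policy value added to the uapi header above is all that is needed to opt a task in. A minimal sketch using glibc's sched_setscheduler() wrapper; the fallback define only mirrors the constant introduced by this patch:

	#include <sched.h>
	#include <stdio.h>

	#ifndef SCHED_EXT
	#define SCHED_EXT 7		/* matches the uapi addition above */
	#endif

	int main(void)
	{
		struct sched_param param = { .sched_priority = 0 };

		/* rejected with EACCES if the loaded scheduler sets p->scx.disallow */
		if (sched_setscheduler(0, SCHED_EXT, &param))
			perror("sched_setscheduler");
		return 0;
	}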
- -- -+config SCHED_CLASS_EXT -+ bool "Extensible Scheduling Class" -+ depends on BPF_SYSCALL && BPF_JIT -+ help -+ This option enables a new scheduler class sched_ext (SCX), which -+ allows scheduling policies to be implemented as BPF programs to -+ achieve the following: -+ -+ - Ease of experimentation and exploration: Enabling rapid -+ iteration of new scheduling policies. -+ - Customization: Building application-specific schedulers which -+ implement policies that are not applicable to general-purpose -+ schedulers. -+ - Rapid scheduler deployments: Non-disruptive swap outs of -+ scheduling policies in production environments. -+ -+ sched_ext leverages BPF’s struct_ops feature to define a structure -+ which exports function callbacks and flags to BPF programs that -+ wish to implement scheduling policies. The struct_ops structure -+ exported by sched_ext is struct sched_ext_ops, and is conceptually -+ similar to struct sched_class. -+ -+ See Documentation/scheduler/sched-ext.rst for more details. -diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h -index 5678a9ddf..3618769d8 100644 ---- a/kernel/bpf/bpf_struct_ops_types.h -+++ b/kernel/bpf/bpf_struct_ops_types.h -@@ -9,4 +9,8 @@ BPF_STRUCT_OPS_TYPE(bpf_dummy_ops) - #include - BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) - #endif -+#ifdef CONFIG_SCHED_CLASS_EXT -+#include -+BPF_STRUCT_OPS_TYPE(sched_ext_ops) -+#endif - #endif -diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c -index 518725b57..f426f4be7 100644 ---- a/kernel/cgroup/cgroup.c -+++ b/kernel/cgroup/cgroup.c -@@ -4196,10 +4196,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - return ret; - } - -+ kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); -+ - if (cft->file_offset) { - struct cgroup_file *cfile = (void *)css + cft->file_offset; - - timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); -+ cfile->cft = cft; - - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = kn; -@@ -4475,6 +4478,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); - } - -+static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) -+{ -+ struct kernfs_node *kn; -+ -+ spin_lock_irq(&cgroup_file_kn_lock); -+ kn = cfile->kn; -+ kernfs_get(kn); -+ spin_unlock_irq(&cgroup_file_kn_lock); -+ -+ return kn; -+} -+ -+static bool cfile_visible(struct cgroup_file *cfile) -+{ -+ return !(cfile->cft->flags & CFTYPE_HIDDEN) && -+ !(cfile->flags & CFILE_HIDDEN); -+} -+ - /** - * cgroup_file_show - show or hide a hidden cgroup file - * @cfile: target cgroup_file obtained by setting cftype->file_offset -@@ -4484,15 +4505,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) - { - struct kernfs_node *kn; - -- spin_lock_irq(&cgroup_file_kn_lock); -- kn = cfile->kn; -- kernfs_get(kn); -- spin_unlock_irq(&cgroup_file_kn_lock); -+ mutex_lock(&cgroup_mutex); - -- if (kn) -- kernfs_show(kn, show); -+ if (show) -+ cfile->flags &= ~CFILE_HIDDEN; -+ else -+ cfile->flags |= CFILE_HIDDEN; - -- kernfs_put(kn); -+ kn = cfile_kn_get(cfile); -+ if (kn) { -+ kernfs_show(kn, cfile_visible(cfile)); -+ kernfs_put(kn); -+ } -+ -+ mutex_unlock(&cgroup_mutex); - } - - /** -@@ -5510,6 +5536,63 @@ static void offline_css(struct cgroup_subsys_state *css) - wake_up_all(&css->cgroup->offline_waitq); - } - -+/** -+ * cgroup_show_cftype - show or hide a cgroup file type -+ * @cft: cftype to show or hide -+ * @show: whether to show or hide -+ * -+ * Sets %CFTYPE_HIDDEN and shows/hides the 
matching files according to @show. -+ * @cft may or may not be added at the time of this call. After hiding, it's -+ * guaranteed that there are no in-flight operations on the hidden files. -+ */ -+void cgroup_show_cftype(struct cftype *cft, bool show) -+{ -+ struct cgroup_subsys *ss = cft->ss; -+ struct cgroup *root = ss ? &ss->root->cgrp : &cgrp_dfl_root.cgrp; -+ struct cgroup_subsys_state *css; -+ -+ mutex_lock(&cgroup_mutex); -+ -+ if (show) -+ cft->flags &= ~CFTYPE_HIDDEN; -+ else -+ cft->flags |= CFTYPE_HIDDEN; -+ -+ if (!(cft->flags & __CFTYPE_ADDED)) -+ goto out_unlock; -+ -+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { -+ struct cgroup *cgrp = css->cgroup; -+ struct kernfs_node *kn; -+ -+ if (!(css->flags & CSS_VISIBLE)) -+ continue; -+ -+ if (cft->file_offset) { -+ struct cgroup_file *cfile = -+ (void *)css + cft->file_offset; -+ -+ kn = cfile_kn_get(cfile); -+ if (kn) { -+ kernfs_show(kn, cfile_visible(cfile)); -+ kernfs_put(kn); -+ } -+ } else { -+ char buf[CGROUP_FILE_NAME_MAX]; -+ -+ kn = kernfs_find_and_get(cgrp->kn, -+ cgroup_file_name(cgrp, cft, buf)); -+ if (kn) { -+ kernfs_show(kn, show); -+ kernfs_put(kn); -+ } -+ } -+ } -+ -+out_unlock: -+ mutex_unlock(&cgroup_mutex); -+} -+ - /** - * css_create - create a cgroup_subsys_state - * @cgrp: the cgroup new css will be associated with -diff --git a/kernel/fork.c b/kernel/fork.c -index 177ce7438..141fceb3b 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -970,6 +971,7 @@ void __put_task_struct(struct task_struct *tsk) - WARN_ON(refcount_read(&tsk->usage)); - WARN_ON(tsk == current); - -+ sched_ext_free(tsk); - io_uring_free(tsk); - cgroup_free(tsk); - task_numa_free(tsk, true); -@@ -2474,7 +2476,7 @@ __latent_entropy struct task_struct *copy_process( - - retval = perf_event_init_task(p, clone_flags); - if (retval) -- goto bad_fork_cleanup_policy; -+ goto bad_fork_sched_cancel_fork; - retval = audit_alloc(p); - if (retval) - goto bad_fork_cleanup_perf; -@@ -2606,7 +2608,9 @@ __latent_entropy struct task_struct *copy_process( - * cgroup specific, it unconditionally needs to place the task on a - * runqueue. - */ -- sched_cgroup_fork(p, args); -+ retval = sched_cgroup_fork(p, args); -+ if (retval) -+ goto bad_fork_cancel_cgroup; - - /* - * From this point on we must avoid any synchronous user-space -@@ -2652,13 +2656,13 @@ __latent_entropy struct task_struct *copy_process( - /* Don't start children in a dying pid namespace */ - if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { - retval = -ENOMEM; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* Let kill terminate clone/fork in the middle */ - if (fatal_signal_pending(current)) { - retval = -EINTR; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* No more failure paths after this point. 
*/ -@@ -2734,10 +2738,11 @@ __latent_entropy struct task_struct *copy_process( - - return p; - --bad_fork_cancel_cgroup: -+bad_fork_core_free: - sched_core_free(p); - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); -+bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, args); - bad_fork_put_pidfd: - if (clone_flags & CLONE_PIDFD) { -@@ -2776,6 +2781,8 @@ __latent_entropy struct task_struct *copy_process( - audit_free(p); - bad_fork_cleanup_perf: - perf_event_free_task(p); -+bad_fork_sched_cancel_fork: -+ sched_cancel_fork(p); - bad_fork_cleanup_policy: - lockdep_free_task(p); - #ifdef CONFIG_NUMA -diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c -index d9dc9ab37..005025f55 100644 ---- a/kernel/sched/build_policy.c -+++ b/kernel/sched/build_policy.c -@@ -28,6 +28,8 @@ - #include - #include - #include -+#include -+#include - - #include - -@@ -52,3 +54,6 @@ - #include "cputime.c" - #include "deadline.c" - -+#ifdef CONFIG_SCHED_CLASS_EXT -+# include "ext.c" -+#endif -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index a854b7183..5f2f52fc7 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -167,7 +167,10 @@ static inline int __task_prio(const struct task_struct *p) - if (p->sched_class == &idle_sched_class) - return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - -- return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ -+ if (task_on_scx(p)) -+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ -+ -+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ - } - - /* -@@ -196,6 +199,11 @@ static inline bool prio_less(const struct task_struct *a, - if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ - return cfs_prio_less(a, b, in_fi); - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ -+ return scx_prio_less(a, b, in_fi); -+#endif -+ - return false; - } - -@@ -1233,11 +1241,14 @@ bool sched_can_stop_tick(struct rq *rq) - return true; - - /* -- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; -- * if there's more than one we need the tick for involuntary -- * preemption. -+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks -+ * left. For CFS, if there's more than one we need the tick for -+ * involuntary preemption. For SCX, ask. - */ -- if (rq->nr_running > 1) -+ if (!scx_switched_all() && rq->nr_running > 1) -+ return false; -+ -+ if (scx_enabled() && !scx_can_stop_tick(rq)) - return false; - - /* -@@ -1320,8 +1331,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) - * SCHED_OTHER tasks have to update their load when changing their - * weight - */ -- if (update_load && p->sched_class == &fair_sched_class) { -- reweight_task(p, prio); -+ if (update_load && p->sched_class->reweight_task) { -+ p->sched_class->reweight_task(task_rq(p), p, prio); - } else { - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; -@@ -2198,6 +2209,17 @@ inline int task_curr(const struct task_struct *p) - return cpu_curr(task_cpu(p)) == p; - } - -+/* -+ * ->switching_to() is called with the pi_lock and rq_lock held and must not -+ * mess with locking. -+ */ -+void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class) -+{ -+ if (prev_class != p->sched_class && p->sched_class->switching_to) -+ p->sched_class->switching_to(rq, p); -+} -+ - /* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. 
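For reference, with MAX_RT_PRIO = 100, MAX_NICE = 19 and NICE_WIDTH = 40, the squashed core-sched priorities produced by the __task_prio() hunk above order the classes as follows (a lower value is picked first):

	/*
	 * stop:         -2
	 * deadline/rt:  [-1, 99]
	 * fair:        119  (MAX_RT_PRIO + MAX_NICE)
	 * ext:         120  (MAX_RT_PRIO + MAX_NICE + 1)
	 * idle:        140  (MAX_RT_PRIO + NICE_WIDTH)
	 */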
-@@ -2205,9 +2227,9 @@ inline int task_curr(const struct task_struct *p) - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ --static inline void check_class_changed(struct rq *rq, struct task_struct *p, -- const struct sched_class *prev_class, -- int oldprio) -+void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio) - { - if (prev_class != p->sched_class) { - if (prev_class->switched_from) -@@ -3962,6 +3984,15 @@ bool cpus_share_cache(int this_cpu, int that_cpu) - - static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) - { -+ /* -+ * The BPF scheduler may depend on select_task_rq() being invoked during -+ * wakeups. In addition, @p may end up executing on a different CPU -+ * regardless of what happens in the wakeup path making the ttwu_queue -+ * optimization less meaningful. Skip if on SCX. -+ */ -+ if (task_on_scx(p)) -+ return false; -+ - /* - * Do not complicate things with the async wake_list while the CPU is - * in hotplug state. -@@ -4528,6 +4559,21 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->rt.on_rq = 0; - p->rt.on_list = 0; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ p->scx.dsq = NULL; -+ INIT_LIST_HEAD(&p->scx.dsq_node.fifo); -+ RB_CLEAR_NODE(&p->scx.dsq_node.priq); -+ INIT_LIST_HEAD(&p->scx.watchdog_node); -+ p->scx.flags = 0; -+ p->scx.weight = 0; -+ p->scx.sticky_cpu = -1; -+ p->scx.holding_cpu = -1; -+ p->scx.kf_mask = 0; -+ atomic64_set(&p->scx.ops_state, 0); -+ p->scx.runnable_at = INITIAL_JIFFIES; -+ p->scx.slice = SCX_SLICE_DFL; -+#endif -+ - #ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); - #endif -@@ -4731,6 +4777,8 @@ late_initcall(sched_core_sysctl_init); - */ - int sched_fork(unsigned long clone_flags, struct task_struct *p) - { -+ int ret; -+ - __sched_fork(clone_flags, p); - /* - * We mark the process as NEW here. 
This guarantees that -@@ -4767,12 +4815,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - p->sched_reset_on_fork = 0; - } - -- if (dl_prio(p->prio)) -- return -EAGAIN; -- else if (rt_prio(p->prio)) -+ scx_pre_fork(p); -+ -+ if (dl_prio(p->prio)) { -+ ret = -EAGAIN; -+ goto out_cancel; -+ } else if (rt_prio(p->prio)) { - p->sched_class = &rt_sched_class; -- else -+#ifdef CONFIG_SCHED_CLASS_EXT -+ } else if (task_should_scx(p)) { -+ p->sched_class = &ext_sched_class; -+#endif -+ } else { - p->sched_class = &fair_sched_class; -+ } - - init_entity_runnable_average(&p->se); - -@@ -4790,9 +4846,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - RB_CLEAR_NODE(&p->pushable_dl_tasks); - #endif - return 0; -+ -+out_cancel: -+ scx_cancel_fork(p); -+ return ret; - } - --void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) -+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - { - unsigned long flags; - -@@ -4819,11 +4879,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return scx_fork(p); -+} -+ -+void sched_cancel_fork(struct task_struct *p) -+{ -+ scx_cancel_fork(p); - } - - void sched_post_fork(struct task_struct *p) - { - uclamp_post_fork(p); -+ scx_post_fork(p); - } - - unsigned long to_ratio(u64 period, u64 runtime) -@@ -5668,14 +5736,17 @@ void scheduler_tick(void) - if (sched_feat(LATENCY_WARN) && resched_latency) - resched_latency_warn(cpu, resched_latency); - -+ scx_notify_sched_tick(); - perf_event_task_tick(); - - if (curr->flags & PF_WQ_WORKER) - wq_worker_tick(curr); - - #ifdef CONFIG_SMP -- rq->idle_balance = idle_cpu(cpu); -- trigger_load_balance(rq); -+ if (!scx_switched_all()) { -+ rq->idle_balance = idle_cpu(cpu); -+ trigger_load_balance(rq); -+ } - #endif - } - -@@ -5976,7 +6047,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ -- for_class_range(class, prev->sched_class, &idle_sched_class) { -+ for_balance_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -@@ -5994,6 +6065,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - const struct sched_class *class; - struct task_struct *p; - -+ if (scx_enabled()) -+ goto restart; -+ - /* - * Optimization: we know that if all tasks are in the fair class we can - * call that function directly, but only if the @prev task wasn't of a -@@ -6019,10 +6093,12 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - restart: - put_prev_task_balance(rq, prev, rf); - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_next_task(rq); -- if (p) -+ if (p) { -+ scx_notify_pick_next_task(rq, p, class); - return p; -+ } - } - - BUG(); /* The idle class should always have a runnable task. 
*/ -@@ -6052,7 +6128,7 @@ static inline struct task_struct *pick_task(struct rq *rq) - const struct sched_class *class; - struct task_struct *p; - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_task(rq); - if (p) - return p; -@@ -7021,12 +7097,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag - } - EXPORT_SYMBOL(default_wake_function); - --static void __setscheduler_prio(struct task_struct *p, int prio) -+void __setscheduler_prio(struct task_struct *p, int prio) - { - if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ else if (task_should_scx(p)) -+ p->sched_class = &ext_sched_class; -+#endif - else - p->sched_class = &fair_sched_class; - -@@ -7161,6 +7241,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) - } - - __setscheduler_prio(p, prio); -+ check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); -@@ -7707,6 +7788,10 @@ static int __sched_setscheduler(struct task_struct *p, - goto unlock; - } - -+ retval = scx_check_setscheduler(p, policy); -+ if (retval) -+ goto unlock; -+ - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. -@@ -7809,6 +7894,7 @@ static int __sched_setscheduler(struct task_struct *p, - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); -+ check_class_changing(rq, p, prev_class); - - if (queued) { - /* -@@ -9050,6 +9136,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - break; - } -@@ -9077,6 +9164,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - } - return ret; -@@ -9180,6 +9268,7 @@ void sched_show_task(struct task_struct *p) - - print_worker_info(KERN_INFO, p); - print_stop_info(KERN_INFO, p); -+ print_scx_info(KERN_INFO, p); - show_stack(p, NULL, KERN_INFO); - put_task_stack(p); - } -@@ -9565,7 +9654,7 @@ static inline void balance_hotplug_wait(void) - - #endif /* CONFIG_HOTPLUG_CPU */ - --void set_rq_online(struct rq *rq) -+void set_rq_online(struct rq *rq, enum rq_onoff_reason reason) - { - if (!rq->online) { - const struct sched_class *class; -@@ -9575,12 +9664,12 @@ void set_rq_online(struct rq *rq) - - for_each_class(class) { - if (class->rq_online) -- class->rq_online(rq); -+ class->rq_online(rq, reason); - } - } - } - --void set_rq_offline(struct rq *rq) -+void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->online) { - const struct sched_class *class; -@@ -9588,7 +9677,7 @@ void set_rq_offline(struct rq *rq) - update_rq_clock(rq); - for_each_class(class) { - if (class->rq_offline) -- class->rq_offline(rq); -+ class->rq_offline(rq, reason); - } - - cpumask_clear_cpu(rq->cpu, rq->rd->online); -@@ -9684,7 +9773,7 @@ int sched_cpu_activate(unsigned int cpu) - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- set_rq_online(rq); -+ set_rq_online(rq, RQ_ONOFF_HOTPLUG); - } - rq_unlock_irqrestore(rq, &rf); - -@@ -9728,7 +9817,7 @@ int sched_cpu_deactivate(unsigned int cpu) - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- set_rq_offline(rq); -+ set_rq_offline(rq, RQ_ONOFF_HOTPLUG); - } - rq_unlock_irqrestore(rq, &rf); - -@@ -9915,11 
+10004,15 @@ void __init sched_init(void) - int i; - - /* Make sure the linker didn't screw up */ -- BUG_ON(&idle_sched_class != &fair_sched_class + 1 || -- &fair_sched_class != &rt_sched_class + 1 || -- &rt_sched_class != &dl_sched_class + 1); - #ifdef CONFIG_SMP -- BUG_ON(&dl_sched_class != &stop_sched_class + 1); -+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -+#endif -+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); -+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); -+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); -+#ifdef CONFIG_SCHED_CLASS_EXT -+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); -+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); - #endif - - wait_bit_init(); -@@ -9943,6 +10036,9 @@ void __init sched_init(void) - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); - #endif /* CONFIG_FAIR_GROUP_SCHED */ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ root_task_group.scx_weight = CGROUP_WEIGHT_DFL; -+#endif /* CONFIG_EXT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -@@ -10090,6 +10186,7 @@ void __init sched_init(void) - balance_push_set(smp_processor_id(), false); - #endif - init_sched_fair_class(); -+ init_sched_ext_class(); - - psi_init(); - -@@ -10398,6 +10495,7 @@ struct task_group *sched_create_group(struct task_group *parent) - if (!alloc_rt_sched_group(tg, parent)) - goto err; - -+ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); - alloc_uclamp_sched_group(tg, parent); - - return tg; -@@ -10524,6 +10622,7 @@ void sched_move_task(struct task_struct *tsk) - put_prev_task(rq, tsk); - - sched_change_group(tsk, group); -+ scx_move_task(tsk); - - if (queued) - enqueue_task(rq, tsk, queue_flags); -@@ -10541,11 +10640,6 @@ void sched_move_task(struct task_struct *tsk) - task_rq_unlock(rq, tsk, &rf); - } - --static inline struct task_group *css_tg(struct cgroup_subsys_state *css) --{ -- return css ? 
container_of(css, struct task_group, css) : NULL; --} -- - static struct cgroup_subsys_state * - cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) - { -@@ -10569,6 +10663,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); -+ int ret; -+ -+ ret = scx_tg_online(tg); -+ if (ret) -+ return ret; - - if (parent) - sched_online_group(tg, parent); -@@ -10585,6 +10684,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - return 0; - } - -+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ scx_tg_offline(tg); -+} -+ - static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); -@@ -10602,9 +10708,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) - sched_unregister_group(tg); - } - --#ifdef CONFIG_RT_GROUP_SCHED -+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) - static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - { -+#ifdef CONFIG_RT_GROUP_SCHED - struct task_struct *task; - struct cgroup_subsys_state *css; - -@@ -10612,7 +10719,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - if (!sched_rt_can_attach(css_tg(css), task)) - return -EINVAL; - } -- return 0; -+#endif -+ return scx_cgroup_can_attach(tset); - } - #endif - -@@ -10623,8 +10731,17 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) - - cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); -+ -+ scx_cgroup_finish_attach(); - } - -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) -+{ -+ scx_cgroup_cancel_attach(tset); -+} -+#endif -+ - #ifdef CONFIG_UCLAMP_TASK_GROUP - static void cpu_util_update_eff(struct cgroup_subsys_state *css) - { -@@ -10806,9 +10923,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) - static int cpu_shares_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 shareval) - { -+ int ret; -+ - if (shareval > scale_load_down(ULONG_MAX)) - shareval = MAX_SHARES; -- return sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(shareval)); -+ return ret; - } - - static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, -@@ -11209,7 +11332,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, - } - #endif - --static struct cftype cpu_legacy_files[] = { -+static struct cftype cpu_legacy_cftypes[] = { - #ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", -@@ -11320,38 +11443,44 @@ static int cpu_local_stat_show(struct seq_file *sf, - return 0; - } - -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ -+static unsigned long tg_weight(struct task_group *tg) -+{ - #ifdef CONFIG_FAIR_GROUP_SCHED -+ return scale_load_down(tg->shares); -+#else -+ return sched_weight_from_cgroup(tg->scx_weight); -+#endif -+} -+ - static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- struct task_group *tg = css_tg(css); -- u64 weight = scale_load_down(tg->shares); -- -- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); -+ return sched_weight_to_cgroup(tg_weight(css_tg(css))); - } - - static int cpu_weight_write_u64(struct cgroup_subsys_state *css, -- struct cftype *cft, u64 
weight) -+ struct cftype *cft, u64 cgrp_weight) - { -- /* -- * cgroup weight knobs should use the common MIN, DFL and MAX -- * values which are 1, 100 and 10000 respectively. While it loses -- * a bit of range on both ends, it maps pretty well onto the shares -- * value used by scheduler and the round-trip conversions preserve -- * the original value over the entire range. -- */ -- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) -+ unsigned long weight; -+ int ret; -+ -+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) - return -ERANGE; - -- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); -+ weight = sched_weight_from_cgroup(cgrp_weight); - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), cgrp_weight); -+ return ret; - } - - static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- unsigned long weight = scale_load_down(css_tg(css)->shares); -+ unsigned long weight = tg_weight(css_tg(css)); - int last_delta = INT_MAX; - int prio, delta; - -@@ -11370,7 +11499,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - struct cftype *cft, s64 nice) - { - unsigned long weight; -- int idx; -+ int idx, ret; - - if (nice < MIN_NICE || nice > MAX_NICE) - return -ERANGE; -@@ -11379,7 +11508,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - idx = array_index_nospec(idx, 40); - weight = sched_prio_to_weight[idx]; - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(weight)); -+ return ret; - } - #endif - -@@ -11440,21 +11573,23 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, - } - #endif - --static struct cftype cpu_files[] = { --#ifdef CONFIG_FAIR_GROUP_SCHED -- { -+struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ [CPU_CFTYPE_WEIGHT] = { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_weight_read_u64, - .write_u64 = cpu_weight_write_u64, - }, -- { -+ [CPU_CFTYPE_WEIGHT_NICE] = { - .name = "weight.nice", - .flags = CFTYPE_NOT_ON_ROOT, - .read_s64 = cpu_weight_nice_read_s64, - .write_s64 = cpu_weight_nice_write_s64, - }, -- { -+#endif -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ [CPU_CFTYPE_IDLE] = { - .name = "idle", - .flags = CFTYPE_NOT_ON_ROOT, - .read_s64 = cpu_idle_read_s64, -@@ -11462,13 +11597,13 @@ static struct cftype cpu_files[] = { - }, - #endif - #ifdef CONFIG_CFS_BANDWIDTH -- { -+ [CPU_CFTYPE_MAX] = { - .name = "max", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_max_show, - .write = cpu_max_write, - }, -- { -+ [CPU_CFTYPE_MAX_BURST] = { - .name = "max.burst", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, -@@ -11476,13 +11611,13 @@ static struct cftype cpu_files[] = { - }, - #endif - #ifdef CONFIG_UCLAMP_TASK_GROUP -- { -+ [CPU_CFTYPE_UCLAMP_MIN] = { - .name = "uclamp.min", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_uclamp_min_show, - .write = cpu_uclamp_min_write, - }, -- { -+ [CPU_CFTYPE_UCLAMP_MAX] = { - .name = "uclamp.max", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_uclamp_max_show, -@@ -11495,16 +11630,20 @@ static struct cftype cpu_files[] = { - struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = 
cpu_cgroup_css_alloc, - .css_online = cpu_cgroup_css_online, -+ .css_offline = cpu_cgroup_css_offline, - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .css_extra_stat_show = cpu_extra_stat_show, - .css_local_stat_show = cpu_local_stat_show, --#ifdef CONFIG_RT_GROUP_SCHED -+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) - .can_attach = cpu_cgroup_can_attach, - #endif - .attach = cpu_cgroup_attach, -- .legacy_cftypes = cpu_legacy_files, -- .dfl_cftypes = cpu_files, -+#ifdef CONFIG_EXT_GROUP_SCHED -+ .cancel_attach = cpu_cgroup_cancel_attach, -+#endif -+ .legacy_cftypes = cpu_legacy_cftypes, -+ .dfl_cftypes = cpu_cftypes, - .early_init = true, - .threaded = true, - }; -@@ -12104,3 +12243,38 @@ void sched_mm_cid_fork(struct task_struct *t) - t->mm_cid_active = 1; - } - #endif -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ *ctx = (struct sched_enq_and_set_ctx){ -+ .p = p, -+ .queue_flags = queue_flags, -+ .queued = task_on_rq_queued(p), -+ .running = task_current(rq, p), -+ }; -+ -+ update_rq_clock(rq); -+ if (ctx->queued) -+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); -+ if (ctx->running) -+ put_prev_task(rq, p); -+} -+ -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(ctx->p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (ctx->queued) -+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); -+ if (ctx->running) -+ set_next_task(rq, ctx->p); -+} -+#endif /* CONFIG_SCHED_CLASS_EXT */ -diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c -index d78f2e876..77e7bc42e 100644 ---- a/kernel/sched/deadline.c -+++ b/kernel/sched/deadline.c -@@ -2512,7 +2512,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, - } - - /* Assumes rq->lock is held */ --static void rq_online_dl(struct rq *rq) -+static void rq_online_dl(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->dl.overloaded) - dl_set_overload(rq); -@@ -2523,7 +2523,7 @@ static void rq_online_dl(struct rq *rq) - } - - /* Assumes rq->lock is held */ --static void rq_offline_dl(struct rq *rq) -+static void rq_offline_dl(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->dl.overloaded) - dl_clear_overload(rq); -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 4c3d0d9f3..bbc6b8e37 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -374,6 +374,9 @@ static __init int sched_init_debug(void) - - debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ debugfs_create_file("ext", 0444, debugfs_sched, NULL, &sched_ext_fops); -+#endif - return 0; - } - late_initcall(sched_init_debug); -@@ -1090,6 +1093,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - P(dl.runtime); - P(dl.deadline); - } -+#ifdef CONFIG_SCHED_CLASS_EXT -+ __PS("ext.enabled", task_on_scx(p)); -+#endif - #undef PN_SCHEDSTAT - #undef P_SCHEDSTAT - -diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c -new file mode 100644 -index 000000000..a4d3d8397 ---- /dev/null -+++ b/kernel/sched/ext.c -@@ -0,0 +1,4497 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) -+ -+enum scx_internal_consts { -+ SCX_NR_ONLINE_OPS = SCX_OP_IDX(init), -+ SCX_DSP_DFL_MAX_BATCH = 32, -+ SCX_DSP_MAX_LOOPS = 32, -+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, -+}; -+ -+enum scx_ops_enable_state { -+ SCX_OPS_PREPPING, -+ SCX_OPS_ENABLING, -+ SCX_OPS_ENABLED, -+ SCX_OPS_DISABLING, -+ SCX_OPS_DISABLED, -+}; -+ -+static const char *scx_ops_enable_state_str[] = { -+ [SCX_OPS_PREPPING] = "prepping", -+ [SCX_OPS_ENABLING] = "enabling", -+ [SCX_OPS_ENABLED] = "enabled", -+ [SCX_OPS_DISABLING] = "disabling", -+ [SCX_OPS_DISABLED] = "disabled", -+}; -+ -+/* -+ * sched_ext_entity->ops_state -+ * -+ * Used to track the task ownership between the SCX core and the BPF scheduler. -+ * State transitions look as follows: -+ * -+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING -+ * ^ | | -+ * | v v -+ * \-------------------------------/ -+ * -+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call -+ * sites for explanations on the conditions being waited upon and why they are -+ * safe. Transitions out of them into NONE or QUEUED must store_release and the -+ * waiters should load_acquire. -+ * -+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether -+ * any given task can be dispatched by the BPF scheduler at all times and thus -+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler -+ * to try to dispatch any task anytime regardless of its state as the SCX core -+ * can safely reject invalid dispatches. -+ */ -+enum scx_ops_state { -+ SCX_OPSS_NONE, /* owned by the SCX core */ -+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ -+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ -+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ -+ -+ /* -+ * QSEQ brands each QUEUED instance so that, when dispatch races -+ * dequeue/requeue, the dispatcher can tell whether it still has a claim -+ * on the task being dispatched. -+ * -+ * As some 32bit archs can't do 64bit store_release/load_acquire, -+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on -+ * 32bit machines. The dispatch race window QSEQ protects is very narrow -+ * and runs with IRQ disabled. 30 bits should be sufficient. -+ */ -+ SCX_OPSS_QSEQ_SHIFT = 2, -+}; -+ -+/* Use macros to ensure that the type is unsigned long for the masks */ -+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) -+ -+/* -+ * During exit, a task may schedule after losing its PIDs. When disabling the -+ * BPF scheduler, we need to be able to iterate tasks in every state to -+ * guarantee system safety. Maintain a dedicated task list which contains every -+ * task between its fork and eventual free. 
-+ */ -+static DEFINE_SPINLOCK(scx_tasks_lock); -+static LIST_HEAD(scx_tasks); -+ -+/* ops enable/disable */ -+static struct kthread_worker *scx_ops_helper; -+static DEFINE_MUTEX(scx_ops_enable_mutex); -+DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); -+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -+static bool scx_switch_all_req; -+static bool scx_switching_all; -+DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -+ -+static struct sched_ext_ops scx_ops; -+static bool scx_warned_zero_slice; -+ -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); -+DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); -+ -+struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] = -+ { [0 ... SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT }; -+ -+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -+static struct scx_exit_info scx_exit_info; -+ -+static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); -+ -+/* -+ * The maximum amount of time in jiffies that a task may be runnable without -+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger -+ * scx_ops_error(). -+ */ -+unsigned long scx_watchdog_timeout; -+ -+/* -+ * The last time the delayed work was run. This delayed work relies on -+ * ksoftirqd being able to run to service timer interrupts, so it's possible -+ * that this work itself could get wedged. To account for this, we check that -+ * it's not stalled in the timer tick, and trigger an error if it is. -+ */ -+unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; -+ -+static struct delayed_work scx_watchdog_work; -+ -+/* idle tracking */ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_CPUMASK_OFFSTACK -+#define CL_ALIGNED_IF_ONSTACK -+#else -+#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -+#endif -+ -+static struct { -+ cpumask_var_t cpu; -+ cpumask_var_t smt; -+} idle_masks CL_ALIGNED_IF_ONSTACK; -+ -+#endif /* CONFIG_SMP */ -+ -+/* for %SCX_KICK_WAIT */ -+static unsigned long __percpu *scx_kick_cpus_pnt_seqs; -+ -+/* -+ * Direct dispatch marker. -+ * -+ * Non-NULL values are used for direct dispatch from enqueue path. A valid -+ * pointer points to the task currently being enqueued. An ERR_PTR value is used -+ * to indicate that direct dispatch has already happened. 
-+ */ -+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -+ -+/* dispatch queues */ -+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; -+ -+static const struct rhashtable_params dsq_hash_params = { -+ .key_len = 8, -+ .key_offset = offsetof(struct scx_dispatch_q, id), -+ .head_offset = offsetof(struct scx_dispatch_q, hash_node), -+}; -+ -+static struct rhashtable dsq_hash; -+static LLIST_HEAD(dsqs_to_free); -+ -+/* dispatch buf */ -+struct scx_dsp_buf_ent { -+ struct task_struct *task; -+ unsigned long qseq; -+ u64 dsq_id; -+ u64 enq_flags; -+}; -+ -+static u32 scx_dsp_max_batch; -+static struct scx_dsp_buf_ent __percpu *scx_dsp_buf; -+ -+struct scx_dsp_ctx { -+ struct rq *rq; -+ struct rq_flags *rf; -+ u32 buf_cursor; -+ u32 nr_tasks; -+}; -+ -+static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); -+ -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 enq_flags); -+void scx_bpf_kick_cpu(s32 cpu, u64 flags); -+ -+struct scx_task_iter { -+ struct sched_ext_entity cursor; -+ struct task_struct *locked; -+ struct rq *rq; -+ struct rq_flags rf; -+}; -+ -+#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) -+ -+/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ -+static u32 higher_bits(u32 flags) -+{ -+ return ~((1 << fls(flags)) - 1); -+} -+ -+/* return the mask with only the highest bit set */ -+static u32 highest_bit(u32 flags) -+{ -+ int bit = fls(flags); -+ return bit ? 1 << (bit - 1) : 0; -+} -+ -+/* -+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX -+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate -+ * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check -+ * whether it's running from an allowed context. -+ * -+ * @mask is constant, always inline to cull the mask calculations. -+ */ -+static __always_inline void scx_kf_allow(u32 mask) -+{ -+ /* nesting is allowed only in increasing scx_kf_mask order */ -+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, -+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", -+ current->scx.kf_mask, mask); -+ current->scx.kf_mask |= mask; -+} -+ -+static void scx_kf_disallow(u32 mask) -+{ -+ current->scx.kf_mask &= ~mask; -+} -+ -+#define SCX_CALL_OP(mask, op, args...) \ -+do { \ -+ if (mask) { \ -+ scx_kf_allow(mask); \ -+ scx_ops.op(args); \ -+ scx_kf_disallow(mask); \ -+ } else { \ -+ scx_ops.op(args); \ -+ } \ -+} while (0) -+ -+#define SCX_CALL_OP_RET(mask, op, args...) \ -+({ \ -+ __typeof__(scx_ops.op(args)) __ret; \ -+ if (mask) { \ -+ scx_kf_allow(mask); \ -+ __ret = scx_ops.op(args); \ -+ scx_kf_disallow(mask); \ -+ } else { \ -+ __ret = scx_ops.op(args); \ -+ } \ -+ __ret; \ -+}) -+ -+/* -+ * Some kfuncs are allowed only on the tasks that are subjects of the -+ * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such -+ * restrictions, the following SCX_CALL_OP_*() variants should be used when -+ * invoking scx_ops operations that take task arguments. These can only be used -+ * for non-nesting operations due to the way the tasks are tracked. -+ * -+ * kfuncs which can only operate on such tasks can in turn use -+ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on -+ * the specific task. -+ */ -+#define SCX_CALL_OP_TASK(mask, op, task, args...) 
\ -+do { \ -+ BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task; \ -+ SCX_CALL_OP(mask, op, task, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+} while (0) -+ -+#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ -+({ \ -+ __typeof__(scx_ops.op(task, ##args)) __ret; \ -+ BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ __ret; \ -+}) -+ -+#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ -+({ \ -+ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ -+ BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task0; \ -+ current->scx.kf_tasks[1] = task1; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ current->scx.kf_tasks[1] = NULL; \ -+ __ret; \ -+}) -+ -+/* @mask is constant, always inline to cull unnecessary branches */ -+static __always_inline bool scx_kf_allowed(u32 mask) -+{ -+ if (unlikely(!(current->scx.kf_mask & mask))) { -+ scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", -+ mask, current->scx.kf_mask); -+ return false; -+ } -+ -+ if (unlikely((mask & (SCX_KF_INIT | SCX_KF_SLEEPABLE)) && -+ in_interrupt())) { -+ scx_ops_error("sleepable kfunc called from non-sleepable context"); -+ return false; -+ } -+ -+ /* -+ * Enforce nesting boundaries. e.g. A kfunc which can be called from -+ * DISPATCH must not be called if we're running DEQUEUE which is nested -+ * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE -+ * boundary thanks to the above in_interrupt() check. -+ */ -+ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && -+ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { -+ scx_ops_error("cpu_release kfunc called from a nested operation"); -+ return false; -+ } -+ -+ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && -+ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { -+ scx_ops_error("dispatch kfunc called from a nested operation"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/* see SCX_CALL_OP_TASK() */ -+static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, -+ struct task_struct *p) -+{ -+ if (!scx_kf_allowed(__SCX_KF_RQ_LOCKED)) -+ return false; -+ -+ if (unlikely((p != current->scx.kf_tasks[0] && -+ p != current->scx.kf_tasks[1]))) { -+ scx_ops_error("called on a task not being operated on"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/** -+ * scx_task_iter_init - Initialize a task iterator -+ * @iter: iterator to init -+ * -+ * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, -+ * @iter must eventually be exited with scx_task_iter_exit(). -+ * -+ * scx_tasks_lock may be released between this and the first next() call or -+ * between any two next() calls. If scx_tasks_lock is released between two -+ * next() calls, the caller is responsible for ensuring that the task being -+ * iterated remains accessible either through RCU read lock or obtaining a -+ * reference count. -+ * -+ * All tasks which existed when the iteration started are guaranteed to be -+ * visited as long as they still exist. 
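The locking contract above is easier to see with the intended call pattern spelled out. A kernel-side sketch of how the enable/disable paths are expected to walk every task, using the iterator helpers defined in this file (the loop body is illustrative):

	struct scx_task_iter iter;
	struct task_struct *p;

	spin_lock_irq(&scx_tasks_lock);
	scx_task_iter_init(&iter);
	while ((p = scx_task_iter_next_filtered_locked(&iter))) {
		/* p's rq lock is held here; p->scx may be updated safely */
	}
	scx_task_iter_exit(&iter);
	spin_unlock_irq(&scx_tasks_lock);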
-+ */ -+static void scx_task_iter_init(struct scx_task_iter *iter) -+{ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; -+ list_add(&iter->cursor.tasks_node, &scx_tasks); -+ iter->locked = NULL; -+} -+ -+/** -+ * scx_task_iter_exit - Exit a task iterator -+ * @iter: iterator to exit -+ * -+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. -+ * If the iterator holds a task's rq lock, that rq lock is released. See -+ * scx_task_iter_init() for details. -+ */ -+static void scx_task_iter_exit(struct scx_task_iter *iter) -+{ -+ struct list_head *cursor = &iter->cursor.tasks_node; -+ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ if (iter->locked) { -+ task_rq_unlock(iter->rq, iter->locked, &iter->rf); -+ iter->locked = NULL; -+ } -+ -+ if (list_empty(cursor)) -+ return; -+ -+ list_del_init(cursor); -+} -+ -+/** -+ * scx_task_iter_next - Next task -+ * @iter: iterator to walk -+ * -+ * Visit the next task. See scx_task_iter_init() for details. -+ */ -+static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) -+{ -+ struct list_head *cursor = &iter->cursor.tasks_node; -+ struct sched_ext_entity *pos; -+ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ list_for_each_entry(pos, cursor, tasks_node) { -+ if (&pos->tasks_node == &scx_tasks) -+ return NULL; -+ if (!(pos->flags & SCX_TASK_CURSOR)) { -+ list_move(cursor, &pos->tasks_node); -+ return container_of(pos, struct task_struct, scx); -+ } -+ } -+ -+ /* can't happen, should always terminate at scx_tasks above */ -+ BUG(); -+} -+ -+/** -+ * scx_task_iter_next_filtered - Next non-idle task -+ * @iter: iterator to walk -+ * -+ * Visit the next non-idle task. See scx_task_iter_init() for details. -+ */ -+static struct task_struct * -+scx_task_iter_next_filtered(struct scx_task_iter *iter) -+{ -+ struct task_struct *p; -+ -+ while ((p = scx_task_iter_next(iter))) { -+ /* -+ * is_idle_task() tests %PF_IDLE which may not be set for CPUs -+ * which haven't yet been onlined. Test sched_class directly. -+ */ -+ if (p->sched_class != &idle_sched_class) -+ return p; -+ } -+ return NULL; -+} -+ -+/** -+ * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked -+ * @iter: iterator to walk -+ * -+ * Visit the next non-idle task with its rq lock held. See scx_task_iter_init() -+ * for details. 
-+ */ -+static struct task_struct * -+scx_task_iter_next_filtered_locked(struct scx_task_iter *iter) -+{ -+ struct task_struct *p; -+ -+ if (iter->locked) { -+ task_rq_unlock(iter->rq, iter->locked, &iter->rf); -+ iter->locked = NULL; -+ } -+ -+ p = scx_task_iter_next_filtered(iter); -+ if (!p) -+ return NULL; -+ -+ iter->rq = task_rq_lock(p, &iter->rf); -+ iter->locked = p; -+ return p; -+} -+ -+static enum scx_ops_enable_state scx_ops_enable_state(void) -+{ -+ return atomic_read(&scx_ops_enable_state_var); -+} -+ -+static enum scx_ops_enable_state -+scx_ops_set_enable_state(enum scx_ops_enable_state to) -+{ -+ return atomic_xchg(&scx_ops_enable_state_var, to); -+} -+ -+static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, -+ enum scx_ops_enable_state from) -+{ -+ int from_v = from; -+ -+ return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); -+} -+ -+static bool scx_ops_disabling(void) -+{ -+ return unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING); -+} -+ -+/** -+ * wait_ops_state - Busy-wait the specified ops state to end -+ * @p: target task -+ * @opss: state to wait the end of -+ * -+ * Busy-wait for @p to transition out of @opss. This can only be used when the -+ * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also -+ * has load_acquire semantics to ensure that the caller can see the updates made -+ * in the enqueueing and dispatching paths. -+ */ -+static void wait_ops_state(struct task_struct *p, unsigned long opss) -+{ -+ do { -+ cpu_relax(); -+ } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); -+} -+ -+/** -+ * ops_cpu_valid - Verify a cpu number -+ * @cpu: cpu number which came from a BPF ops -+ * -+ * @cpu is a cpu number which came from the BPF scheduler and can be any value. -+ * Verify that it is in range and one of the possible cpus. -+ */ -+static bool ops_cpu_valid(s32 cpu) -+{ -+ return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); -+} -+ -+/** -+ * ops_sanitize_err - Sanitize a -errno value -+ * @ops_name: operation to blame on failure -+ * @err: -errno value to sanitize -+ * -+ * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return -+ * -%EPROTO. This is necessary because returning a rogue -errno up the chain can -+ * cause misbehaviors. For an example, a large negative return from -+ * ops.prep_enable() triggers an oops when passed up the call chain because the -+ * value fails IS_ERR() test after being encoded with ERR_PTR() and then is -+ * handled as a pointer. -+ */ -+static int ops_sanitize_err(const char *ops_name, s32 err) -+{ -+ if (err < 0 && err >= -MAX_ERRNO) -+ return err; -+ -+ scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); -+ return -EPROTO; -+} -+ -+/** -+ * touch_core_sched - Update timestamp used for core-sched task ordering -+ * @rq: rq to read clock from, must be locked -+ * @p: task to update the timestamp for -+ * -+ * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to -+ * implement global or local-DSQ FIFO ordering for core-sched. Should be called -+ * when a task becomes runnable and its turn on the CPU ends (e.g. slice -+ * exhaustion). -+ */ -+static void touch_core_sched(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SCHED_CORE -+ /* -+ * It's okay to update the timestamp spuriously. Use -+ * sched_core_disabled() which is cheaper than enabled(). 
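The range check in ops_sanitize_err() exists because only values in [-MAX_ERRNO, -1] survive the kernel's ERR_PTR()/IS_ERR() encoding; a larger negative number stops looking like an error once it has been packed into a pointer. A standalone sketch of that failure mode, using local stand-ins for ERR_PTR()/IS_ERR() (MAX_ERRNO is 4095 as in the kernel, everything else here is illustrative):

#include <stdio.h>
#include <stdint.h>

#define MAX_ERRNO	4095

/* local stand-ins for the kernel's ERR_PTR()/IS_ERR() */
static void *err_ptr(intptr_t err)	{ return (void *)err; }
static int is_err(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

int main(void)
{
	/* a sane errno survives the round trip and is still detected */
	printf("-EINVAL (-22)  -> is_err=%d\n", is_err(err_ptr(-22)));

	/* a rogue large negative return looks like a valid pointer instead */
	printf("bogus (-70000) -> is_err=%d\n", is_err(err_ptr(-70000)));
	return 0;
}

The second value is exactly the kind of return that ops_sanitize_err() rewrites to -EPROTO before it can be mistaken for a valid pointer further up the call chain.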
-+ */ -+ if (!sched_core_disabled()) -+ p->scx.core_sched_at = rq_clock_task(rq); -+#endif -+} -+ -+/** -+ * touch_core_sched_dispatch - Update core-sched timestamp on dispatch -+ * @rq: rq to read clock from, must be locked -+ * @p: task being dispatched -+ * -+ * If the BPF scheduler implements custom core-sched ordering via -+ * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO -+ * ordering within each local DSQ. This function is called from dispatch paths -+ * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. -+ */ -+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ assert_clock_updated(rq); -+ -+#ifdef CONFIG_SCHED_CORE -+ if (SCX_HAS_OP(core_sched_before)) -+ touch_core_sched(rq, p); -+#endif -+} -+ -+static void update_curr_scx(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ u64 now = rq_clock_task(rq); -+ u64 delta_exec; -+ -+ if (time_before_eq64(now, curr->se.exec_start)) -+ return; -+ -+ delta_exec = now - curr->se.exec_start; -+ curr->se.exec_start = now; -+ curr->se.sum_exec_runtime += delta_exec; -+ account_group_exec_runtime(curr, delta_exec); -+ cgroup_account_cputime(curr, delta_exec); -+ -+ if (curr->scx.slice != SCX_SLICE_INF) { -+ curr->scx.slice -= min(curr->scx.slice, delta_exec); -+ if (!curr->scx.slice) -+ touch_core_sched(rq, curr); -+ } -+} -+ -+static bool scx_dsq_priq_less(struct rb_node *node_a, -+ const struct rb_node *node_b) -+{ -+ const struct task_struct *a = -+ container_of(node_a, struct task_struct, scx.dsq_node.priq); -+ const struct task_struct *b = -+ container_of(node_b, struct task_struct, scx.dsq_node.priq); -+ -+ return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); -+} -+ -+static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, -+ u64 enq_flags) -+{ -+ bool is_local = dsq->id == SCX_DSQ_LOCAL; -+ -+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); -+ WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || -+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); -+ -+ if (!is_local) { -+ raw_spin_lock(&dsq->lock); -+ if (unlikely(dsq->id == SCX_DSQ_INVALID)) { -+ scx_ops_error("attempting to dispatch to a destroyed dsq"); -+ /* fall back to the global dsq */ -+ raw_spin_unlock(&dsq->lock); -+ dsq = &scx_dsq_global; -+ raw_spin_lock(&dsq->lock); -+ } -+ } -+ -+ if (enq_flags & SCX_ENQ_DSQ_PRIQ) { -+ p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; -+ rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, -+ scx_dsq_priq_less); -+ } else { -+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) -+ list_add(&p->scx.dsq_node.fifo, &dsq->fifo); -+ else -+ list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo); -+ } -+ dsq->nr++; -+ p->scx.dsq = dsq; -+ -+ /* -+ * We're transitioning out of QUEUEING or DISPATCHING. store_release to -+ * match waiters' load_acquire. 
-+ */ -+ if (enq_flags & SCX_ENQ_CLEAR_OPSS) -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ if (is_local) { -+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); -+ bool preempt = false; -+ -+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && -+ rq->curr->sched_class == &ext_sched_class) { -+ rq->curr->scx.slice = 0; -+ preempt = true; -+ } -+ -+ if (preempt || sched_class_above(&ext_sched_class, -+ rq->curr->sched_class)) -+ resched_curr(rq); -+ } else { -+ raw_spin_unlock(&dsq->lock); -+ } -+} -+ -+static void task_unlink_from_dsq(struct task_struct *p, -+ struct scx_dispatch_q *dsq) -+{ -+ if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { -+ rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); -+ RB_CLEAR_NODE(&p->scx.dsq_node.priq); -+ p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; -+ } else { -+ list_del_init(&p->scx.dsq_node.fifo); -+ } -+} -+ -+static bool task_linked_on_dsq(struct task_struct *p) -+{ -+ return !list_empty(&p->scx.dsq_node.fifo) || -+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq); -+} -+ -+static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq = p->scx.dsq; -+ bool is_local = dsq == &scx_rq->local_dsq; -+ -+ if (!dsq) { -+ WARN_ON_ONCE(task_linked_on_dsq(p)); -+ /* -+ * When dispatching directly from the BPF scheduler to a local -+ * DSQ, the task isn't associated with any DSQ but -+ * @p->scx.holding_cpu may be set under the protection of -+ * %SCX_OPSS_DISPATCHING. -+ */ -+ if (p->scx.holding_cpu >= 0) -+ p->scx.holding_cpu = -1; -+ return; -+ } -+ -+ if (!is_local) -+ raw_spin_lock(&dsq->lock); -+ -+ /* -+ * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node -+ * can't change underneath us. -+ */ -+ if (p->scx.holding_cpu < 0) { -+ /* @p must still be on @dsq, dequeue */ -+ WARN_ON_ONCE(!task_linked_on_dsq(p)); -+ task_unlink_from_dsq(p, dsq); -+ dsq->nr--; -+ } else { -+ /* -+ * We're racing against dispatch_to_local_dsq() which already -+ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the -+ * holding_cpu which tells dispatch_to_local_dsq() that it lost -+ * the race. 
-+ */ -+ WARN_ON_ONCE(task_linked_on_dsq(p)); -+ p->scx.holding_cpu = -1; -+ } -+ p->scx.dsq = NULL; -+ -+ if (!is_local) -+ raw_spin_unlock(&dsq->lock); -+} -+ -+static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -+{ -+ lockdep_assert(rcu_read_lock_any_held()); -+ -+ if (dsq_id == SCX_DSQ_GLOBAL) -+ return &scx_dsq_global; -+ else -+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, -+ dsq_hash_params); -+} -+ -+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, -+ struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ if (dsq_id == SCX_DSQ_LOCAL) -+ return &rq->scx.local_dsq; -+ -+ dsq = find_non_local_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", -+ dsq_id, p->comm, p->pid); -+ return &scx_dsq_global; -+ } -+ -+ return dsq; -+} -+ -+static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p, -+ u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ /* @p must match the task which is being enqueued */ -+ if (unlikely(p != ddsp_task)) { -+ if (IS_ERR(ddsp_task)) -+ scx_ops_error("%s[%d] already direct-dispatched", -+ p->comm, p->pid); -+ else -+ scx_ops_error("enqueueing %s[%d] but trying to direct-dispatch %s[%d]", -+ ddsp_task->comm, ddsp_task->pid, -+ p->comm, p->pid); -+ return; -+ } -+ -+ /* -+ * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because -+ * dispatching to the local DSQ of a different CPU requires unlocking -+ * the current rq which isn't allowed in the enqueue path. Use -+ * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. -+ */ -+ if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { -+ scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); -+ return; -+ } -+ -+ touch_core_sched_dispatch(task_rq(p), p); -+ -+ dsq = find_dsq_for_dispatch(task_rq(p), dsq_id, p); -+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ -+ /* -+ * Mark that dispatch already happened by spoiling direct_dispatch_task -+ * with a non-NULL value which can never match a valid task pointer. -+ */ -+ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); -+} -+ -+static bool test_rq_online(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->online; -+#else -+ return true; -+#endif -+} -+ -+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, -+ int sticky_cpu) -+{ -+ struct task_struct **ddsp_taskp; -+ unsigned long qseq; -+ -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ if (p->scx.flags & SCX_TASK_ENQ_LOCAL) { -+ enq_flags |= SCX_ENQ_LOCAL; -+ p->scx.flags &= ~SCX_TASK_ENQ_LOCAL; -+ } -+ -+ /* rq migration */ -+ if (sticky_cpu == cpu_of(rq)) -+ goto local_norefill; -+ -+ /* -+ * If !rq->online, we already told the BPF scheduler that the CPU is -+ * offline. We're just trying to on/offline the CPU. Don't bother the -+ * BPF scheduler. 
-+ */ -+ if (unlikely(!test_rq_online(rq))) -+ goto local; -+ -+ /* see %SCX_OPS_ENQ_EXITING */ -+ if (!static_branch_unlikely(&scx_ops_enq_exiting) && -+ unlikely(p->flags & PF_EXITING)) -+ goto local; -+ -+ /* see %SCX_OPS_ENQ_LAST */ -+ if (!static_branch_unlikely(&scx_ops_enq_last) && -+ (enq_flags & SCX_ENQ_LAST)) -+ goto local; -+ -+ if (!SCX_HAS_OP(enqueue)) { -+ if (enq_flags & SCX_ENQ_LOCAL) -+ goto local; -+ else -+ goto global; -+ } -+ -+ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ -+ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; -+ -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); -+ -+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); -+ WARN_ON_ONCE(*ddsp_taskp); -+ *ddsp_taskp = p; -+ -+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); -+ -+ /* -+ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or -+ * dequeue may be waiting. The store_release matches their load_acquire. -+ */ -+ if (*ddsp_taskp == p) -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); -+ *ddsp_taskp = NULL; -+ return; -+ -+local: -+ /* -+ * For task-ordering, slice refill must be treated as implying the end -+ * of the current slice. Otherwise, the longer @p stays on the CPU, the -+ * higher priority it becomes from scx_prio_less()'s POV. -+ */ -+ touch_core_sched(rq, p); -+ p->scx.slice = SCX_SLICE_DFL; -+local_norefill: -+ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); -+ return; -+ -+global: -+ touch_core_sched(rq, p); /* see the comment in local: */ -+ p->scx.slice = SCX_SLICE_DFL; -+ dispatch_enqueue(&scx_dsq_global, p, enq_flags); -+} -+ -+static bool watchdog_task_watched(const struct task_struct *p) -+{ -+ return !list_empty(&p->scx.watchdog_node); -+} -+ -+static void watchdog_watch_task(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) -+ p->scx.runnable_at = jiffies; -+ p->scx.flags &= ~SCX_TASK_WATCHDOG_RESET; -+ list_add_tail(&p->scx.watchdog_node, &rq->scx.watchdog_list); -+} -+ -+static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) -+{ -+ list_del_init(&p->scx.watchdog_node); -+ if (reset_timeout) -+ p->scx.flags |= SCX_TASK_WATCHDOG_RESET; -+} -+ -+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) -+{ -+ int sticky_cpu = p->scx.sticky_cpu; -+ -+ enq_flags |= rq->scx.extra_enq_flags; -+ -+ if (sticky_cpu >= 0) -+ p->scx.sticky_cpu = -1; -+ -+ /* -+ * Restoring a running task will be immediately followed by -+ * set_next_task_scx() which expects the task to not be on the BPF -+ * scheduler as tasks can only start running through local DSQs. Force -+ * direct-dispatch into the local DSQ by setting the sticky_cpu. 
-+ */ -+ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) -+ sticky_cpu = cpu_of(rq); -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ WARN_ON_ONCE(!watchdog_task_watched(p)); -+ return; -+ } -+ -+ watchdog_watch_task(rq, p); -+ p->scx.flags |= SCX_TASK_QUEUED; -+ rq->scx.nr_running++; -+ add_nr_running(rq, 1); -+ -+ if (SCX_HAS_OP(runnable)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); -+ -+ if (enq_flags & SCX_ENQ_WAKEUP) -+ touch_core_sched(rq, p); -+ -+ do_enqueue_task(rq, p, enq_flags, sticky_cpu); -+} -+ -+static void ops_dequeue(struct task_struct *p, u64 deq_flags) -+{ -+ unsigned long opss; -+ -+ watchdog_unwatch_task(p, false); -+ -+ /* acquire ensures that we see the preceding updates on QUEUED */ -+ opss = atomic_long_read_acquire(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_NONE: -+ break; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * QUEUEING is started and finished while holding @p's rq lock. -+ * As we're holding the rq lock now, we shouldn't see QUEUEING. -+ */ -+ BUG(); -+ case SCX_OPSS_QUEUED: -+ if (SCX_HAS_OP(dequeue)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); -+ -+ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_NONE)) -+ break; -+ fallthrough; -+ case SCX_OPSS_DISPATCHING: -+ /* -+ * If @p is being dispatched from the BPF scheduler to a DSQ, -+ * wait for the transfer to complete so that @p doesn't get -+ * added to its DSQ after dequeueing is complete. -+ * -+ * As we're waiting on DISPATCHING with the rq locked, the -+ * dispatching side shouldn't try to lock the rq while -+ * DISPATCHING is set. See dispatch_to_local_dsq(). -+ * -+ * DISPATCHING shouldn't have qseq set and control can reach -+ * here with NONE @opss from the above QUEUED case block. -+ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. -+ */ -+ wait_ops_state(p, SCX_OPSS_DISPATCHING); -+ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ break; -+ } -+} -+ -+static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) -+{ -+ struct scx_rq *scx_rq = &rq->scx; -+ -+ if (!(p->scx.flags & SCX_TASK_QUEUED)) { -+ WARN_ON_ONCE(watchdog_task_watched(p)); -+ return; -+ } -+ -+ ops_dequeue(p, deq_flags); -+ -+ /* -+ * A currently running task which is going off @rq first gets dequeued -+ * and then stops running. As we want running <-> stopping transitions -+ * to be contained within runnable <-> quiescent transitions, trigger -+ * ->stopping() early here instead of in put_prev_task_scx(). -+ * -+ * @p may go through multiple stopping <-> running transitions between -+ * here and put_prev_task_scx() if task attribute changes occur while -+ * balance_scx() leaves @rq unlocked. However, they don't contain any -+ * information meaningful to the BPF scheduler and can be suppressed by -+ * skipping the callbacks if the task is !QUEUED. 
-+	 */
-+	if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
-+		update_curr_scx(rq);
-+		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
-+	}
-+
-+	if (SCX_HAS_OP(quiescent))
-+		SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
-+
-+	if (deq_flags & SCX_DEQ_SLEEP)
-+		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
-+	else
-+		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
-+
-+	p->scx.flags &= ~SCX_TASK_QUEUED;
-+	scx_rq->nr_running--;
-+	sub_nr_running(rq, 1);
-+
-+	dispatch_dequeue(scx_rq, p);
-+}
-+
-+static void yield_task_scx(struct rq *rq)
-+{
-+	struct task_struct *p = rq->curr;
-+
-+	if (SCX_HAS_OP(yield))
-+		SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
-+	else
-+		p->scx.slice = 0;
-+}
-+
-+static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
-+{
-+	struct task_struct *from = rq->curr;
-+
-+	if (SCX_HAS_OP(yield))
-+		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
-+	else
-+		return false;
-+}
-+
-+#ifdef CONFIG_SMP
-+/**
-+ * move_task_to_local_dsq - Move a task from a different rq to a local DSQ
-+ * @rq: rq to move the task into, currently locked
-+ * @p: task to move
-+ * @enq_flags: %SCX_ENQ_*
-+ *
-+ * Move @p which is currently on a different rq to @rq's local DSQ. The caller
-+ * must:
-+ *
-+ * 1. Start with exclusive access to @p either through its DSQ lock or
-+ *    %SCX_OPSS_DISPATCHING flag.
-+ *
-+ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id().
-+ *
-+ * 3. Remember task_rq(@p). Release the exclusive access so that we don't
-+ *    deadlock with dequeue.
-+ *
-+ * 4. Lock @rq and the task_rq from #3.
-+ *
-+ * 5. Call this function.
-+ *
-+ * Returns %true if @p was successfully moved. %false after racing dequeue and
-+ * losing.
-+ */
-+static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
-+				   u64 enq_flags)
-+{
-+	struct rq *task_rq;
-+
-+	lockdep_assert_rq_held(rq);
-+
-+	/*
-+	 * If dequeue got to @p while we were trying to lock both rq's, it'd
-+	 * have cleared @p->scx.holding_cpu to -1. While other cpus may have
-+	 * updated it to different values afterwards, as this operation can't be
-+	 * preempted or recurse, @p->scx.holding_cpu can never become
-+	 * raw_smp_processor_id() again before we're done. Thus, we can tell
-+	 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
-+	 * still raw_smp_processor_id().
-+	 *
-+	 * See dispatch_dequeue() for the counterpart.
-+	 */
-+	if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
-+		return false;
-+
-+	/* @p->rq couldn't have changed if we're still the holding cpu */
-+	task_rq = task_rq(p);
-+	lockdep_assert_rq_held(task_rq);
-+
-+	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
-+	deactivate_task(task_rq, p, 0);
-+	set_task_cpu(p, cpu_of(rq));
-+	p->scx.sticky_cpu = cpu_of(rq);
-+
-+	/*
-+	 * We want to pass scx-specific enq_flags but activate_task() will
-+	 * truncate the upper 32 bit. As we own @rq, we can pass them through
-+	 * @rq->scx.extra_enq_flags instead.
-+	 */
-+	WARN_ON_ONCE(rq->scx.extra_enq_flags);
-+	rq->scx.extra_enq_flags = enq_flags;
-+	activate_task(rq, p, 0);
-+	rq->scx.extra_enq_flags = 0;
-+
-+	return true;
-+}
-+
-+/**
-+ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked
-+ * @rq: current rq which is locked
-+ * @rf: rq_flags to use when unlocking @rq
-+ * @src_rq: rq to move task from
-+ * @dst_rq: rq to move task to
-+ *
-+ * We're holding @rq lock and trying to dispatch a task from @src_rq to
-+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq.
Whether -+ * @rq stays locked isn't important as long as the state is restored after -+ * dispatch_to_local_dsq_unlock(). -+ */ -+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ rq_unpin_lock(rq, rf); -+ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(rq); -+ raw_spin_rq_lock(dst_rq); -+ } else if (rq == src_rq) { -+ double_lock_balance(rq, dst_rq); -+ rq_repin_lock(rq, rf); -+ } else if (rq == dst_rq) { -+ double_lock_balance(rq, src_rq); -+ rq_repin_lock(rq, rf); -+ } else { -+ raw_spin_rq_unlock(rq); -+ double_rq_lock(src_rq, dst_rq); -+ } -+} -+ -+/** -+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to -+ * -+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. -+ */ -+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(dst_rq); -+ raw_spin_rq_lock(rq); -+ rq_repin_lock(rq, rf); -+ } else if (rq == src_rq) { -+ double_unlock_balance(rq, dst_rq); -+ } else if (rq == dst_rq) { -+ double_unlock_balance(rq, src_rq); -+ } else { -+ double_rq_unlock(src_rq, dst_rq); -+ raw_spin_rq_lock(rq); -+ rq_repin_lock(rq, rf); -+ } -+} -+#endif /* CONFIG_SMP */ -+ -+ -+static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) -+{ -+ return likely(test_rq_online(rq)) && !is_migration_disabled(p) && -+ cpumask_test_cpu(cpu_of(rq), p->cpus_ptr); -+} -+ -+static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq) -+{ -+ struct scx_rq *scx_rq = &rq->scx; -+ struct task_struct *p; -+ struct rb_node *rb_node; -+ struct rq *task_rq; -+ bool moved = false; -+retry: -+ if (list_empty(&dsq->fifo) && !rb_first_cached(&dsq->priq)) -+ return false; -+ -+ raw_spin_lock(&dsq->lock); -+ -+ list_for_each_entry(p, &dsq->fifo, scx.dsq_node.fifo) { -+ task_rq = task_rq(p); -+ if (rq == task_rq) -+ goto this_rq; -+ if (task_can_run_on_rq(p, rq)) -+ goto remote_rq; -+ } -+ -+ for (rb_node = rb_first_cached(&dsq->priq); rb_node; -+ rb_node = rb_next(rb_node)) { -+ p = container_of(rb_node, struct task_struct, scx.dsq_node.priq); -+ task_rq = task_rq(p); -+ if (rq == task_rq) -+ goto this_rq; -+ if (task_can_run_on_rq(p, rq)) -+ goto remote_rq; -+ } -+ -+ raw_spin_unlock(&dsq->lock); -+ return false; -+ -+this_rq: -+ /* @dsq is locked and @p is on this rq */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ list_add_tail(&p->scx.dsq_node.fifo, &scx_rq->local_dsq.fifo); -+ dsq->nr--; -+ scx_rq->local_dsq.nr++; -+ p->scx.dsq = &scx_rq->local_dsq; -+ raw_spin_unlock(&dsq->lock); -+ return true; -+ -+remote_rq: -+#ifdef CONFIG_SMP -+ /* -+ * @dsq is locked and @p is on a remote rq. @p is currently protected by -+ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab -+ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the -+ * rq lock or fail, do a little dancing from our side. See -+ * move_task_to_local_dsq(). 
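The holding_cpu handshake used by consume_dispatch_q(), move_task_to_local_dsq() and dispatch_dequeue() reduces to: the puller stamps the task with its own CPU id before dropping the DSQ lock, a racing dequeue clears the stamp back to -1, and the puller only completes the move if the stamp is still its own after reacquiring the locks. A single-threaded sketch of just that claim/lose logic (all names invented, no locking modeled):

#include <stdio.h>

struct fake_task { int holding_cpu; };		/* -1 means unclaimed */

static int try_pull(struct fake_task *p, int my_cpu, int dequeue_intervenes)
{
	p->holding_cpu = my_cpu;	/* claim while the DSQ lock is held */

	if (dequeue_intervenes)		/* dequeue ran while locks were juggled */
		p->holding_cpu = -1;	/* ...and cleared the claim */

	if (p->holding_cpu != my_cpu)	/* re-check after relocking */
		return 0;		/* lost the race, give up */

	p->holding_cpu = -1;		/* move succeeded, drop the claim */
	return 1;
}

int main(void)
{
	struct fake_task t = { .holding_cpu = -1 };

	printf("no race:   moved=%d\n", try_pull(&t, 3, 0));
	printf("with race: moved=%d\n", try_pull(&t, 3, 1));
	return 0;
}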
-+ */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ dsq->nr--; -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ raw_spin_unlock(&dsq->lock); -+ -+ rq_unpin_lock(rq, rf); -+ double_lock_balance(rq, task_rq); -+ rq_repin_lock(rq, rf); -+ -+ moved = move_task_to_local_dsq(rq, p, 0); -+ -+ double_unlock_balance(rq, task_rq); -+#endif /* CONFIG_SMP */ -+ if (likely(moved)) -+ return true; -+ goto retry; -+} -+ -+enum dispatch_to_local_dsq_ret { -+ DTL_DISPATCHED, /* successfully dispatched */ -+ DTL_LOST, /* lost race to dequeue */ -+ DTL_NOT_LOCAL, /* destination is not a local DSQ */ -+ DTL_INVALID, /* invalid local dsq_id */ -+}; -+ -+/** -+ * dispatch_to_local_dsq - Dispatch a task to a local dsq -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @dsq_id: destination dsq ID -+ * @p: task to dispatch -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by -+ * @dsq_id. This function performs all the synchronization dancing needed -+ * because local DSQs are protected with rq locks. -+ * -+ * The caller must have exclusive ownership of @p (e.g. through -+ * %SCX_OPSS_DISPATCHING). -+ */ -+static enum dispatch_to_local_dsq_ret -+dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, -+ struct task_struct *p, u64 enq_flags) -+{ -+ struct rq *src_rq = task_rq(p); -+ struct rq *dst_rq; -+ -+ /* -+ * We're synchronized against dequeue through DISPATCHING. As @p can't -+ * be dequeued, its task_rq and cpus_allowed are stable too. -+ */ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ dst_rq = rq; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (!ops_cpu_valid(cpu)) { -+ scx_ops_error("invalid cpu %d in SCX_DSQ_LOCAL_ON verdict for %s[%d]", -+ cpu, p->comm, p->pid); -+ return DTL_INVALID; -+ } -+ dst_rq = cpu_rq(cpu); -+ } else { -+ return DTL_NOT_LOCAL; -+ } -+ -+ /* if dispatching to @rq that @p is already on, no lock dancing needed */ -+ if (rq == src_rq && rq == dst_rq) { -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags | SCX_ENQ_CLEAR_OPSS); -+ return DTL_DISPATCHED; -+ } -+ -+#ifdef CONFIG_SMP -+ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { -+ struct rq *locked_dst_rq = dst_rq; -+ bool dsp; -+ -+ /* -+ * @p is on a possibly remote @src_rq which we need to lock to -+ * move the task. If dequeue is in progress, it'd be locking -+ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq -+ * lock while holding DISPATCHING. -+ * -+ * As DISPATCHING guarantees that @p is wholly ours, we can -+ * pretend that we're moving from a DSQ and use the same -+ * mechanism - mark the task under transfer with holding_cpu, -+ * release DISPATCHING and then follow the same protocol. -+ */ -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ -+ /* store_release ensures that dequeue sees the above */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); -+ -+ /* -+ * We don't require the BPF scheduler to avoid dispatching to -+ * offline CPUs mostly for convenience but also because CPUs can -+ * go offline between scx_bpf_dispatch() calls and here. If @p -+ * is destined to an offline CPU, queue it on its current CPU -+ * instead, which should always be safe. As this is an allowed -+ * behavior, don't trigger an ops error. 
-+ */ -+ if (unlikely(!test_rq_online(dst_rq))) -+ dst_rq = src_rq; -+ -+ if (src_rq == dst_rq) { -+ /* -+ * As @p is staying on the same rq, there's no need to -+ * go through the full deactivate/activate cycle. -+ * Optimize by abbreviating the operations in -+ * move_task_to_local_dsq(). -+ */ -+ dsp = p->scx.holding_cpu == raw_smp_processor_id(); -+ if (likely(dsp)) { -+ p->scx.holding_cpu = -1; -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags); -+ } -+ } else { -+ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); -+ } -+ -+ /* if the destination CPU is idle, wake it up */ -+ if (dsp && p->sched_class > dst_rq->curr->sched_class) -+ resched_curr(dst_rq); -+ -+ dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); -+ -+ return dsp ? DTL_DISPATCHED : DTL_LOST; -+ } -+#endif /* CONFIG_SMP */ -+ -+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", -+ cpu_of(dst_rq), p->comm, p->pid); -+ return DTL_INVALID; -+} -+ -+/** -+ * finish_dispatch - Asynchronously finish dispatching a task -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @p: task to finish dispatching -+ * @qseq_at_dispatch: qseq when @p started getting dispatched -+ * @dsq_id: destination DSQ ID -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * Dispatching to local DSQs may need to wait for queueing to complete or -+ * require rq lock dancing. As we don't wanna do either while inside -+ * ops.dispatch() to avoid locking order inversion, we split dispatching into -+ * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the -+ * task and its qseq. Once ops.dispatch() returns, this function is called to -+ * finish up. -+ * -+ * There is no guarantee that @p is still valid for dispatching or even that it -+ * was valid in the first place. Make sure that the task is still owned by the -+ * BPF scheduler and claim the ownership before dispatching. -+ */ -+static void finish_dispatch(struct rq *rq, struct rq_flags *rf, -+ struct task_struct *p, -+ unsigned long qseq_at_dispatch, -+ u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long opss; -+ -+ touch_core_sched_dispatch(rq, p); -+retry: -+ /* -+ * No need for _acquire here. @p is accessed only after a successful -+ * try_cmpxchg to DISPATCHING. -+ */ -+ opss = atomic_long_read(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_DISPATCHING: -+ case SCX_OPSS_NONE: -+ /* someone else already got to it */ -+ return; -+ case SCX_OPSS_QUEUED: -+ /* -+ * If qseq doesn't match, @p has gone through at least one -+ * dispatch/dequeue and re-enqueue cycle between -+ * scx_bpf_dispatch() and here and we have no claim on it. -+ */ -+ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) -+ return; -+ -+ /* -+ * While we know @p is accessible, we don't yet have a claim on -+ * it - the BPF scheduler is allowed to dispatch tasks -+ * spuriously and there can be a racing dequeue attempt. Let's -+ * claim @p by atomically transitioning it from QUEUED to -+ * DISPATCHING. -+ */ -+ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_DISPATCHING))) -+ break; -+ goto retry; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * do_enqueue_task() is in the process of transferring the task -+ * to the BPF scheduler while holding @p's rq lock. As we aren't -+ * holding any kernel or BPF resource that the enqueue path may -+ * depend upon, it's safe to wait. 
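finish_dispatch() only trusts a task it can move from QUEUED to DISPATCHING while the qseq recorded at scx_bpf_dispatch() time still matches; any other observation means the task was requeued or claimed elsewhere. A standalone sketch of that packed state word and the claim attempt, using C11 atomics and invented constants in place of the SCX_OPSS_* definitions (the low-bits-state / high-bits-qseq split is only assumed here):

#include <stdatomic.h>
#include <stdio.h>

/* stand-ins: low two bits hold the state, the remaining bits hold qseq */
enum { OPSS_NONE, OPSS_QUEUEING, OPSS_QUEUED, OPSS_DISPATCHING };
#define OPSS_STATE_MASK	0x3UL
#define OPSS_QSEQ_SHIFT	2

static _Atomic unsigned long ops_state;

/* returns 1 if the caller now owns the task for dispatching */
static int claim_for_dispatch(unsigned long qseq_at_dispatch)
{
	unsigned long old = atomic_load(&ops_state);

	if ((old & OPSS_STATE_MASK) != OPSS_QUEUED ||
	    (old & ~OPSS_STATE_MASK) != qseq_at_dispatch)
		return 0;	/* requeued or already claimed */

	/* QUEUED -> DISPATCHING; fails if anything changed in between */
	return atomic_compare_exchange_strong(&ops_state, &old,
					      (unsigned long)OPSS_DISPATCHING);
}

int main(void)
{
	unsigned long qseq = 7UL << OPSS_QSEQ_SHIFT;

	atomic_store(&ops_state, OPSS_QUEUED | qseq);
	printf("matching qseq: claimed=%d\n", claim_for_dispatch(qseq));

	atomic_store(&ops_state, OPSS_QUEUED | (8UL << OPSS_QSEQ_SHIFT));
	printf("stale qseq:    claimed=%d\n", claim_for_dispatch(qseq));
	return 0;
}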
-+ */ -+ wait_ops_state(p, opss); -+ goto retry; -+ } -+ -+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { -+ case DTL_DISPATCHED: -+ break; -+ case DTL_LOST: -+ break; -+ case DTL_INVALID: -+ dsq_id = SCX_DSQ_GLOBAL; -+ fallthrough; -+ case DTL_NOT_LOCAL: -+ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), -+ dsq_id, p); -+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ break; -+ } -+} -+ -+static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); -+ u32 u; -+ -+ for (u = 0; u < dspc->buf_cursor; u++) { -+ struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u]; -+ -+ finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, -+ ent->enq_flags); -+ } -+ -+ dspc->nr_tasks += dspc->buf_cursor; -+ dspc->buf_cursor = 0; -+} -+ -+static int balance_one(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf, bool local) -+{ -+ struct scx_rq *scx_rq = &rq->scx; -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); -+ bool prev_on_scx = prev->sched_class == &ext_sched_class; -+ int nr_loops = SCX_DSP_MAX_LOOPS; -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (static_branch_unlikely(&scx_ops_cpu_preempt) && -+ unlikely(rq->scx.cpu_released)) { -+ /* -+ * If the previous sched_class for the current CPU was not SCX, -+ * notify the BPF scheduler that it again has control of the -+ * core. This callback complements ->cpu_release(), which is -+ * emitted in scx_notify_pick_next_task(). -+ */ -+ if (SCX_HAS_OP(cpu_acquire)) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq), -+ NULL); -+ rq->scx.cpu_released = false; -+ } -+ -+ if (prev_on_scx) { -+ WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); -+ update_curr_scx(rq); -+ -+ /* -+ * If @prev is runnable & has slice left, it has priority and -+ * fetching more just increases latency for the fetched tasks. -+ * Tell put_prev_task_scx() to put @prev on local_dsq. If the -+ * BPF scheduler wants to handle this explicitly, it should -+ * implement ->cpu_released(). -+ * -+ * See scx_ops_disable_workfn() for the explanation on the -+ * disabling() test. -+ * -+ * When balancing a remote CPU for core-sched, there won't be a -+ * following put_prev_task_scx() call and we don't own -+ * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the -+ * same conditions later and pick @rq->curr accordingly. -+ */ -+ if ((prev->scx.flags & SCX_TASK_QUEUED) && -+ prev->scx.slice && !scx_ops_disabling()) { -+ if (local) -+ prev->scx.flags |= SCX_TASK_BAL_KEEP; -+ return 1; -+ } -+ } -+ -+ /* if there already are tasks to run, nothing to do */ -+ if (scx_rq->local_dsq.nr) -+ return 1; -+ -+ if (consume_dispatch_q(rq, rf, &scx_dsq_global)) -+ return 1; -+ -+ if (!SCX_HAS_OP(dispatch)) -+ return 0; -+ -+ dspc->rq = rq; -+ dspc->rf = rf; -+ -+ /* -+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, -+ * the local DSQ might still end up empty after a successful -+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() -+ * produced some tasks, retry. The BPF scheduler may depend on this -+ * looping behavior to simplify its implementation. -+ */ -+ do { -+ dspc->nr_tasks = 0; -+ -+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), -+ prev_on_scx ? 
prev : NULL); -+ -+ flush_dispatch_buf(rq, rf); -+ -+ if (scx_rq->local_dsq.nr) -+ return 1; -+ if (consume_dispatch_q(rq, rf, &scx_dsq_global)) -+ return 1; -+ -+ /* -+ * ops.dispatch() can trap us in this loop by repeatedly -+ * dispatching ineligible tasks. Break out once in a while to -+ * allow the watchdog to run. As IRQ can't be enabled in -+ * balance(), we want to complete this scheduling cycle and then -+ * start a new one. IOW, we want to call resched_curr() on the -+ * next, most likely idle, task, not the current one. Use -+ * scx_bpf_kick_cpu() for deferred kicking. -+ */ -+ if (unlikely(!--nr_loops)) { -+ scx_bpf_kick_cpu(cpu_of(rq), 0); -+ break; -+ } -+ } while (dspc->nr_tasks); -+ -+ return 0; -+} -+ -+static int balance_scx(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf) -+{ -+ int ret; -+ -+ ret = balance_one(rq, prev, rf, true); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When core-sched is enabled, this ops.balance() call will be followed -+ * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() -+ * on the SMT siblings. Balance the siblings too. -+ */ -+ if (sched_core_enabled(rq)) { -+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); -+ int scpu; -+ -+ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { -+ struct rq *srq = cpu_rq(scpu); -+ struct rq_flags srf; -+ struct task_struct *sprev = srq->curr; -+ -+ /* -+ * While core-scheduling, rq lock is shared among -+ * siblings but the debug annotations and rq clock -+ * aren't. Do pinning dance to transfer the ownership. -+ */ -+ WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); -+ rq_unpin_lock(rq, rf); -+ rq_pin_lock(srq, &srf); -+ -+ update_rq_clock(srq); -+ balance_one(srq, sprev, &srf, false); -+ -+ rq_unpin_lock(srq, &srf); -+ rq_repin_lock(rq, rf); -+ } -+ } -+#endif -+ return ret; -+} -+ -+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) -+{ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ /* -+ * Core-sched might decide to execute @p before it is -+ * dispatched. Call ops_dequeue() to notify the BPF scheduler. -+ */ -+ ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); -+ dispatch_dequeue(&rq->scx, p); -+ } -+ -+ p->se.exec_start = rq_clock_task(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, running, p); -+ -+ watchdog_unwatch_task(p, true); -+ -+ /* -+ * @p is getting newly scheduled or got kicked after someone updated its -+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). -+ */ -+ if ((p->scx.slice == SCX_SLICE_INF) != -+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { -+ if (p->scx.slice == SCX_SLICE_INF) -+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; -+ else -+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; -+ -+ sched_update_tick_dependency(rq); -+ } -+} -+ -+static void put_prev_task_scx(struct rq *rq, struct task_struct *p) -+{ -+#ifndef CONFIG_SMP -+ /* -+ * UP workaround. -+ * -+ * Because SCX may transfer tasks across CPUs during dispatch, dispatch -+ * is performed from its balance operation which isn't called in UP. -+ * Let's work around by calling it from the operations which come right -+ * after. -+ * -+ * 1. If the prev task is on SCX, pick_next_task() calls -+ * .put_prev_task() right after. 
As .put_prev_task() is also called -+ * from other places, we need to distinguish the calls which can be -+ * done by looking at the previous task's state - if still queued or -+ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). -+ * This case is handled here. -+ * -+ * 2. If the prev task is not on SCX, the first following call into SCX -+ * will be .pick_next_task(), which is covered by calling -+ * balance_scx() from pick_next_task_scx(). -+ * -+ * Note that we can't merge the first case into the second as -+ * balance_scx() must be called before the previous SCX task goes -+ * through put_prev_task_scx(). -+ * -+ * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. -+ * Pass in %NULL. -+ */ -+ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) -+ balance_scx(rq, p, NULL); -+#endif -+ -+ update_curr_scx(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); -+ -+ /* -+ * If we're being called from put_prev_task_balance(), balance_scx() may -+ * have decided that @p should keep running. -+ */ -+ if (p->scx.flags & SCX_TASK_BAL_KEEP) { -+ p->scx.flags &= ~SCX_TASK_BAL_KEEP; -+ watchdog_watch_task(rq, p); -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ watchdog_watch_task(rq, p); -+ -+ /* -+ * If @p has slice left and balance_scx() didn't tag it for -+ * keeping, @p is getting preempted by a higher priority -+ * scheduler class or core-sched forcing a different task. Leave -+ * it at the head of the local DSQ. -+ */ -+ if (p->scx.slice && !scx_ops_disabling()) { -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ /* -+ * If we're in the pick_next_task path, balance_scx() should -+ * have already populated the local DSQ if there are any other -+ * available tasks. If empty, tell ops.enqueue() that @p is the -+ * only one available for this cpu. ops.enqueue() should put it -+ * on the local DSQ so that the subsequent pick_next_task_scx() -+ * can find the task unless it wants to trigger a separate -+ * follow-up scheduling event. 
-+		 */
-+		if (list_empty(&rq->scx.local_dsq.fifo))
-+			do_enqueue_task(rq, p, SCX_ENQ_LAST | SCX_ENQ_LOCAL, -1);
-+		else
-+			do_enqueue_task(rq, p, 0, -1);
-+	}
-+}
-+
-+static struct task_struct *first_local_task(struct rq *rq)
-+{
-+	struct rb_node *rb_node;
-+
-+	if (!list_empty(&rq->scx.local_dsq.fifo))
-+		return list_first_entry(&rq->scx.local_dsq.fifo,
-+					struct task_struct, scx.dsq_node.fifo);
-+
-+	rb_node = rb_first_cached(&rq->scx.local_dsq.priq);
-+	if (rb_node)
-+		return container_of(rb_node,
-+				    struct task_struct, scx.dsq_node.priq);
-+
-+	return NULL;
-+}
-+
-+static struct task_struct *pick_next_task_scx(struct rq *rq)
-+{
-+	struct task_struct *p;
-+
-+#ifndef CONFIG_SMP
-+	/* UP workaround - see the comment at the head of put_prev_task_scx() */
-+	if (unlikely(rq->curr->sched_class != &ext_sched_class))
-+		balance_scx(rq, rq->curr, NULL);
-+#endif
-+
-+	p = first_local_task(rq);
-+	if (!p)
-+		return NULL;
-+
-+	if (unlikely(!p->scx.slice)) {
-+		if (!scx_ops_disabling() && !scx_warned_zero_slice) {
-+			printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
-+					p->comm, p->pid);
-+			scx_warned_zero_slice = true;
-+		}
-+		p->scx.slice = SCX_SLICE_DFL;
-+	}
-+
-+	set_next_task_scx(rq, p, true);
-+
-+	return p;
-+}
-+
-+#ifdef CONFIG_SCHED_CORE
-+/**
-+ * scx_prio_less - Task ordering for core-sched
-+ * @a: task A
-+ * @b: task B
-+ *
-+ * Core-sched is implemented as an additional scheduling layer on top of the
-+ * usual sched_class'es and needs to find out the expected task ordering. For
-+ * SCX, core-sched calls this function to interrogate the task ordering.
-+ *
-+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
-+ * to implement the default task ordering. The older the timestamp, the higher
-+ * priority the task - the global FIFO ordering matching the default scheduling
-+ * behavior.
-+ *
-+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
-+ * implement FIFO ordering within each local DSQ. See pick_task_scx().
-+ */
-+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
-+		   bool in_fi)
-+{
-+	/*
-+	 * The const qualifiers are dropped from task_struct pointers when
-+	 * calling ops.core_sched_before(). Accesses are controlled by the
-+	 * verifier.
-+	 */
-+	if (SCX_HAS_OP(core_sched_before) && !scx_ops_disabling())
-+		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
-+					      (struct task_struct *)a,
-+					      (struct task_struct *)b);
-+	else
-+		return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
-+}
-+
-+/**
-+ * pick_task_scx - Pick a candidate task for core-sched
-+ * @rq: rq to pick the candidate task from
-+ *
-+ * Core-sched calls this function on each SMT sibling to determine the next
-+ * tasks to run on the SMT siblings. balance_one() has been called on all
-+ * siblings and put_prev_task_scx() has been called only for the current CPU.
-+ *
-+ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look
-+ * at the first task in the local dsq. @rq->curr has to be considered explicitly
-+ * to mimic %SCX_TASK_BAL_KEEP.
-+ */
-+static struct task_struct *pick_task_scx(struct rq *rq)
-+{
-+	struct task_struct *curr = rq->curr;
-+	struct task_struct *first = first_local_task(rq);
-+
-+	if (curr->scx.flags & SCX_TASK_QUEUED) {
-+		/* is curr the only runnable task? */
-+		if (!first)
-+			return curr;
-+
-+		/*
-+		 * Does curr trump first? We can always go by core_sched_at for
-+		 * this comparison as it represents global FIFO ordering when
-+		 * the default core-sched ordering is used and local-DSQ FIFO
-+		 * ordering otherwise.
-+		 *
-+		 * We can have a task with an earlier timestamp on the DSQ. For
-+		 * example, when a current task is preempted by a sibling
-+		 * picking a different cookie, the task would be requeued at the
-+		 * head of the local DSQ with an earlier timestamp than the
-+		 * core-sched picked next task. Besides, the BPF scheduler may
-+		 * dispatch any tasks to the local DSQ anytime.
-+		 */
-+		if (curr->scx.slice && time_before64(curr->scx.core_sched_at,
-+						     first->scx.core_sched_at))
-+			return curr;
-+	}
-+
-+	return first;	/* this may be %NULL */
-+}
-+#endif /* CONFIG_SCHED_CORE */
-+
-+static enum scx_cpu_preempt_reason
-+preempt_reason_from_class(const struct sched_class *class)
-+{
-+#ifdef CONFIG_SMP
-+	if (class == &stop_sched_class)
-+		return SCX_CPU_PREEMPT_STOP;
-+#endif
-+	if (class == &dl_sched_class)
-+		return SCX_CPU_PREEMPT_DL;
-+	if (class == &rt_sched_class)
-+		return SCX_CPU_PREEMPT_RT;
-+	return SCX_CPU_PREEMPT_UNKNOWN;
-+}
-+
-+void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task,
-+				 const struct sched_class *active)
-+{
-+	lockdep_assert_rq_held(rq);
-+
-+	/*
-+	 * The callback is conceptually meant to convey that the CPU is no
-+	 * longer under the control of SCX. Therefore, don't invoke the
-+	 * callback if the CPU is staying on SCX, or going idle (in which
-+	 * case the SCX scheduler has actively decided not to schedule any
-+	 * tasks on the CPU).
-+	 */
-+	if (likely(active >= &ext_sched_class))
-+		return;
-+
-+	/*
-+	 * At this point we know that SCX was preempted by a higher priority
-+	 * sched_class, so invoke the ->cpu_release() callback if we have not
-+	 * done so already. We only send the callback once between SCX being
-+	 * preempted, and it regaining control of the CPU.
-+	 *
-+	 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
-+	 * next time that balance_scx() is invoked.
-+	 */
-+	if (!rq->scx.cpu_released) {
-+		if (SCX_HAS_OP(cpu_release)) {
-+			struct scx_cpu_release_args args = {
-+				.reason = preempt_reason_from_class(active),
-+				.task = task,
-+			};
-+
-+			SCX_CALL_OP(SCX_KF_CPU_RELEASE,
-+				    cpu_release, cpu_of(rq), &args);
-+		}
-+		rq->scx.cpu_released = true;
-+	}
-+}
-+
-+#ifdef CONFIG_SMP
-+
-+static bool test_and_clear_cpu_idle(int cpu)
-+{
-+#ifdef CONFIG_SCHED_SMT
-+	/*
-+	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
-+	 * cluster is not wholly idle either way. This also prevents
-+	 * scx_pick_idle_cpu() from getting caught in an infinite loop.
-+	 */
-+	if (sched_smt_active()) {
-+		const struct cpumask *smt = cpu_smt_mask(cpu);
-+
-+		/*
-+		 * If offline, @cpu is not its own sibling and
-+		 * scx_pick_idle_cpu() can get caught in an infinite loop as
-+		 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
-+		 * is eventually cleared.
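Both scx_prio_less() and pick_task_scx() above order tasks by core_sched_at with the kernel's wrap-safe 64-bit comparisons, which boil down to a signed comparison of the difference so they keep working across counter wraparound. A small standalone illustration (before64() is a local re-implementation in the spirit of time_before64(), not the kernel macro):

#include <stdio.h>
#include <stdint.h>

/* wrap-safe "a is earlier than b" */
static int before64(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;
	uint64_t wrapped = near_wrap + 10;	/* wraps around to 4 */

	printf("100 before 200:            %d\n", before64(100, 200));
	printf("pre-wrap before post-wrap: %d\n", before64(near_wrap, wrapped));
	printf("post-wrap before pre-wrap: %d\n", before64(wrapped, near_wrap));
	return 0;
}

A plain unsigned comparison would get the wrapped case backwards, which is why the timestamps can serve as a FIFO key indefinitely.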
-+ */ -+ if (cpumask_intersects(smt, idle_masks.smt)) -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ else if (cpumask_test_cpu(cpu, idle_masks.smt)) -+ __cpumask_clear_cpu(cpu, idle_masks.smt); -+ } -+#endif -+ return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -+} -+ -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ int cpu; -+ -+retry: -+ if (sched_smt_active()) { -+ cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ goto found; -+ -+ if (flags & SCX_PICK_IDLE_CORE) -+ return -EBUSY; -+ } -+ -+ cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); -+ if (cpu >= nr_cpu_ids) -+ return -EBUSY; -+ -+found: -+ if (test_and_clear_cpu_idle(cpu)) -+ return cpu; -+ else -+ goto retry; -+} -+ -+static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu; -+ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return prev_cpu; -+ } -+ -+ /* -+ * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the -+ * local DSQ of the waker. -+ */ -+ if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && -+ !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING)) { -+ cpu = smp_processor_id(); -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return cpu; -+ } -+ } -+ -+ if (p->nr_cpus_allowed == 1) -+ return prev_cpu; -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if (sched_smt_active()) { -+ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && -+ test_and_clear_cpu_idle(prev_cpu)) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return prev_cpu; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return cpu; -+ } -+ } -+ -+ if (test_and_clear_cpu_idle(prev_cpu)) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return prev_cpu; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return cpu; -+ } -+ -+ return prev_cpu; -+} -+ -+static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) -+{ -+ if (SCX_HAS_OP(select_cpu)) { -+ s32 cpu; -+ -+ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_REST, select_cpu, p, prev_cpu, -+ wake_flags); -+ if (ops_cpu_valid(cpu)) { -+ return cpu; -+ } else { -+ scx_ops_error("select_cpu returned invalid cpu %d", cpu); -+ return prev_cpu; -+ } -+ } else { -+ return scx_select_cpu_dfl(p, prev_cpu, wake_flags); -+ } -+} -+ -+static void set_cpus_allowed_scx(struct task_struct *p, -+ struct affinity_context *ac) -+{ -+ set_cpus_allowed_common(p, ac); -+ -+ /* -+ * The effective cpumask is stored in @p->cpus_ptr which may temporarily -+ * differ from the configured one in @p->cpus_mask. Always tell the bpf -+ * scheduler the effective one. -+ * -+ * Fine-grained memory write control is enforced by BPF making the const -+ * designation pointless. Cast it away when calling the operation. 
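scx_select_cpu_dfl() above encodes a fixed preference order: the waker's CPU on sync wakeups, a fully idle SMT core (checking prev_cpu first), then any idle CPU (again preferring prev_cpu), and finally prev_cpu as the fallback. A condensed sketch of that ordering for up to 64 CPUs, using plain bitmasks instead of cpumasks and ignoring the per-cpu bookkeeping and locking (all names invented):

#include <stdio.h>
#include <stdint.h>

static int pick_any(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : -1;	/* lowest set bit, -1 if empty */
}

static int select_cpu_sketch(int prev_cpu, uint64_t allowed,
			     uint64_t idle_cpus, uint64_t idle_cores)
{
	uint64_t prev = 1ULL << prev_cpu;
	int cpu;

	if (idle_cores & allowed & prev)	/* prev_cpu's whole core is idle */
		return prev_cpu;
	cpu = pick_any(idle_cores & allowed);	/* any fully idle core */
	if (cpu >= 0)
		return cpu;
	if (idle_cpus & allowed & prev)		/* prev_cpu itself is idle */
		return prev_cpu;
	cpu = pick_any(idle_cpus & allowed);	/* any idle CPU */
	if (cpu >= 0)
		return cpu;
	return prev_cpu;			/* nothing idle, stay put */
}

int main(void)
{
	/* CPUs 0-3 allowed; CPUs 2 and 3 idle, only CPU 3's core fully idle */
	printf("picked CPU %d\n", select_cpu_sketch(0, 0xf, 0xc, 0x8));
	return 0;
}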
-+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void reset_idle_masks(void) -+{ -+ /* consider all cpus idle, should converge to the actual state quickly */ -+ cpumask_setall(idle_masks.cpu); -+ cpumask_setall(idle_masks.smt); -+} -+ -+void __scx_update_idle(struct rq *rq, bool idle) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (SCX_HAS_OP(update_idle)) { -+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); -+ if (!static_branch_unlikely(&scx_builtin_idle_enabled)) -+ return; -+ } -+ -+ if (idle) -+ cpumask_set_cpu(cpu, idle_masks.cpu); -+ else -+ cpumask_clear_cpu(cpu, idle_masks.cpu); -+ -+#ifdef CONFIG_SCHED_SMT -+ if (sched_smt_active()) { -+ const struct cpumask *smt = cpu_smt_mask(cpu); -+ -+ if (idle) { -+ /* -+ * idle_masks.smt handling is racy but that's fine as -+ * it's only for optimization and self-correcting. -+ */ -+ for_each_cpu(cpu, smt) { -+ if (!cpumask_test_cpu(cpu, idle_masks.cpu)) -+ return; -+ } -+ cpumask_or(idle_masks.smt, idle_masks.smt, smt); -+ } else { -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ } -+ } -+#endif -+} -+ -+static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) -+{ -+ if (SCX_HAS_OP(cpu_online) && reason == RQ_ONOFF_HOTPLUG) -+ SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu_of(rq)); -+} -+ -+static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) -+{ -+ if (SCX_HAS_OP(cpu_offline) && reason == RQ_ONOFF_HOTPLUG) -+ SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu_of(rq)); -+} -+ -+#else /* !CONFIG_SMP */ -+ -+static bool test_and_clear_cpu_idle(int cpu) { return false; } -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -+static void reset_idle_masks(void) {} -+ -+#endif /* CONFIG_SMP */ -+ -+static bool check_rq_for_timeouts(struct rq *rq) -+{ -+ struct task_struct *p; -+ struct rq_flags rf; -+ bool timed_out = false; -+ -+ rq_lock_irqsave(rq, &rf); -+ list_for_each_entry(p, &rq->scx.watchdog_list, scx.watchdog_node) { -+ unsigned long last_runnable = p->scx.runnable_at; -+ -+ if (unlikely(time_after(jiffies, -+ last_runnable + scx_watchdog_timeout))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "%s[%d] failed to run for %u.%03us", -+ p->comm, p->pid, -+ dur_ms / 1000, dur_ms % 1000); -+ timed_out = true; -+ break; -+ } -+ } -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return timed_out; -+} -+ -+static void scx_watchdog_workfn(struct work_struct *work) -+{ -+ int cpu; -+ -+ scx_watchdog_timestamp = jiffies; -+ -+ for_each_online_cpu(cpu) { -+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) -+ break; -+ -+ cond_resched(); -+ } -+ queue_delayed_work(system_unbound_wq, to_delayed_work(work), -+ scx_watchdog_timeout / 2); -+} -+ -+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) -+{ -+ update_curr_scx(rq); -+ -+ /* -+ * While disabling, always resched and refresh core-sched timestamp as -+ * we can't trust the slice management or ops.core_sched_before(). -+ */ -+ if (scx_ops_disabling()) { -+ curr->scx.slice = 0; -+ touch_core_sched(rq, curr); -+ } -+ -+ if (!curr->scx.slice) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static struct cgroup *tg_cgrp(struct task_group *tg) -+{ -+ /* -+ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, -+ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the -+ * root cgroup. 
-+ */ -+ if (tg && tg->css.cgroup) -+ return tg->css.cgroup; -+ else -+ return &cgrp_dfl_root.cgrp; -+} -+ -+#define SCX_ENABLE_ARGS_INIT_CGROUP(tg) .cgroup = tg_cgrp(tg), -+ -+#else /* CONFIG_EXT_GROUP_SCHED */ -+ -+#define SCX_ENABLE_ARGS_INIT_CGROUP(tg) -+ -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) -+{ -+ int ret; -+ -+ WARN_ON_ONCE(p->scx.flags & SCX_TASK_OPS_PREPPED); -+ -+ p->scx.disallow = false; -+ -+ if (SCX_HAS_OP(prep_enable)) { -+ struct scx_enable_args args = { -+ SCX_ENABLE_ARGS_INIT_CGROUP(tg) -+ }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, prep_enable, p, &args); -+ if (unlikely(ret)) { -+ ret = ops_sanitize_err("prep_enable", ret); -+ return ret; -+ } -+ } -+ -+ if (p->scx.disallow) { -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ rq = task_rq_lock(p, &rf); -+ -+ /* -+ * We're either in fork or load path and @p->policy will be -+ * applied right after. Reverting @p->policy here and rejecting -+ * %SCHED_EXT transitions from scx_check_setscheduler() -+ * guarantees that if ops.prep_enable() sets @p->disallow, @p -+ * can never be in SCX. -+ */ -+ if (p->policy == SCHED_EXT) { -+ p->policy = SCHED_NORMAL; -+ atomic_long_inc(&scx_nr_rejected); -+ } -+ -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET); -+ return 0; -+} -+ -+static void scx_ops_enable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_OPS_PREPPED)); -+ -+ if (SCX_HAS_OP(enable)) { -+ struct scx_enable_args args = { -+ SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) -+ }; -+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, p, &args); -+ } -+ p->scx.flags &= ~SCX_TASK_OPS_PREPPED; -+ p->scx.flags |= SCX_TASK_OPS_ENABLED; -+} -+ -+static void scx_ops_disable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ if (p->scx.flags & SCX_TASK_OPS_PREPPED) { -+ if (SCX_HAS_OP(cancel_enable)) { -+ struct scx_enable_args args = { -+ SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) -+ }; -+ SCX_CALL_OP(SCX_KF_REST, cancel_enable, p, &args); -+ } -+ p->scx.flags &= ~SCX_TASK_OPS_PREPPED; -+ } else if (p->scx.flags & SCX_TASK_OPS_ENABLED) { -+ if (SCX_HAS_OP(disable)) -+ SCX_CALL_OP(SCX_KF_REST, disable, p); -+ p->scx.flags &= ~SCX_TASK_OPS_ENABLED; -+ } -+} -+ -+static void set_task_scx_weight(struct task_struct *p) -+{ -+ u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; -+ -+ p->scx.weight = sched_weight_to_cgroup(weight); -+} -+ -+/** -+ * refresh_scx_weight - Refresh a task's ext weight -+ * @p: task to refresh ext weight for -+ * -+ * @p->scx.weight carries the task's static priority in cgroup weight scale to -+ * enable easy access from the BPF scheduler. To keep it synchronized with the -+ * current task priority, this function should be called when a new task is -+ * created, priority is changed for a task on sched_ext, and a task is switched -+ * to sched_ext from other classes. -+ */ -+static void refresh_scx_weight(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ set_task_scx_weight(p); -+ if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); -+} -+ -+void scx_pre_fork(struct task_struct *p) -+{ -+ /* -+ * BPF scheduler enable/disable paths want to be able to iterate and -+ * update all tasks which can become complex when racing forks. As -+ * enable/disable are very cold paths, let's use a percpu_rwsem to -+ * exclude forks. 
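set_task_scx_weight() above maps the task's static priority through the CFS weight table and then rescales it so that BPF schedulers always see weights on the cgroup scale, where nice 0 corresponds to the default cgroup weight of 100. A back-of-the-envelope sketch under the assumption that the rescaling is a simple rounded proportion of weight 1024 to 100; the exact kernel helper may round or clamp differently:

#include <stdio.h>

#define CGROUP_WEIGHT_DFL	100
#define NICE_0_WEIGHT		1024

/* assumed proportional mapping: the nice-0 weight maps to the cgroup default */
static unsigned long weight_to_cgroup(unsigned long weight)
{
	return (weight * CGROUP_WEIGHT_DFL + NICE_0_WEIGHT / 2) / NICE_0_WEIGHT;
}

int main(void)
{
	/* a few entries from the CFS nice->weight table */
	printf("nice -20 (weight 88761) -> %lu\n", weight_to_cgroup(88761));
	printf("nice   0 (weight  1024) -> %lu\n", weight_to_cgroup(1024));
	printf("nice  19 (weight    15) -> %lu\n", weight_to_cgroup(15));
	return 0;
}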
-+ */ -+ percpu_down_read(&scx_fork_rwsem); -+} -+ -+int scx_fork(struct task_struct *p) -+{ -+ percpu_rwsem_assert_held(&scx_fork_rwsem); -+ -+ if (scx_enabled()) -+ return scx_ops_prepare_task(p, task_group(p)); -+ else -+ return 0; -+} -+ -+void scx_post_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ /* -+ * Set the weight manually before calling ops.enable() so that -+ * the scheduler doesn't see a stale value if they inspect the -+ * task struct. We'll invoke ops.set_weight() afterwards, as it -+ * would be odd to receive a callback on the task before we -+ * tell the scheduler that it's been fully enabled. -+ */ -+ set_task_scx_weight(p); -+ scx_ops_enable_task(p); -+ refresh_scx_weight(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ spin_lock_irq(&scx_tasks_lock); -+ list_add_tail(&p->scx.tasks_node, &scx_tasks); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void scx_cancel_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) -+ scx_ops_disable_task(p); -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void sched_ext_free(struct task_struct *p) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&scx_tasks_lock, flags); -+ list_del_init(&p->scx.tasks_node); -+ spin_unlock_irqrestore(&scx_tasks_lock, flags); -+ -+ /* -+ * @p is off scx_tasks and wholly ours. scx_ops_enable()'s PREPPED -> -+ * ENABLED transitions can't race us. Disable ops for @p. -+ */ -+ if (p->scx.flags & (SCX_TASK_OPS_PREPPED | SCX_TASK_OPS_ENABLED)) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ scx_ops_disable_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+} -+ -+static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) -+{ -+ refresh_scx_weight(p); -+} -+ -+static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) -+{ -+} -+ -+static void switching_to_scx(struct rq *rq, struct task_struct *p) -+{ -+ refresh_scx_weight(p); -+ -+ /* -+ * set_cpus_allowed_scx() is not called while @p is associated with a -+ * different scheduler class. Keep the BPF scheduler up-to-date. -+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} -+static void switched_to_scx(struct rq *rq, struct task_struct *p) {} -+ -+int scx_check_setscheduler(struct task_struct *p, int policy) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ /* if disallow, reject transitioning into SCX */ -+ if (scx_enabled() && READ_ONCE(p->scx.disallow) && -+ p->policy != policy && policy == SCHED_EXT) -+ return -EACCES; -+ -+ return 0; -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+bool scx_can_stop_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (scx_ops_disabling()) -+ return false; -+ -+ if (p->sched_class != &ext_sched_class) -+ return true; -+ -+ /* -+ * @rq can dispatch from different DSQs, so we can't tell whether it -+ * needs the tick or not by looking at nr_running. Allow stopping ticks -+ * iff the BPF scheduler indicated so. See set_next_task_scx(). 
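The fork hooks above bracket every fork with a read-side hold of scx_fork_rwsem so that the cold enable/disable paths can take the same lock for writing and walk a stable task list. Reduced to its shape, the bracket looks like the following userspace sketch (a pthread rwlock standing in for the percpu_rwsem; names are illustrative):

#include <pthread.h>

static pthread_rwlock_t fork_lock = PTHREAD_RWLOCK_INITIALIZER;

/* fork side: readers, held across the whole fork */
static void pre_fork(void)    { pthread_rwlock_rdlock(&fork_lock); }
static void post_fork(void)   { pthread_rwlock_unlock(&fork_lock); }
static void cancel_fork(void) { pthread_rwlock_unlock(&fork_lock); }

/* enable/disable side: a writer excludes every in-flight fork */
static void lock_out_forks(void) { pthread_rwlock_wrlock(&fork_lock); }
static void allow_forks(void)    { pthread_rwlock_unlock(&fork_lock); }

The percpu flavor keeps the read side nearly free, which matters because forks are hot while enable/disable is rare.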
-+ */ -+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; -+} -+#endif -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ -+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); -+ -+int scx_tg_online(struct task_group *tg) -+{ -+ int ret = 0; -+ -+ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_init)) { -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, -+ tg->css.cgroup, &args); -+ if (!ret) -+ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; -+ else -+ ret = ops_sanitize_err("cgroup_init", ret); -+ } else { -+ tg->scx_flags |= SCX_TG_ONLINE; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ret; -+} -+ -+void scx_tg_offline(struct task_group *tg) -+{ -+ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_exit, tg->css.cgroup); -+ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+int scx_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *p; -+ int ret; -+ -+ /* released in scx_finish/cancel_attach() */ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (!scx_enabled()) -+ return 0; -+ -+ cgroup_taskset_for_each(p, css, tset) { -+ struct cgroup *from = tg_cgrp(task_group(p)); -+ -+ if (SCX_HAS_OP(cgroup_prep_move)) { -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_prep_move, -+ p, from, css->cgroup); -+ if (ret) -+ goto err; -+ } -+ -+ WARN_ON_ONCE(p->scx.cgrp_moving_from); -+ p->scx.cgrp_moving_from = from; -+ } -+ -+ return 0; -+ -+err: -+ cgroup_taskset_for_each(p, css, tset) { -+ if (!p->scx.cgrp_moving_from) -+ break; -+ if (SCX_HAS_OP(cgroup_cancel_move)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, -+ p->scx.cgrp_moving_from, css->cgroup); -+ p->scx.cgrp_moving_from = NULL; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ops_sanitize_err("cgroup_prep_move", ret); -+} -+ -+void scx_move_task(struct task_struct *p) -+{ -+ /* -+ * We're called from sched_move_task() which handles both cgroup and -+ * autogroup moves. Ignore the latter. 
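scx_cgroup_can_attach() above is a prepare/commit/rollback protocol: ops.cgroup_prep_move() runs for every task in the set, and a failure unwinds only the tasks that were already prepared via ops.cgroup_cancel_move(). The unwind shape, stripped of the cgroup specifics (illustrative sketch, not the kernel code):

#include <stddef.h>

struct task { int prepared; };

static int  prep_move(struct task *t)   { t->prepared = 1; return 0; /* or -errno */ }
static void cancel_move(struct task *t) { t->prepared = 0; }

static int can_attach(struct task *tasks, size_t n)
{
	size_t i;
	int ret = 0;

	for (i = 0; i < n; i++) {
		ret = prep_move(&tasks[i]);
		if (ret)
			goto unwind;
	}
	return 0;

unwind:
	/* roll back only the tasks that were actually prepared */
	while (i--)
		cancel_move(&tasks[i]);
	return ret;
}

scx_move_task() then plays the commit role by consuming cgrp_moving_from, while scx_cgroup_cancel_attach() is the same unwind run over the whole set.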
-+ */
-+ if (task_group_is_autogroup(task_group(p)))
-+ return;
-+
-+ if (!scx_enabled())
-+ return;
-+
-+ if (SCX_HAS_OP(cgroup_move)) {
-+ WARN_ON_ONCE(!p->scx.cgrp_moving_from);
-+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
-+ p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
-+ }
-+ p->scx.cgrp_moving_from = NULL;
-+}
-+
-+void scx_cgroup_finish_attach(void)
-+{
-+ percpu_up_read(&scx_cgroup_rwsem);
-+}
-+
-+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
-+{
-+ struct cgroup_subsys_state *css;
-+ struct task_struct *p;
-+
-+ if (!scx_enabled())
-+ goto out_unlock;
-+
-+ cgroup_taskset_for_each(p, css, tset) {
-+ if (SCX_HAS_OP(cgroup_cancel_move)) {
-+ WARN_ON_ONCE(!p->scx.cgrp_moving_from);
-+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p,
-+ p->scx.cgrp_moving_from, css->cgroup);
-+ }
-+ p->scx.cgrp_moving_from = NULL;
-+ }
-+out_unlock:
-+ percpu_up_read(&scx_cgroup_rwsem);
-+}
-+
-+void scx_group_set_weight(struct task_group *tg, unsigned long weight)
-+{
-+ percpu_down_read(&scx_cgroup_rwsem);
-+
-+ if (tg->scx_weight != weight) {
-+ if (SCX_HAS_OP(cgroup_set_weight))
-+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_set_weight,
-+ tg_cgrp(tg), weight);
-+ tg->scx_weight = weight;
-+ }
-+
-+ percpu_up_read(&scx_cgroup_rwsem);
-+}
-+
-+static void scx_cgroup_lock(void)
-+{
-+ percpu_down_write(&scx_cgroup_rwsem);
-+}
-+
-+static void scx_cgroup_unlock(void)
-+{
-+ percpu_up_write(&scx_cgroup_rwsem);
-+}
-+
-+#else /* CONFIG_EXT_GROUP_SCHED */
-+
-+static inline void scx_cgroup_lock(void) {}
-+static inline void scx_cgroup_unlock(void) {}
-+
-+#endif /* CONFIG_EXT_GROUP_SCHED */
-+
-+/*
-+ * Omitted operations:
-+ *
-+ * - check_preempt_curr: NOOP as it isn't useful in the wakeup path because the
-+ * task isn't tied to the CPU at that point. Preemption is implemented by
-+ * resetting the victim task's slice to 0 and triggering reschedule on the
-+ * target CPU.
-+ *
-+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
-+ *
-+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
-+ * their current sched_class. Call them directly from sched core instead.
-+ *
-+ * - task_woken, switched_from: Unnecessary.
-+ */ -+DEFINE_SCHED_CLASS(ext) = { -+ .enqueue_task = enqueue_task_scx, -+ .dequeue_task = dequeue_task_scx, -+ .yield_task = yield_task_scx, -+ .yield_to_task = yield_to_task_scx, -+ -+ .check_preempt_curr = check_preempt_curr_scx, -+ -+ .pick_next_task = pick_next_task_scx, -+ -+ .put_prev_task = put_prev_task_scx, -+ .set_next_task = set_next_task_scx, -+ -+#ifdef CONFIG_SMP -+ .balance = balance_scx, -+ .select_task_rq = select_task_rq_scx, -+ .set_cpus_allowed = set_cpus_allowed_scx, -+ -+ .rq_online = rq_online_scx, -+ .rq_offline = rq_offline_scx, -+#endif -+ -+#ifdef CONFIG_SCHED_CORE -+ .pick_task = pick_task_scx, -+#endif -+ -+ .task_tick = task_tick_scx, -+ -+ .switching_to = switching_to_scx, -+ .switched_to = switched_to_scx, -+ .reweight_task = reweight_task_scx, -+ .prio_changed = prio_changed_scx, -+ -+ .update_curr = update_curr_scx, -+ -+#ifdef CONFIG_UCLAMP_TASK -+ .uclamp_enabled = 0, -+#endif -+}; -+ -+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) -+{ -+ memset(dsq, 0, sizeof(*dsq)); -+ -+ raw_spin_lock_init(&dsq->lock); -+ INIT_LIST_HEAD(&dsq->fifo); -+ dsq->id = dsq_id; -+} -+ -+static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -+{ -+ struct scx_dispatch_q *dsq; -+ int ret; -+ -+ if (dsq_id & SCX_DSQ_FLAG_BUILTIN) -+ return ERR_PTR(-EINVAL); -+ -+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); -+ if (!dsq) -+ return ERR_PTR(-ENOMEM); -+ -+ init_dsq(dsq, dsq_id); -+ -+ ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, -+ dsq_hash_params); -+ if (ret) { -+ kfree(dsq); -+ return ERR_PTR(ret); -+ } -+ return dsq; -+} -+ -+static void free_dsq_irq_workfn(struct irq_work *irq_work) -+{ -+ struct llist_node *to_free = llist_del_all(&dsqs_to_free); -+ struct scx_dispatch_q *dsq, *tmp_dsq; -+ -+ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) -+ kfree_rcu(dsq, rcu); -+} -+ -+static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); -+ -+static void destroy_dsq(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ dsq = rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -+ if (!dsq) -+ goto out_unlock_rcu; -+ -+ raw_spin_lock_irqsave(&dsq->lock, flags); -+ -+ if (dsq->nr) { -+ scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", -+ dsq->id, dsq->nr); -+ goto out_unlock_dsq; -+ } -+ -+ if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) -+ goto out_unlock_dsq; -+ -+ /* -+ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from -+ * queueing more tasks. As this function can be called from anywhere, -+ * freeing is bounced through an irq work to avoid nesting RCU -+ * operations inside scheduler locks. -+ */ -+ dsq->id = SCX_DSQ_INVALID; -+ llist_add(&dsq->free_node, &dsqs_to_free); -+ irq_work_queue(&free_dsq_irq_work); -+ -+out_unlock_dsq: -+ raw_spin_unlock_irqrestore(&dsq->lock, flags); -+out_unlock_rcu: -+ rcu_read_unlock(); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void scx_cgroup_exit(void) -+{ -+ struct cgroup_subsys_state *css; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ /* -+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk -+ * cgroups and exit all the inited ones, all online cgroups are exited. 
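destroy_dsq() above deliberately does not free the DSQ inline: it may be running under scheduler locks, so it only invalidates the ID, pushes the DSQ onto a lock-free llist and lets an irq_work perform the kfree_rcu(). The "mark dead, defer the free" pattern looks roughly like this userspace model built on C11 atomics (illustrative only):

#include <stdatomic.h>
#include <stdlib.h>

struct dsq {
	unsigned long long id;
	struct dsq *free_next;              /* models the llist free_node */
};

static _Atomic(struct dsq *) to_free_head;

/* caller may hold locks that forbid freeing right here */
static void defer_destroy(struct dsq *dsq)
{
	struct dsq *old = atomic_load(&to_free_head);

	dsq->id = ~0ULL;                    /* mark dead: nothing may enqueue to it */
	do {
		dsq->free_next = old;
	} while (!atomic_compare_exchange_weak(&to_free_head, &old, dsq));
	/* the kernel version now queues an irq_work to run the reclaim step */
}

/* runs later from a context where freeing is safe */
static void reclaim(void)
{
	struct dsq *node = atomic_exchange(&to_free_head, NULL);

	while (node) {
		struct dsq *next = node->free_next;

		free(node);                 /* the kernel uses kfree_rcu() instead */
		node = next;
	}
}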
-+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_post(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ -+ if (!(tg->scx_flags & SCX_TG_INITED)) -+ continue; -+ tg->scx_flags &= ~SCX_TG_INITED; -+ -+ if (!scx_ops.cgroup_exit) -+ continue; -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+} -+ -+static int scx_cgroup_init(void) -+{ -+ struct cgroup_subsys_state *css; -+ int ret; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ /* -+ * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk -+ * cgroups and init, all online cgroups are initialized. -+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_pre(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ if ((tg->scx_flags & -+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) -+ continue; -+ -+ if (!scx_ops.cgroup_init) { -+ tg->scx_flags |= SCX_TG_INITED; -+ continue; -+ } -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, -+ css->cgroup, &args); -+ if (ret) { -+ css_put(css); -+ return ret; -+ } -+ tg->scx_flags |= SCX_TG_INITED; -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+ -+ return 0; -+} -+ -+static void scx_cgroup_config_knobs(void) -+{ -+ static DEFINE_MUTEX(cgintf_mutex); -+ DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; -+ u64 knob_flags; -+ int i; -+ -+ /* -+ * Called from both class switch and ops enable/disable paths, -+ * synchronize internally. -+ */ -+ mutex_lock(&cgintf_mutex); -+ -+ /* if fair is in use, all knobs should be shown */ -+ if (!scx_switched_all()) { -+ bitmap_fill(mask, CPU_CFTYPE_CNT); -+ goto apply; -+ } -+ -+ /* -+ * On ext, only show the supported knobs. Otherwise, show all possible -+ * knobs so that configuration attempts succeed and the states are -+ * remembered while ops is not loaded. -+ */ -+ if (scx_enabled()) -+ knob_flags = scx_ops.flags; -+ else -+ knob_flags = SCX_OPS_ALL_FLAGS; -+ -+ if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) { -+ __set_bit(CPU_CFTYPE_WEIGHT, mask); -+ __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask); -+ } -+apply: -+ for (i = 0; i < CPU_CFTYPE_CNT; i++) -+ cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask)); -+ -+ mutex_unlock(&cgintf_mutex); -+} -+ -+#else -+static void scx_cgroup_exit(void) {} -+static int scx_cgroup_init(void) { return 0; } -+static void scx_cgroup_config_knobs(void) {} -+#endif -+ -+/* -+ * Used by sched_fork() and __setscheduler_prio() to pick the matching -+ * sched_class. dl/rt are already handled. 
-+ */ -+bool task_should_scx(struct task_struct *p) -+{ -+ if (!scx_enabled() || scx_ops_disabling()) -+ return false; -+ if (READ_ONCE(scx_switching_all)) -+ return true; -+ return p->policy == SCHED_EXT; -+} -+ -+static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) -+{ -+ if (enq_flags & SCX_ENQ_LAST) -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ else -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} -+ -+static void scx_ops_disable_workfn(struct kthread_work *work) -+{ -+ struct scx_exit_info *ei = &scx_exit_info; -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ struct rhashtable_iter rht_iter; -+ struct scx_dispatch_q *dsq; -+ const char *reason; -+ int i, cpu, kind; -+ -+ kind = atomic_read(&scx_exit_kind); -+ while (true) { -+ /* -+ * NONE indicates that a new scx_ops has been registered since -+ * disable was scheduled - don't kill the new ops. DONE -+ * indicates that the ops has already been disabled. -+ */ -+ if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) -+ return; -+ if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) -+ break; -+ } -+ -+ cancel_delayed_work_sync(&scx_watchdog_work); -+ -+ switch (kind) { -+ case SCX_EXIT_UNREG: -+ reason = "BPF scheduler unregistered"; -+ break; -+ case SCX_EXIT_SYSRQ: -+ reason = "disabled by sysrq-S"; -+ break; -+ case SCX_EXIT_ERROR: -+ reason = "runtime error"; -+ break; -+ case SCX_EXIT_ERROR_BPF: -+ reason = "scx_bpf_error"; -+ break; -+ case SCX_EXIT_ERROR_STALL: -+ reason = "runnable task stall"; -+ break; -+ default: -+ reason = ""; -+ } -+ -+ ei->kind = kind; -+ strlcpy(ei->reason, reason, sizeof(ei->reason)); -+ -+ switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { -+ case SCX_OPS_DISABLED: -+ pr_warn("sched_ext: ops error detected without ops (%s)\n", -+ scx_exit_info.msg); -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ return; -+ case SCX_OPS_PREPPING: -+ goto forward_progress_guaranteed; -+ case SCX_OPS_DISABLING: -+ /* shouldn't happen but handle it like ENABLING if it does */ -+ WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); -+ fallthrough; -+ case SCX_OPS_ENABLING: -+ case SCX_OPS_ENABLED: -+ break; -+ } -+ -+ /* -+ * DISABLING is set and ops was either ENABLING or ENABLED indicating -+ * that the ops and static branches are set. -+ * -+ * We must guarantee that all runnable tasks make forward progress -+ * without trusting the BPF scheduler. We can't grab any mutexes or -+ * rwsems as they might be held by tasks that the BPF scheduler is -+ * forgetting to run, which unfortunately also excludes toggling the -+ * static branches. -+ * -+ * Let's work around by overriding a couple ops and modifying behaviors -+ * based on the DISABLING state and then cycling the tasks through -+ * dequeue/enqueue to force global FIFO scheduling. -+ * -+ * a. ops.enqueue() and .dispatch() are overridden for simple global -+ * FIFO scheduling. -+ * -+ * b. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value -+ * can't be trusted. Whenever a tick triggers, the running task is -+ * rotated to the tail of the queue with core_sched_at touched. -+ * -+ * c. pick_next_task() suppresses zero slice warning. -+ * -+ * d. scx_prio_less() reverts to the default core_sched_at order. 
-+ */ -+ scx_ops.enqueue = scx_ops_fallback_enqueue; -+ scx_ops.dispatch = scx_ops_fallback_dispatch; -+ -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_filtered_locked(&sti))) { -+ if (READ_ONCE(p->__state) != TASK_DEAD) { -+ struct sched_enq_and_set_ctx ctx; -+ -+ /* cycling deq/enq is enough, see above */ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ sched_enq_and_set_task(&ctx); -+ } -+ } -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ /* kick all CPUs to restore ticks */ -+ for_each_possible_cpu(cpu) -+ resched_cpu(cpu); -+ -+forward_progress_guaranteed: -+ /* -+ * Here, every runnable task is guaranteed to make forward progress and -+ * we can safely use blocking synchronization constructs. Actually -+ * disable ops. -+ */ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ static_branch_disable(&__scx_switched_all); -+ WRITE_ONCE(scx_switching_all, false); -+ -+ /* avoid racing against fork and cgroup changes */ -+ cpus_read_lock(); -+ percpu_down_write(&scx_fork_rwsem); -+ scx_cgroup_lock(); -+ -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_filtered_locked(&sti))) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ bool alive = READ_ONCE(p->__state) != TASK_DEAD; -+ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ -+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); -+ -+ __setscheduler_prio(p, p->prio); -+ if (alive) -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ if (alive) -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ -+ scx_ops_disable_task(p); -+ } -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ /* no task is on scx, turn off all the switches and flush in-progress calls */ -+ static_branch_disable_cpuslocked(&__scx_ops_enabled); -+ for (i = 0; i < SCX_NR_ONLINE_OPS; i++) -+ static_branch_disable_cpuslocked(&scx_has_op[i]); -+ static_branch_disable_cpuslocked(&scx_ops_enq_last); -+ static_branch_disable_cpuslocked(&scx_ops_enq_exiting); -+ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); -+ synchronize_rcu(); -+ -+ scx_cgroup_exit(); -+ -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ cpus_read_unlock(); -+ -+ if (ei->kind >= SCX_EXIT_ERROR) { -+ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); -+ -+ if (ei->msg[0] == '\0') -+ printk(KERN_ERR "sched_ext: %s\n", ei->reason); -+ else -+ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); -+ -+ stack_trace_print(ei->bt, ei->bt_len, 2); -+ } -+ -+ if (scx_ops.exit) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); -+ -+ memset(&scx_ops, 0, sizeof(scx_ops)); -+ -+ rhashtable_walk_enter(&dsq_hash, &rht_iter); -+ do { -+ rhashtable_walk_start(&rht_iter); -+ -+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) -+ destroy_dsq(dsq->id); -+ -+ rhashtable_walk_stop(&rht_iter); -+ } while (dsq == ERR_PTR(-EAGAIN)); -+ rhashtable_walk_exit(&rht_iter); -+ -+ free_percpu(scx_dsp_buf); -+ scx_dsp_buf = NULL; -+ scx_dsp_max_batch = 0; -+ -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ -+ scx_cgroup_config_knobs(); -+} -+ -+static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); -+ -+static void 
schedule_scx_ops_disable_work(void) -+{ -+ struct kthread_worker *helper = READ_ONCE(scx_ops_helper); -+ -+ /* -+ * We may be called spuriously before the first bpf_sched_ext_reg(). If -+ * scx_ops_helper isn't set up yet, there's nothing to do. -+ */ -+ if (helper) -+ kthread_queue_work(helper, &scx_ops_disable_work); -+} -+ -+static void scx_ops_disable(enum scx_exit_kind kind) -+{ -+ int none = SCX_EXIT_NONE; -+ -+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) -+ kind = SCX_EXIT_ERROR; -+ -+ atomic_try_cmpxchg(&scx_exit_kind, &none, kind); -+ -+ schedule_scx_ops_disable_work(); -+} -+ -+static void scx_ops_error_irq_workfn(struct irq_work *irq_work) -+{ -+ schedule_scx_ops_disable_work(); -+} -+ -+static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); -+ -+__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, -+ const char *fmt, ...) -+{ -+ struct scx_exit_info *ei = &scx_exit_info; -+ int none = SCX_EXIT_NONE; -+ va_list args; -+ -+ if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) -+ return; -+ -+ ei->bt_len = stack_trace_save(ei->bt, ARRAY_SIZE(ei->bt), 1); -+ -+ va_start(args, fmt); -+ vscnprintf(ei->msg, ARRAY_SIZE(ei->msg), fmt, args); -+ va_end(args); -+ -+ irq_work_queue(&scx_ops_error_irq_work); -+} -+ -+static struct kthread_worker *scx_create_rt_helper(const char *name) -+{ -+ struct kthread_worker *helper; -+ -+ helper = kthread_create_worker(0, name); -+ if (helper) -+ sched_set_fifo(helper->task); -+ return helper; -+} -+ -+static int scx_ops_enable(struct sched_ext_ops *ops) -+{ -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ int i, ret; -+ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ if (!scx_ops_helper) { -+ WRITE_ONCE(scx_ops_helper, -+ scx_create_rt_helper("sched_ext_ops_helper")); -+ if (!scx_ops_helper) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ } -+ -+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) { -+ ret = -EBUSY; -+ goto err_unlock; -+ } -+ -+ /* -+ * Set scx_ops, transition to PREPPING and clear exit info to arm the -+ * disable path. Failure triggers full disabling from here on. -+ */ -+ scx_ops = *ops; -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != -+ SCX_OPS_DISABLED); -+ -+ memset(&scx_exit_info, 0, sizeof(scx_exit_info)); -+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE); -+ scx_warned_zero_slice = false; -+ -+ atomic_long_set(&scx_nr_rejected, 0); -+ -+ /* -+ * Keep CPUs stable during enable so that the BPF scheduler can track -+ * online CPUs by watching ->on/offline_cpu() after ->init(). -+ */ -+ cpus_read_lock(); -+ -+ scx_switch_all_req = false; -+ if (scx_ops.init) { -+ ret = SCX_CALL_OP_RET(SCX_KF_INIT, init); -+ if (ret) { -+ ret = ops_sanitize_err("init", ret); -+ goto err_disable; -+ } -+ -+ /* -+ * Exit early if ops.init() triggered scx_bpf_error(). Not -+ * strictly necessary as we'll fail transitioning into ENABLING -+ * later but that'd be after calling ops.prep_enable() on all -+ * tasks and with -EBUSY which isn't very intuitive. Let's exit -+ * early with success so that the condition is notified through -+ * ops.exit() like other scx_bpf_error() invocations. 
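scx_ops_disable() and scx_ops_error_kind() above both try to latch scx_exit_kind with a compare-and-swap from SCX_EXIT_NONE, so the first reported reason wins and later reports become no-ops; the disable worker then claims that reason exactly once by swapping in SCX_EXIT_DONE. The same handshake modeled with C11 atomics (illustrative names, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>

enum exit_kind { EXIT_NONE, EXIT_UNREG, EXIT_ERROR, EXIT_DONE };

static _Atomic int exit_kind = EXIT_NONE;

/* callable from anywhere, any number of times: only the first reason sticks */
static bool record_exit(enum exit_kind kind)
{
	int none = EXIT_NONE;

	return atomic_compare_exchange_strong(&exit_kind, &none, kind);
}

/* the single disable worker: returns the reason it owns, or EXIT_NONE */
static int claim_exit(void)
{
	int kind = atomic_load(&exit_kind);

	while (kind != EXIT_NONE && kind != EXIT_DONE) {
		if (atomic_compare_exchange_strong(&exit_kind, &kind, EXIT_DONE))
			return kind;
	}
	return EXIT_NONE;   /* nothing recorded, or teardown already done */
}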
-+ */ -+ if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE) -+ goto err_disable; -+ } -+ -+ WARN_ON_ONCE(scx_dsp_buf); -+ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; -+ scx_dsp_buf = __alloc_percpu(sizeof(scx_dsp_buf[0]) * scx_dsp_max_batch, -+ __alignof__(scx_dsp_buf[0])); -+ if (!scx_dsp_buf) { -+ ret = -ENOMEM; -+ goto err_disable; -+ } -+ -+ scx_watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; -+ if (ops->timeout_ms) -+ scx_watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); -+ -+ scx_watchdog_timestamp = jiffies; -+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work, -+ scx_watchdog_timeout / 2); -+ -+ /* -+ * Lock out forks, cgroup on/offlining and moves before opening the -+ * floodgate so that they don't wander into the operations prematurely. -+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ scx_cgroup_lock(); -+ -+ for (i = 0; i < SCX_NR_ONLINE_OPS; i++) -+ if (((void (**)(void))ops)[i]) -+ static_branch_enable_cpuslocked(&scx_has_op[i]); -+ -+ if (ops->flags & SCX_OPS_ENQ_LAST) -+ static_branch_enable_cpuslocked(&scx_ops_enq_last); -+ -+ if (ops->flags & SCX_OPS_ENQ_EXITING) -+ static_branch_enable_cpuslocked(&scx_ops_enq_exiting); -+ if (scx_ops.cpu_acquire || scx_ops.cpu_release) -+ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); -+ -+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { -+ reset_idle_masks(); -+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); -+ } else { -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); -+ } -+ -+ /* -+ * All cgroups should be initialized before letting in tasks. cgroup -+ * on/offlining and task migrations are already locked out. -+ */ -+ ret = scx_cgroup_init(); -+ if (ret) -+ goto err_disable_unlock; -+ -+ static_branch_enable_cpuslocked(&__scx_ops_enabled); -+ -+ /* -+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem -+ * preventing new tasks from being added. No need to exclude tasks -+ * leaving as sched_ext_free() can handle both prepped and enabled -+ * tasks. Prep all tasks first and then enable them with preemption -+ * disabled. -+ */ -+ spin_lock_irq(&scx_tasks_lock); -+ -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_filtered(&sti))) { -+ get_task_struct(p); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ ret = scx_ops_prepare_task(p, task_group(p)); -+ if (ret) { -+ put_task_struct(p); -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ pr_err("sched_ext: ops.prep_enable() failed (%d) for %s[%d] while loading\n", -+ ret, p->comm, p->pid); -+ goto err_disable_unlock; -+ } -+ -+ put_task_struct(p); -+ spin_lock_irq(&scx_tasks_lock); -+ } -+ scx_task_iter_exit(&sti); -+ -+ /* -+ * All tasks are prepped but are still ops-disabled. Ensure that -+ * %current can't be scheduled out and switch everyone. -+ * preempt_disable() is necessary because we can't guarantee that -+ * %current won't be starved if scheduled out while switching. -+ */ -+ preempt_disable(); -+ -+ /* -+ * From here on, the disable path must assume that tasks have ops -+ * enabled and need to be recovered. -+ */ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { -+ preempt_enable(); -+ spin_unlock_irq(&scx_tasks_lock); -+ ret = -EBUSY; -+ goto err_disable_unlock; -+ } -+ -+ /* -+ * We're fully committed and can't fail. The PREPPED -> ENABLED -+ * transitions here are synchronized against sched_ext_free() through -+ * scx_tasks_lock. 
-+ */ -+ WRITE_ONCE(scx_switching_all, scx_switch_all_req); -+ -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_filtered_locked(&sti))) { -+ if (READ_ONCE(p->__state) != TASK_DEAD) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, -+ &ctx); -+ -+ scx_ops_enable_task(p); -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } else { -+ scx_ops_disable_task(p); -+ } -+ } -+ scx_task_iter_exit(&sti); -+ -+ spin_unlock_irq(&scx_tasks_lock); -+ preempt_enable(); -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { -+ ret = -EBUSY; -+ goto err_disable; -+ } -+ -+ if (scx_switch_all_req) -+ static_branch_enable_cpuslocked(&__scx_switched_all); -+ -+ cpus_read_unlock(); -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ scx_cgroup_config_knobs(); -+ -+ return 0; -+ -+err_unlock: -+ mutex_unlock(&scx_ops_enable_mutex); -+ return ret; -+ -+err_disable_unlock: -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+err_disable: -+ cpus_read_unlock(); -+ mutex_unlock(&scx_ops_enable_mutex); -+ /* must be fully disabled before returning */ -+ scx_ops_disable(SCX_EXIT_ERROR); -+ kthread_flush_work(&scx_ops_disable_work); -+ return ret; -+} -+ -+#ifdef CONFIG_SCHED_DEBUG -+static int scx_debug_show(struct seq_file *m, void *v) -+{ -+ mutex_lock(&scx_ops_enable_mutex); -+ seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name); -+ seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled()); -+ seq_printf(m, "%-30s: %d\n", "switching_all", -+ READ_ONCE(scx_switching_all)); -+ seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all()); -+ seq_printf(m, "%-30s: %s\n", "enable_state", -+ scx_ops_enable_state_str[scx_ops_enable_state()]); -+ seq_printf(m, "%-30s: %lu\n", "nr_rejected", -+ atomic_long_read(&scx_nr_rejected)); -+ mutex_unlock(&scx_ops_enable_mutex); -+ return 0; -+} -+ -+static int scx_debug_open(struct inode *inode, struct file *file) -+{ -+ return single_open(file, scx_debug_show, NULL); -+} -+ -+const struct file_operations sched_ext_fops = { -+ .open = scx_debug_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = single_release, -+}; -+#endif -+ -+/******************************************************************************** -+ * bpf_struct_ops plumbing. 
-+ */ -+#include -+#include -+#include -+ -+extern struct btf *btf_vmlinux; -+static const struct btf_type *task_struct_type; -+ -+static bool bpf_scx_is_valid_access(int off, int size, -+ enum bpf_access_type type, -+ const struct bpf_prog *prog, -+ struct bpf_insn_access_aux *info) -+{ -+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) -+ return false; -+ if (type != BPF_READ) -+ return false; -+ if (off % size != 0) -+ return false; -+ -+ return btf_ctx_access(off, size, type, prog, info); -+} -+ -+static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, -+ const struct bpf_reg_state *reg, int off, -+ int size) -+{ -+ const struct btf_type *t; -+ -+ t = btf_type_by_id(reg->btf, reg->btf_id); -+ if (t == task_struct_type) { -+ if (off >= offsetof(struct task_struct, scx.slice) && -+ off + size <= offsetofend(struct task_struct, scx.slice)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.dsq_vtime) && -+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.disallow) && -+ off + size <= offsetofend(struct task_struct, scx.disallow)) -+ return SCALAR_VALUE; -+ } -+ -+ return -EACCES; -+} -+ -+static const struct bpf_func_proto * -+bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -+{ -+ switch (func_id) { -+ case BPF_FUNC_task_storage_get: -+ return &bpf_task_storage_get_proto; -+ case BPF_FUNC_task_storage_delete: -+ return &bpf_task_storage_delete_proto; -+ default: -+ return bpf_base_func_proto(func_id); -+ } -+} -+ -+const struct bpf_verifier_ops bpf_scx_verifier_ops = { -+ .get_func_proto = bpf_scx_get_func_proto, -+ .is_valid_access = bpf_scx_is_valid_access, -+ .btf_struct_access = bpf_scx_btf_struct_access, -+}; -+ -+static int bpf_scx_init_member(const struct btf_type *t, -+ const struct btf_member *member, -+ void *kdata, const void *udata) -+{ -+ const struct sched_ext_ops *uops = udata; -+ struct sched_ext_ops *ops = kdata; -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ int ret; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, dispatch_max_batch): -+ if (*(u32 *)(udata + moff) > INT_MAX) -+ return -E2BIG; -+ ops->dispatch_max_batch = *(u32 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, flags): -+ if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) -+ return -EINVAL; -+ ops->flags = *(u64 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, name): -+ ret = bpf_obj_name_cpy(ops->name, uops->name, -+ sizeof(ops->name)); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ return -EINVAL; -+ return 1; -+ case offsetof(struct sched_ext_ops, timeout_ms): -+ if (*(u32 *)(udata + moff) > SCX_WATCHDOG_MAX_TIMEOUT) -+ return -E2BIG; -+ ops->timeout_ms = *(u32 *)(udata + moff); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static int bpf_scx_check_member(const struct btf_type *t, -+ const struct btf_member *member, -+ const struct bpf_prog *prog) -+{ -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, prep_enable): -+#ifdef CONFIG_EXT_GROUP_SCHED -+ case offsetof(struct sched_ext_ops, cgroup_init): -+ case offsetof(struct sched_ext_ops, cgroup_exit): -+ case offsetof(struct sched_ext_ops, cgroup_prep_move): -+#endif -+ case offsetof(struct sched_ext_ops, init): -+ case offsetof(struct sched_ext_ops, exit): -+ break; -+ default: -+ if (prog->aux->sleepable) -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int 
bpf_scx_reg(void *kdata) -+{ -+ return scx_ops_enable(kdata); -+} -+ -+static void bpf_scx_unreg(void *kdata) -+{ -+ scx_ops_disable(SCX_EXIT_UNREG); -+ kthread_flush_work(&scx_ops_disable_work); -+} -+ -+static int bpf_scx_init(struct btf *btf) -+{ -+ u32 type_id; -+ -+ type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); -+ if (type_id < 0) -+ return -EINVAL; -+ task_struct_type = btf_type_by_id(btf, type_id); -+ -+ return 0; -+} -+ -+static int bpf_scx_update(void *kdata, void *old_kdata) -+{ -+ /* -+ * sched_ext does not support updating the actively-loaded BPF -+ * scheduler, as registering a BPF scheduler can always fail if the -+ * scheduler returns an error code for e.g. ops.init(), -+ * ops.prep_enable(), etc. Similarly, we can always race with -+ * unregistration happening elsewhere, such as with sysrq. -+ */ -+ return -EOPNOTSUPP; -+} -+ -+static int bpf_scx_validate(void *kdata) -+{ -+ return 0; -+} -+ -+/* "extern" to avoid sparse warning, only used in this file */ -+extern struct bpf_struct_ops bpf_sched_ext_ops; -+ -+struct bpf_struct_ops bpf_sched_ext_ops = { -+ .verifier_ops = &bpf_scx_verifier_ops, -+ .reg = bpf_scx_reg, -+ .unreg = bpf_scx_unreg, -+ .check_member = bpf_scx_check_member, -+ .init_member = bpf_scx_init_member, -+ .init = bpf_scx_init, -+ .update = bpf_scx_update, -+ .validate = bpf_scx_validate, -+ .name = "sched_ext_ops", -+}; -+ -+static void sysrq_handle_sched_ext_reset(u8 key) -+{ -+ if (scx_ops_helper) -+ scx_ops_disable(SCX_EXIT_SYSRQ); -+ else -+ pr_info("sched_ext: BPF scheduler not yet used\n"); -+} -+ -+static const struct sysrq_key_op sysrq_sched_ext_reset_op = { -+ .handler = sysrq_handle_sched_ext_reset, -+ .help_msg = "reset-sched-ext(S)", -+ .action_msg = "Disable sched_ext and revert all tasks to CFS", -+ .enable_mask = SYSRQ_ENABLE_RTNICE, -+}; -+ -+static void kick_cpus_irq_workfn(struct irq_work *irq_work) -+{ -+ struct rq *this_rq = this_rq(); -+ unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); -+ int this_cpu = cpu_of(this_rq); -+ int cpu; -+ -+ for_each_cpu(cpu, this_rq->scx.cpus_to_kick) { -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_rq_lock_irqsave(rq, flags); -+ -+ if (cpu_online(cpu) || cpu == this_cpu) { -+ if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) && -+ rq->curr->sched_class == &ext_sched_class) -+ rq->curr->scx.slice = 0; -+ pseqs[cpu] = rq->scx.pnt_seq; -+ resched_curr(rq); -+ } else { -+ cpumask_clear_cpu(cpu, this_rq->scx.cpus_to_wait); -+ } -+ -+ raw_spin_rq_unlock_irqrestore(rq, flags); -+ } -+ -+ for_each_cpu_andnot(cpu, this_rq->scx.cpus_to_wait, -+ cpumask_of(this_cpu)) { -+ /* -+ * Pairs with smp_store_release() issued by this CPU in -+ * scx_notify_pick_next_task() on the resched path. -+ * -+ * We busy-wait here to guarantee that no other task can be -+ * scheduled on our core before the target CPU has entered the -+ * resched path. -+ */ -+ while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu]) -+ cpu_relax(); -+ } -+ -+ cpumask_clear(this_rq->scx.cpus_to_kick); -+ cpumask_clear(this_rq->scx.cpus_to_preempt); -+ cpumask_clear(this_rq->scx.cpus_to_wait); -+} -+ -+/** -+ * print_scx_info - print out sched_ext scheduler state -+ * @log_lvl: the log level to use when printing -+ * @p: target task -+ * -+ * If a sched_ext scheduler is enabled, print the name and state of the -+ * scheduler. If @p is on sched_ext, print further information about the task. 
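bpf_sched_ext_ops above is the registration target for the whole file: attaching a SEC(".struct_ops") map of type struct sched_ext_ops is what ends up in bpf_scx_reg() and therefore scx_ops_enable(). From userspace that attach is a few libbpf calls; the object and map names below are illustrative, and a real loader would normally go through a generated skeleton instead:

#include <bpf/libbpf.h>
#include <stdio.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_map *ops_map;
	struct bpf_link *link;

	obj = bpf_object__open_file("minimal_sched.bpf.o", NULL);
	if (!obj || bpf_object__load(obj))
		return 1;

	/* the SEC(".struct_ops") variable of type struct sched_ext_ops */
	ops_map = bpf_object__find_map_by_name(obj, "minimal_ops");
	if (!ops_map)
		return 1;

	/* attaching the struct_ops map invokes bpf_scx_reg() -> scx_ops_enable() */
	link = bpf_map__attach_struct_ops(ops_map);
	if (!link)
		return 1;

	printf("scheduler loaded; press enter to unload\n");
	getchar();

	/* destroying the link unregisters the scheduler (bpf_scx_unreg()) */
	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}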
-+ * -+ * This function can be safely called on any task as long as the task_struct -+ * itself is accessible. While safe, this function isn't synchronized and may -+ * print out mixups or garbages of limited length. -+ */ -+void print_scx_info(const char *log_lvl, struct task_struct *p) -+{ -+ enum scx_ops_enable_state state = scx_ops_enable_state(); -+ const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; -+ char runnable_at_buf[22] = "?"; -+ struct sched_class *class; -+ unsigned long runnable_at; -+ -+ if (state == SCX_OPS_DISABLED) -+ return; -+ -+ /* -+ * Carefully check if the task was running on sched_ext, and then -+ * carefully copy the time it's been runnable, and its state. -+ */ -+ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || -+ class != &ext_sched_class) { -+ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, -+ scx_ops_enable_state_str[state], all); -+ return; -+ } -+ -+ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, -+ sizeof(runnable_at))) -+ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+lldms", -+ (s64)(runnable_at - jiffies) * (HZ / MSEC_PER_SEC)); -+ -+ /* Print everything onto one line to conserve console spce. */ -+ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", -+ log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, -+ runnable_at_buf); -+} -+ -+void __init init_sched_ext_class(void) -+{ -+ int cpu; -+ u32 v; -+ -+ /* -+ * The following is to prevent the compiler from optimizing out the enum -+ * definitions so that BPF scheduler implementations can use them -+ * through the generated vmlinux.h. -+ */ -+ WRITE_ONCE(v, SCX_WAKE_EXEC | SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | -+ SCX_TG_ONLINE | SCX_KICK_PREEMPT); -+ -+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); -+#ifdef CONFIG_SMP -+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); -+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -+#endif -+ scx_kick_cpus_pnt_seqs = -+ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * -+ num_possible_cpus(), -+ __alignof__(scx_kick_cpus_pnt_seqs[0])); -+ BUG_ON(!scx_kick_cpus_pnt_seqs); -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); -+ INIT_LIST_HEAD(&rq->scx.watchdog_list); -+ -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); -+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); -+ } -+ -+ register_sysrq_key('S', &sysrq_sched_ext_reset_op); -+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); -+ scx_cgroup_config_knobs(); -+} -+ -+ -+/******************************************************************************** -+ * Helpers that can be called from the BPF scheduler. -+ */ -+#include -+ -+/* Disables missing prototype warnings for kfuncs */ -+__diag_push(); -+__diag_ignore_all("-Wmissing-prototypes", -+ "Global functions as their definitions will be in vmlinux BTF"); -+ -+/** -+ * scx_bpf_switch_all - Switch all tasks into SCX -+ * -+ * Switch all existing and future non-dl/rt tasks to SCX. This can only be -+ * called from ops.init(), and actual switching is performed asynchronously. 
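A BPF scheduler would typically pair scx_bpf_switch_all() with its other one-time setup in ops.init(). The sketch below shows such an init callback; it assumes the BPF-side declarations and the BPF_STRUCT_OPS() convenience macro from the BPF headers shipped with this series, and scx_bpf_error() is likewise assumed to be the printf-style wrapper those headers provide around scx_bpf_error_bstr(), so treat those names as assumptions:

/* illustrative ops.init(): opt every task in and create one custom DSQ */
#define MY_DSQ_ID 0   /* arbitrary non-builtin DSQ id for this sketch */

s32 BPF_STRUCT_OPS(minimal_init)
{
	s32 ret;

	scx_bpf_switch_all();   /* all current and future non-dl/rt tasks use SCX */

	ret = scx_bpf_create_dsq(MY_DSQ_ID, -1 /* NUMA_NO_NODE */);
	if (ret)
		scx_bpf_error("failed to create DSQ %d (%d)", MY_DSQ_ID, ret);

	return ret;
}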
-+ */ -+void scx_bpf_switch_all(void) -+{ -+ if (!scx_kf_allowed(SCX_KF_INIT)) -+ return; -+ -+ scx_switch_all_req = true; -+} -+ -+BTF_SET8_START(scx_kfunc_ids_init) -+BTF_ID_FLAGS(func, scx_bpf_switch_all) -+BTF_SET8_END(scx_kfunc_ids_init) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_init = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_init, -+}; -+ -+/** -+ * scx_bpf_create_dsq - Create a custom DSQ -+ * @dsq_id: DSQ to create -+ * @node: NUMA node to allocate from -+ * -+ * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(), -+ * ops.prep_enable(), ops.cgroup_init() and ops.cgroup_prep_move(). -+ */ -+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) -+{ -+ if (!scx_kf_allowed(SCX_KF_INIT | SCX_KF_SLEEPABLE)) -+ return -EINVAL; -+ -+ if (unlikely(node >= (int)nr_node_ids || -+ (node < 0 && node != NUMA_NO_NODE))) -+ return -EINVAL; -+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); -+} -+ -+BTF_SET8_START(scx_kfunc_ids_sleepable) -+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -+BTF_SET8_END(scx_kfunc_ids_sleepable) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_sleepable, -+}; -+ -+static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) -+{ -+ if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) -+ return false; -+ -+ lockdep_assert_irqs_disabled(); -+ -+ if (unlikely(!p)) { -+ scx_ops_error("called with NULL task"); -+ return false; -+ } -+ -+ if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { -+ scx_ops_error("invalid enq_flags 0x%llx", enq_flags); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) -+{ -+ struct task_struct *ddsp_task; -+ int idx; -+ -+ ddsp_task = __this_cpu_read(direct_dispatch_task); -+ if (ddsp_task) { -+ direct_dispatch(ddsp_task, p, dsq_id, enq_flags); -+ return; -+ } -+ -+ idx = __this_cpu_read(scx_dsp_ctx.buf_cursor); -+ if (unlikely(idx >= scx_dsp_max_batch)) { -+ scx_ops_error("dispatch buffer overflow"); -+ return; -+ } -+ -+ this_cpu_ptr(scx_dsp_buf)[idx] = (struct scx_dsp_buf_ent){ -+ .task = p, -+ .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, -+ .dsq_id = dsq_id, -+ .enq_flags = enq_flags, -+ }; -+ __this_cpu_inc(scx_dsp_ctx.buf_cursor); -+} -+ -+/** -+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe -+ * to call this function spuriously. Can be called from ops.enqueue() and -+ * ops.dispatch(). -+ * -+ * When called from ops.enqueue(), it's for direct dispatch and @p must match -+ * the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be used to target the -+ * local DSQ of a CPU other than the enqueueing one. Use ops.select_cpu() to be -+ * on the target CPU in the first place. -+ * -+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id -+ * and this function can be called upto ops.dispatch_max_batch times to dispatch -+ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the -+ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. -+ * -+ * This function doesn't have any locking restrictions and may be called under -+ * BPF locks (in the future when BPF introduces more flexible locking). 
-+ * -+ * @p is allowed to run for @slice. The scheduling path is triggered on slice -+ * exhaustion. If zero, the current residual slice is maintained. If -+ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with -+ * scx_bpf_kick_cpu() to trigger scheduling. -+ */ -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags); -+} -+ -+/** -+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs -+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. -+ * Tasks queued into the priority queue are ordered by @vtime and always -+ * consumed after the tasks in the FIFO queue. All other aspects are identical -+ * to scx_bpf_dispatch(). -+ * -+ * @vtime ordering is according to time_before64() which considers wrapping. A -+ * numerically larger vtime may indicate an earlier position in the ordering and -+ * vice-versa. -+ */ -+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 vtime, u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ p->scx.dsq_vtime = vtime; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); -+} -+ -+BTF_SET8_START(scx_kfunc_ids_enqueue_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) -+BTF_SET8_END(scx_kfunc_ids_enqueue_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_enqueue_dispatch, -+}; -+ -+/** -+ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots -+ * -+ * Can only be called from ops.dispatch(). -+ */ -+u32 scx_bpf_dispatch_nr_slots(void) -+{ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return 0; -+ -+ return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx.buf_cursor); -+} -+ -+/** -+ * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ -+ * @dsq_id: DSQ to consume -+ * -+ * Consume a task from the non-local DSQ identified by @dsq_id and transfer it -+ * to the current CPU's local DSQ for execution. Can only be called from -+ * ops.dispatch(). -+ * -+ * This function flushes the in-flight dispatches from scx_bpf_dispatch() before -+ * trying to consume the specified DSQ. It may also grab rq locks and thus can't -+ * be called under any BPF locks. -+ * -+ * Returns %true if a task has been consumed, %false if there isn't any task to -+ * consume. 
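Put together, scx_bpf_dispatch() from ops.enqueue() and scx_bpf_consume() from ops.dispatch() are enough for a minimal global-FIFO scheduler. The sketch below shows that pairing plus the struct_ops table it would be registered through; it assumes the BPF-side declarations and BPF_STRUCT_OPS() macro from the series' BPF headers, and the field values are only examples:

/* minimal global FIFO: every runnable task goes to the shared global DSQ */
void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

/* refill this CPU's local DSQ from the global DSQ when it runs dry */
void BPF_STRUCT_OPS(minimal_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_consume(SCX_DSQ_GLOBAL);
}

SEC(".struct_ops")
struct sched_ext_ops minimal_ops = {
	.enqueue    = (void *)minimal_enqueue,
	.dispatch   = (void *)minimal_dispatch,
	.timeout_ms = 5000,        /* checked against SCX_WATCHDOG_MAX_TIMEOUT */
	.name       = "minimal",   /* copied and validated by bpf_scx_init_member() */
};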
-+ */ -+bool scx_bpf_consume(u64 dsq_id) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); -+ struct scx_dispatch_q *dsq; -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return false; -+ -+ flush_dispatch_buf(dspc->rq, dspc->rf); -+ -+ dsq = find_non_local_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); -+ return false; -+ } -+ -+ if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { -+ /* -+ * A successfully consumed task can be dequeued before it starts -+ * running while the CPU is trying to migrate other dispatched -+ * tasks. Bump nr_tasks to tell balance_scx() to retry on empty -+ * local DSQ. -+ */ -+ dspc->nr_tasks++; -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+BTF_SET8_START(scx_kfunc_ids_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) -+BTF_ID_FLAGS(func, scx_bpf_consume) -+BTF_SET8_END(scx_kfunc_ids_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_dispatch, -+}; -+ -+/** -+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ -+ * -+ * Iterate over all of the tasks currently enqueued on the local DSQ of the -+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of -+ * processed tasks. Can only be called from ops.cpu_release(). -+ */ -+u32 scx_bpf_reenqueue_local(void) -+{ -+ u32 nr_enqueued, i; -+ struct rq *rq; -+ struct scx_rq *scx_rq; -+ -+ if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) -+ return 0; -+ -+ rq = cpu_rq(smp_processor_id()); -+ lockdep_assert_rq_held(rq); -+ scx_rq = &rq->scx; -+ -+ /* -+ * Get the number of tasks on the local DSQ before iterating over it to -+ * pull off tasks. The enqueue callback below can signal that it wants -+ * the task to stay on the local DSQ, and we want to prevent the BPF -+ * scheduler from causing us to loop indefinitely. -+ */ -+ nr_enqueued = scx_rq->local_dsq.nr; -+ for (i = 0; i < nr_enqueued; i++) { -+ struct task_struct *p; -+ -+ p = first_local_task(rq); -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != -+ SCX_OPSS_NONE); -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ WARN_ON_ONCE(p->scx.holding_cpu != -1); -+ dispatch_dequeue(scx_rq, p); -+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); -+ } -+ -+ return nr_enqueued; -+} -+ -+BTF_SET8_START(scx_kfunc_ids_cpu_release) -+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) -+BTF_SET8_END(scx_kfunc_ids_cpu_release) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_cpu_release, -+}; -+ -+/** -+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU -+ * @cpu: cpu to kick -+ * @flags: %SCX_KICK_* flags -+ * -+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or -+ * trigger rescheduling on a busy CPU. This can be called from any online -+ * scx_ops operation and the actual kicking is performed asynchronously through -+ * an irq work. -+ */ -+void scx_bpf_kick_cpu(s32 cpu, u64 flags) -+{ -+ struct rq *rq; -+ -+ if (!ops_cpu_valid(cpu)) { -+ scx_ops_error("invalid cpu %d", cpu); -+ return; -+ } -+ -+ preempt_disable(); -+ rq = this_rq(); -+ -+ /* -+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting -+ * rq locks. We can probably be smarter and avoid bouncing if called -+ * from ops which don't hold a rq lock. 
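scx_bpf_reenqueue_local() above exists for the cpu_release path: when a higher-priority sched class takes the CPU away, tasks already parked on its local DSQ would otherwise be stranded there. A typical ops.cpu_release() simply hands them back to ops.enqueue(), as in this sketch (assumes the series' BPF headers; the release-args struct is passed through unused here):

/* the CPU was taken by a higher-priority class (RT, DL, stop task, ...) */
void BPF_STRUCT_OPS(minimal_cpu_release, s32 cpu,
		    struct scx_cpu_release_args *args)
{
	/*
	 * Give every task still sitting on this CPU's local DSQ back to
	 * ops.enqueue() so it can be placed somewhere that will actually run.
	 */
	u32 cnt = scx_bpf_reenqueue_local();

	(void)cnt;   /* a real scheduler might record this in a stat */
}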
-+ */ -+ cpumask_set_cpu(cpu, rq->scx.cpus_to_kick); -+ if (flags & SCX_KICK_PREEMPT) -+ cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt); -+ if (flags & SCX_KICK_WAIT) -+ cpumask_set_cpu(cpu, rq->scx.cpus_to_wait); -+ -+ irq_work_queue(&rq->scx.kick_cpus_irq_work); -+ preempt_enable(); -+} -+ -+/** -+ * scx_bpf_dsq_nr_queued - Return the number of queued tasks -+ * @dsq_id: id of the DSQ -+ * -+ * Return the number of tasks in the DSQ matching @dsq_id. If not found, -+ * -%ENOENT is returned. Can be called from any non-sleepable online scx_ops -+ * operations. -+ */ -+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ lockdep_assert(rcu_read_lock_any_held()); -+ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ return this_rq()->scx.local_dsq.nr; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (ops_cpu_valid(cpu)) -+ return cpu_rq(cpu)->scx.local_dsq.nr; -+ } else { -+ dsq = find_non_local_dsq(dsq_id); -+ if (dsq) -+ return dsq->nr; -+ } -+ return -ENOENT; -+} -+ -+/** -+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state -+ * @cpu: cpu to test and clear idle for -+ * -+ * Returns %true if @cpu was idle and its idle state was successfully cleared. -+ * %false otherwise. -+ * -+ * Unavailable if ops.update_idle() is implemented and -+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. -+ */ -+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return false; -+ } -+ -+ if (ops_cpu_valid(cpu)) -+ return test_and_clear_cpu_idle(cpu); -+ else -+ return false; -+} -+ -+/** -+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu -+ * @cpus_allowed: Allowed cpumask -+ * @flags: %SCX_PICK_IDLE_CPU_* flags -+ * -+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu -+ * number on success. -%EBUSY if no matching cpu was found. -+ * -+ * Idle CPU tracking may race against CPU scheduling state transitions. For -+ * example, this function may return -%EBUSY as CPUs are transitioning into the -+ * idle state. If the caller then assumes that there will be dispatch events on -+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs -+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and -+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch -+ * event in the near future. -+ * -+ * Unavailable if ops.update_idle() is implemented and -+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. -+ */ -+s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return -EBUSY; -+ } -+ -+ return scx_pick_idle_cpu(cpus_allowed, flags); -+} -+ -+/** -+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU -+ * @cpus_allowed: Allowed cpumask -+ * @flags: %SCX_PICK_IDLE_CPU_* flags -+ * -+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any -+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu -+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is -+ * empty. -+ * -+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not -+ * set, this function can't tell which CPUs are idle and will always pick any -+ * CPU. 
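On the selection side, a scheduler usually tries scx_bpf_test_and_clear_cpu_idle() on the previous CPU first and only then falls back to scx_bpf_pick_idle_cpu(), precisely because of the claim semantics documented above. A sketch of such an ops.select_cpu() (assumes the series' BPF headers; error handling kept minimal):

s32 BPF_STRUCT_OPS(minimal_select_cpu, struct task_struct *p, s32 prev_cpu,
		   u64 wake_flags)
{
	s32 cpu;

	/* cache-friendly first choice: stay put if the previous CPU is idle */
	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	/* otherwise claim any idle CPU the task is allowed to run on */
	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	/* nothing idle: keep the previous CPU and let ops.enqueue() decide */
	return prev_cpu;
}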
-+ */ -+s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ s32 cpu; -+ -+ if (static_branch_likely(&scx_builtin_idle_enabled)) { -+ cpu = scx_pick_idle_cpu(cpus_allowed, flags); -+ if (cpu >= 0) -+ return cpu; -+ } -+ -+ cpu = cpumask_any_distribute(cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ return cpu; -+ else -+ return -EBUSY; -+} -+ -+/** -+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking -+ * per-CPU cpumask. -+ * -+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel. -+ */ -+const struct cpumask *scx_bpf_get_idle_cpumask(void) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return cpu_none_mask; -+ } -+ -+#ifdef CONFIG_SMP -+ return idle_masks.cpu; -+#else -+ return cpu_none_mask; -+#endif -+} -+ -+/** -+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, -+ * per-physical-core cpumask. Can be used to determine if an entire physical -+ * core is free. -+ * -+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel. -+ */ -+const struct cpumask *scx_bpf_get_idle_smtmask(void) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return cpu_none_mask; -+ } -+ -+#ifdef CONFIG_SMP -+ if (sched_smt_active()) -+ return idle_masks.smt; -+ else -+ return idle_masks.cpu; -+#else -+ return cpu_none_mask; -+#endif -+} -+ -+/** -+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to -+ * either the percpu, or SMT idle-tracking cpumask. -+ */ -+void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -+{ -+ /* -+ * Empty function body because we aren't actually acquiring or -+ * releasing a reference to a global idle cpumask, which is read-only -+ * in the caller and is never released. The acquire / release semantics -+ * here are just used to make the cpumask is a trusted pointer in the -+ * caller. -+ */ -+} -+ -+struct scx_bpf_error_bstr_bufs { -+ u64 data[MAX_BPRINTF_VARARGS]; -+ char msg[SCX_EXIT_MSG_LEN]; -+}; -+ -+static DEFINE_PER_CPU(struct scx_bpf_error_bstr_bufs, scx_bpf_error_bstr_bufs); -+ -+/** -+ * scx_bpf_error_bstr - Indicate fatal error -+ * @fmt: error message format string -+ * @data: format string parameters packaged using ___bpf_fill() macro -+ * @data__sz: @data len, must end in '__sz' for the verifier -+ * -+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops -+ * disabling. 
-+ */ -+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) -+{ -+ struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; -+ struct scx_bpf_error_bstr_bufs *bufs; -+ unsigned long flags; -+ int ret; -+ -+ local_irq_save(flags); -+ bufs = this_cpu_ptr(&scx_bpf_error_bstr_bufs); -+ -+ if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || -+ (data__sz && !data)) { -+ scx_ops_error("invalid data=%p and data__sz=%u", -+ (void *)data, data__sz); -+ goto out_restore; -+ } -+ -+ ret = copy_from_kernel_nofault(bufs->data, data, data__sz); -+ if (ret) { -+ scx_ops_error("failed to read data fields (%d)", ret); -+ goto out_restore; -+ } -+ -+ ret = bpf_bprintf_prepare(fmt, UINT_MAX, bufs->data, data__sz / 8, -+ &bprintf_data); -+ if (ret < 0) { -+ scx_ops_error("failed to format prepration (%d)", ret); -+ goto out_restore; -+ } -+ -+ ret = bstr_printf(bufs->msg, sizeof(bufs->msg), fmt, -+ bprintf_data.bin_args); -+ bpf_bprintf_cleanup(&bprintf_data); -+ if (ret < 0) { -+ scx_ops_error("scx_ops_error(\"%s\", %p, %u) failed to format", -+ fmt, data, data__sz); -+ goto out_restore; -+ } -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_BPF, "%s", bufs->msg); -+out_restore: -+ local_irq_restore(flags); -+} -+ -+/** -+ * scx_bpf_destroy_dsq - Destroy a custom DSQ -+ * @dsq_id: DSQ to destroy -+ * -+ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with -+ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is -+ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ -+ * which doesn't exist. Can be called from any online scx_ops operations. -+ */ -+void scx_bpf_destroy_dsq(u64 dsq_id) -+{ -+ destroy_dsq(dsq_id); -+} -+ -+/** -+ * scx_bpf_task_running - Is task currently running? -+ * @p: task of interest -+ */ -+bool scx_bpf_task_running(const struct task_struct *p) -+{ -+ return task_rq(p)->curr == p; -+} -+ -+/** -+ * scx_bpf_task_cpu - CPU a task is currently associated with -+ * @p: task of interest -+ */ -+s32 scx_bpf_task_cpu(const struct task_struct *p) -+{ -+ return task_cpu(p); -+} -+ -+/** -+ * scx_bpf_task_cgroup - Return the sched cgroup of a task -+ * @p: task of interest -+ * -+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with -+ * from the scheduler's POV. SCX operations should use this function to -+ * determine @p's current cgroup as, unlike following @p->cgroups, -+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all -+ * rq-locked operations. Can be called on the parameter tasks of rq-locked -+ * operations. The restriction guarantees that @p's rq is locked by the caller. -+ */ -+#ifdef CONFIG_CGROUP_SCHED -+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) -+{ -+ struct task_group *tg = p->sched_task_group; -+ struct cgroup *cgrp = &cgrp_dfl_root.cgrp; -+ -+ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) -+ goto out; -+ -+ /* -+ * A task_group may either be a cgroup or an autogroup. In the latter -+ * case, @tg->css.cgroup is %NULL. A task_group can't become the other -+ * kind once created. 
-+ */ -+ if (tg && tg->css.cgroup) -+ cgrp = tg->css.cgroup; -+ else -+ cgrp = &cgrp_dfl_root.cgrp; -+out: -+ cgroup_get(cgrp); -+ return cgrp; -+} -+#endif -+ -+BTF_SET8_START(scx_kfunc_ids_ops_only) -+BTF_ID_FLAGS(func, scx_bpf_kick_cpu) -+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) -+BTF_SET8_END(scx_kfunc_ids_ops_only) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_ops_only = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_ops_only, -+}; -+ -+BTF_SET8_START(scx_kfunc_ids_any) -+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) -+#ifdef CONFIG_CGROUP_SCHED -+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) -+#endif -+BTF_SET8_END(scx_kfunc_ids_any) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_any = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_any, -+}; -+ -+__diag_pop(); -+ -+/* -+ * This can't be done from init_sched_ext_class() as register_btf_kfunc_id_set() -+ * needs most of the system to be up. -+ */ -+static int __init register_ext_kfuncs(void) -+{ -+ int ret; -+ -+ /* -+ * Some kfuncs are context-sensitive and can only be called from -+ * specific SCX ops. They are grouped into BTF sets accordingly. -+ * Unfortunately, BPF currently doesn't have a way of enforcing such -+ * restrictions. Eventually, the verifier should be able to enforce -+ * them. For now, register them the same and make each kfunc explicitly -+ * check using scx_kf_allowed(). -+ */ -+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_init)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_sleepable)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_enqueue_dispatch)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_dispatch)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_cpu_release)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_ops_only)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_any)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, -+ &scx_kfunc_set_any))) { -+ pr_err("sched_ext: failed to register kfunc sets (%d)\n", ret); -+ return ret; -+ } -+ -+ return 0; -+} -+__initcall(register_ext_kfuncs); -diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h -new file mode 100644 -index 000000000..27248760f ---- /dev/null -+++ b/kernel/sched/ext.h -@@ -0,0 +1,266 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+enum scx_wake_flags { -+ /* expose select WF_* flags as enums */ -+ SCX_WAKE_EXEC = WF_EXEC, -+ SCX_WAKE_FORK = WF_FORK, -+ SCX_WAKE_TTWU = WF_TTWU, -+ SCX_WAKE_SYNC = WF_SYNC, -+}; -+ -+enum scx_enq_flags { -+ /* expose select ENQUEUE_* flags as enums */ -+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, -+ SCX_ENQ_HEAD = ENQUEUE_HEAD, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * Set the following to trigger preemption when calling -+ * scx_bpf_dispatch() with a local dsq as the target. The slice of the -+ * current task is cleared to zero and the CPU is kicked into the -+ * scheduling path. Implies %SCX_ENQ_HEAD. -+ */ -+ SCX_ENQ_PREEMPT = 1LLU << 32, -+ -+ /* -+ * The task being enqueued was previously enqueued on the current CPU's -+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the -+ * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was -+ * invoked in a ->cpu_release() callback, and the task is again -+ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the -+ * task will not be scheduled on the CPU until at least the next invocation -+ * of the ->cpu_acquire() callback. -+ */ -+ SCX_ENQ_REENQ = 1LLU << 40, -+ -+ /* -+ * The task being enqueued is the only task available for the cpu. By -+ * default, ext core keeps executing such tasks but when -+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with -+ * %SCX_ENQ_LAST and %SCX_ENQ_LOCAL flags set. -+ * -+ * If the BPF scheduler wants to continue executing the task, -+ * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. -+ * If the task gets queued on a different dsq or the BPF side, the BPF -+ * scheduler is responsible for triggering a follow-up scheduling event. -+ * Otherwise, Execution may stall. -+ */ -+ SCX_ENQ_LAST = 1LLU << 41, -+ -+ /* -+ * A hint indicating that it's advisable to enqueue the task on the -+ * local dsq of the currently selected CPU. Currently used by -+ * select_cpu_dfl() and together with %SCX_ENQ_LAST. -+ */ -+ SCX_ENQ_LOCAL = 1LLU << 42, -+ -+ /* high 8 bits are internal */ -+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, -+ -+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56, -+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -+}; -+ -+enum scx_deq_flags { -+ /* expose select DEQUEUE_* flags as enums */ -+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * The generic core-sched layer decided to execute the task even though -+ * it hasn't been dispatched yet. Dequeue from the BPF side. 
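The comment on %SCX_ENQ_LAST above implies a specific pattern in ops.enqueue(); a hedged sketch of that pattern follows. scx_bpf_dispatch()'s (task, dsq, slice, flags) argument order and the SCX_DSQ_GLOBAL / SCX_SLICE_DFL constants are assumptions taken from the example tooling rather than from this hunk.

```c
/* Hypothetical ops.enqueue(): when the task is the last runnable one on the
 * CPU (SCX_ENQ_LAST, with SCX_OPS_ENQ_LAST selected), keep it on the local
 * DSQ so execution does not stall; everything else goes to a shared DSQ. */
void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	if (enq_flags & SCX_ENQ_LAST) {
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		return;
	}
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}
```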
-+ */ -+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -+}; -+ -+enum scx_pick_idle_cpu_flags { -+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ -+}; -+ -+enum scx_kick_flags { -+ SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ -+ SCX_KICK_WAIT = 1LLU << 1, /* wait for the CPU to be rescheduled */ -+}; -+ -+enum scx_tg_flags { -+ SCX_TG_ONLINE = 1U << 0, -+ SCX_TG_INITED = 1U << 1, -+}; -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+struct sched_enq_and_set_ctx { -+ struct task_struct *p; -+ int queue_flags; -+ bool queued; -+ bool running; -+}; -+ -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx); -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); -+ -+extern const struct sched_class ext_sched_class; -+extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; -+extern const struct file_operations sched_ext_fops; -+extern unsigned long scx_watchdog_timeout; -+extern unsigned long scx_watchdog_timestamp; -+ -+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); -+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) -+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) -+ -+DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+ -+static inline bool task_on_scx(const struct task_struct *p) -+{ -+ return scx_enabled() && p->sched_class == &ext_sched_class; -+} -+ -+bool task_should_scx(struct task_struct *p); -+void scx_pre_fork(struct task_struct *p); -+int scx_fork(struct task_struct *p); -+void scx_post_fork(struct task_struct *p); -+void scx_cancel_fork(struct task_struct *p); -+int scx_check_setscheduler(struct task_struct *p, int policy); -+bool scx_can_stop_tick(struct rq *rq); -+void init_sched_ext_class(void); -+ -+__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, -+ const char *fmt, ...); -+#define scx_ops_error(fmt, args...) \ -+ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) -+ -+void __scx_notify_pick_next_task(struct rq *rq, -+ struct task_struct *p, -+ const struct sched_class *active); -+ -+static inline void scx_notify_pick_next_task(struct rq *rq, -+ struct task_struct *p, -+ const struct sched_class *active) -+{ -+ if (!scx_enabled()) -+ return; -+#ifdef CONFIG_SMP -+ /* -+ * Pairs with the smp_load_acquire() issued by a CPU in -+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a -+ * resched. 
-+ */ -+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -+#endif -+ if (!static_branch_unlikely(&scx_ops_cpu_preempt)) -+ return; -+ __scx_notify_pick_next_task(rq, p, active); -+} -+ -+static inline void scx_notify_sched_tick(void) -+{ -+ unsigned long last_check; -+ -+ if (!scx_enabled()) -+ return; -+ -+ last_check = scx_watchdog_timestamp; -+ if (unlikely(time_after(jiffies, last_check + scx_watchdog_timeout))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "watchdog failed to check in for %u.%03us", -+ dur_ms / 1000, dur_ms % 1000); -+ } -+} -+ -+static inline const struct sched_class *next_active_class(const struct sched_class *class) -+{ -+ class++; -+ if (scx_switched_all() && class == &fair_sched_class) -+ class++; -+ if (!scx_enabled() && class == &ext_sched_class) -+ class++; -+ return class; -+} -+ -+#define for_active_class_range(class, _from, _to) \ -+ for (class = (_from); class != (_to); class = next_active_class(class)) -+ -+#define for_each_active_class(class) \ -+ for_active_class_range(class, __sched_class_highest, __sched_class_lowest) -+ -+/* -+ * SCX requires a balance() call before every pick_next_task() call including -+ * when waking up from idle. -+ */ -+#define for_balance_class_range(class, prev_class, end_class) \ -+ for_active_class_range(class, (prev_class) > &ext_sched_class ? \ -+ &ext_sched_class : (prev_class), (end_class)) -+ -+#ifdef CONFIG_SCHED_CORE -+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, -+ bool in_fi); -+#endif -+ -+#else /* CONFIG_SCHED_CLASS_EXT */ -+ -+#define scx_enabled() false -+#define scx_switched_all() false -+ -+static inline bool task_on_scx(const struct task_struct *p) { return false; } -+static inline void scx_pre_fork(struct task_struct *p) {} -+static inline int scx_fork(struct task_struct *p) { return 0; } -+static inline void scx_post_fork(struct task_struct *p) {} -+static inline void scx_cancel_fork(struct task_struct *p) {} -+static inline int scx_check_setscheduler(struct task_struct *p, -+ int policy) { return 0; } -+static inline bool scx_can_stop_tick(struct rq *rq) { return true; } -+static inline void init_sched_ext_class(void) {} -+static inline void scx_notify_pick_next_task(struct rq *rq, -+ const struct task_struct *p, -+ const struct sched_class *active) {} -+static inline void scx_notify_sched_tick(void) {} -+ -+#define for_each_active_class for_each_class -+#define for_balance_class_range for_class_range -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ -+#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) -+void __scx_update_idle(struct rq *rq, bool idle); -+ -+static inline void scx_update_idle(struct rq *rq, bool idle) -+{ -+ if (scx_enabled()) -+ __scx_update_idle(rq, idle); -+} -+#else -+static inline void scx_update_idle(struct rq *rq, bool idle) {} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+#ifdef CONFIG_EXT_GROUP_SCHED -+int scx_tg_online(struct task_group *tg); -+void scx_tg_offline(struct task_group *tg); -+int scx_cgroup_can_attach(struct cgroup_taskset *tset); -+void scx_move_task(struct task_struct *p); -+void scx_cgroup_finish_attach(void); -+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); -+void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); -+#else /* CONFIG_EXT_GROUP_SCHED */ -+static inline int scx_tg_online(struct task_group *tg) { return 0; } -+static inline void scx_tg_offline(struct task_group *tg) {} -+static inline int 
scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -+static inline void scx_move_task(struct task_struct *p) {} -+static inline void scx_cgroup_finish_attach(void) {} -+static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} -+static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+#endif /* CONFIG_CGROUP_SCHED */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index fa9fff0f9..1ed9d351c 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -3785,7 +3785,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - } - } - --void reweight_task(struct task_struct *p, int prio) -+static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) - { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); -@@ -8187,7 +8187,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ -- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) -+ if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) - return; - - find_matching_se(&se, &pse); -@@ -12325,14 +12325,14 @@ void trigger_load_balance(struct rq *rq) - nohz_balancer_kick(rq); - } - --static void rq_online_fair(struct rq *rq) -+static void rq_online_fair(struct rq *rq, enum rq_onoff_reason reason) - { - update_sysctl(); - - update_runtime_enabled(rq); - } - --static void rq_offline_fair(struct rq *rq) -+static void rq_offline_fair(struct rq *rq, enum rq_onoff_reason reason) - { - update_sysctl(); - -@@ -13024,6 +13024,7 @@ DEFINE_SCHED_CLASS(fair) = { - .task_tick = task_tick_fair, - .task_fork = task_fork_fair, - -+ .reweight_task = reweight_task_fair, - .prio_changed = prio_changed_fair, - .switched_from = switched_from_fair, - .switched_to = switched_to_fair, -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 5007b25c5..b33cefeb4 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -408,11 +408,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl - - static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) - { -+ scx_update_idle(rq, false); - } - - static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) - { - update_idle_core(rq); -+ scx_update_idle(rq, true); - schedstat_inc(rq->sched_goidle); - } - -diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c -index 904dd8534..449a9f28d 100644 ---- a/kernel/sched/rt.c -+++ b/kernel/sched/rt.c -@@ -2481,7 +2481,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) - } - - /* Assumes rq->lock is held */ --static void rq_online_rt(struct rq *rq) -+static void rq_online_rt(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->rt.overloaded) - rt_set_overload(rq); -@@ -2492,7 +2492,7 @@ static void rq_online_rt(struct rq *rq) - } - - /* Assumes rq->lock is held */ --static void rq_offline_rt(struct rq *rq) -+static void rq_offline_rt(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->rt.overloaded) - rt_clear_overload(rq); -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 048462724..0b33d0117 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -182,9 +182,19 @@ static inline int idle_policy(int policy) - { - return policy == SCHED_IDLE; - } -+ -+static inline int normal_policy(int policy) -+{ 
-+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (policy == SCHED_EXT) -+ return true; -+#endif -+ return policy == SCHED_NORMAL; -+} -+ - static inline int fair_policy(int policy) - { -- return policy == SCHED_NORMAL || policy == SCHED_BATCH; -+ return normal_policy(policy) || policy == SCHED_BATCH; - } - - static inline int rt_policy(int policy) -@@ -232,6 +242,24 @@ static inline void update_avg(u64 *avg, u64 sample) - #define shr_bound(val, shift) \ - (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) - -+/* -+ * cgroup weight knobs should use the common MIN, DFL and MAX values which are -+ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it -+ * maps pretty well onto the shares value used by scheduler and the round-trip -+ * conversions preserve the original value over the entire range. -+ */ -+static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) -+{ -+ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); -+} -+ -+static inline unsigned long sched_weight_to_cgroup(unsigned long weight) -+{ -+ return clamp_t(unsigned long, -+ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), -+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); -+} -+ - /* - * !! For sched_setattr_nocheck() (kernel) only !! - * -@@ -390,6 +418,11 @@ struct task_group { - struct rt_bandwidth rt_bandwidth; - #endif - -+#ifdef CONFIG_EXT_GROUP_SCHED -+ u32 scx_flags; /* SCX_TG_* */ -+ u32 scx_weight; -+#endif -+ - struct rcu_head rcu; - struct list_head list; - -@@ -445,6 +478,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) - return walk_tg_tree_from(&root_task_group, down, up, data); - } - -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ - extern int tg_nop(struct task_group *tg, void *data); - - extern void free_fair_sched_group(struct task_group *tg); -@@ -490,6 +528,11 @@ extern void set_task_rq_fair(struct sched_entity *se, - static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } - #endif /* CONFIG_SMP */ -+#else /* CONFIG_FAIR_GROUP_SCHED */ -+static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) -+{ -+ return 0; -+} - #endif /* CONFIG_FAIR_GROUP_SCHED */ - - #else /* CONFIG_CGROUP_SCHED */ -@@ -651,6 +694,28 @@ struct cfs_rq { - #endif /* CONFIG_FAIR_GROUP_SCHED */ - }; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+/* scx_rq->flags, protected by the rq lock */ -+enum scx_rq_flags { -+ SCX_RQ_CAN_STOP_TICK = 1 << 0, -+}; -+ -+struct scx_rq { -+ struct scx_dispatch_q local_dsq; -+ struct list_head watchdog_list; -+ unsigned long ops_qseq; -+ u64 extra_enq_flags; /* see move_task_to_local_dsq() */ -+ u32 nr_running; -+ u32 flags; -+ bool cpu_released; -+ cpumask_var_t cpus_to_kick; -+ cpumask_var_t cpus_to_preempt; -+ cpumask_var_t cpus_to_wait; -+ unsigned long pnt_seq; -+ struct irq_work kick_cpus_irq_work; -+}; -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ - static inline int rt_bandwidth_enabled(void) - { - return sysctl_sched_rt_runtime >= 0; -@@ -998,6 +1063,9 @@ struct rq { - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct scx_rq scx; -+#endif - - #ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this CPU: */ -@@ -2222,6 +2290,11 @@ extern const u32 sched_prio_to_wmult[40]; - - #define RETRY_TASK ((void *)-1UL) - -+enum rq_onoff_reason { -+ RQ_ONOFF_HOTPLUG, /* CPU is going on/offline */ -+ RQ_ONOFF_TOPOLOGY, /* sched domain topology update */ -+}; -+ - struct affinity_context { - const struct cpumask *new_mask; - struct cpumask *user_mask; -@@ -2258,8 +2331,8 @@ struct sched_class { - - void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); - -- void (*rq_online)(struct rq *rq); -- void (*rq_offline)(struct rq *rq); -+ void (*rq_online)(struct rq *rq, enum rq_onoff_reason reason); -+ void (*rq_offline)(struct rq *rq, enum rq_onoff_reason reason); - - struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); - #endif -@@ -2273,8 +2346,11 @@ struct sched_class { - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. 
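The sched_weight_from_cgroup()/sched_weight_to_cgroup() helpers added earlier in this hunk claim that the round-trip conversion preserves cgroup weights across the whole 1..10000 range; that can be spot-checked with a few lines of standalone userspace arithmetic (DIV_ROUND_CLOSEST open-coded for positive values, clamping omitted).

```c
#include <assert.h>

#define CGROUP_WEIGHT_DFL 100UL

static unsigned long from_cgroup(unsigned long w)
{
	return (w * 1024 + CGROUP_WEIGHT_DFL / 2) / CGROUP_WEIGHT_DFL;
}

static unsigned long to_cgroup(unsigned long s)
{
	return (s * CGROUP_WEIGHT_DFL + 512) / 1024;
}

int main(void)
{
	assert(from_cgroup(100) == 1024);		/* default weight -> default share */
	assert(to_cgroup(from_cgroup(1)) == 1);		/* low end survives the round trip */
	assert(to_cgroup(from_cgroup(10000)) == 10000);	/* high end does too */
	return 0;
}
```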
- */ -+ void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); -+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task, -+ int newprio); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); - -@@ -2432,7 +2508,7 @@ extern void init_sched_dl_class(void); - extern void init_sched_rt_class(void); - extern void init_sched_fair_class(void); - --extern void reweight_task(struct task_struct *p, int prio); -+extern void __setscheduler_prio(struct task_struct *p, int prio); - - extern void resched_curr(struct rq *rq); - extern void resched_cpu(int cpu); -@@ -2513,6 +2589,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); - extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); - -+extern void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class); -+extern void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio); -+ - extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - - #ifdef CONFIG_PREEMPT_RT -@@ -2794,8 +2876,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - raw_spin_rq_unlock(rq1); - } - --extern void set_rq_online (struct rq *rq); --extern void set_rq_offline(struct rq *rq); -+extern void set_rq_online (struct rq *rq, enum rq_onoff_reason reason); -+extern void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason); - extern bool sched_smp_initialized; - - #else /* CONFIG_SMP */ -@@ -3528,4 +3610,27 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } - extern u64 avg_vruntime(struct cfs_rq *cfs_rq); - extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); - -+#ifdef CONFIG_CGROUP_SCHED -+enum cpu_cftype_id { -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ CPU_CFTYPE_WEIGHT, -+ CPU_CFTYPE_WEIGHT_NICE, -+ CPU_CFTYPE_IDLE, -+#endif -+#ifdef CONFIG_CFS_BANDWIDTH -+ CPU_CFTYPE_MAX, -+ CPU_CFTYPE_MAX_BURST, -+#endif -+#ifdef CONFIG_UCLAMP_TASK_GROUP -+ CPU_CFTYPE_UCLAMP_MIN, -+ CPU_CFTYPE_UCLAMP_MAX, -+#endif -+ CPU_CFTYPE_CNT, -+}; -+ -+extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#include "ext.h" -+ - #endif /* _KERNEL_SCHED_SCHED_H */ -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 423d08947..2adf6a0fb 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -495,7 +495,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) -- set_rq_offline(rq); -+ set_rq_offline(rq, RQ_ONOFF_TOPOLOGY); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - -@@ -513,7 +513,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) -- set_rq_online(rq); -+ set_rq_online(rq, RQ_ONOFF_TOPOLOGY); - - rq_unlock_irqrestore(rq, &rf); - -diff --git a/lib/dump_stack.c b/lib/dump_stack.c -index 83471e815..6e667c445 100644 ---- a/lib/dump_stack.c -+++ b/lib/dump_stack.c -@@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) - - print_worker_info(log_lvl, current); - print_stop_info(log_lvl, current); -+ 
print_scx_info(log_lvl, current); - } - - /** -diff --git a/tools/Makefile b/tools/Makefile -index 37e9f6804..8021267f7 100644 ---- a/tools/Makefile -+++ b/tools/Makefile -@@ -29,6 +29,7 @@ help: - @echo ' pci - PCI tools' - @echo ' perf - Linux performance measurement and analysis tool' - @echo ' selftests - various kernel selftests' -+ @echo ' sched_ext - sched_ext example schedulers' - @echo ' bootconfig - boot config tool' - @echo ' spi - spi tools' - @echo ' tmon - thermal monitoring and tuning tool' -@@ -92,6 +93,9 @@ perf: FORCE - $(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= - -+sched_ext: FORCE -+ $(call descend,sched_ext) -+ - selftests: FORCE - $(call descend,testing/$@) - -@@ -185,6 +189,9 @@ perf_clean: - $(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean - -+sched_ext_clean: -+ $(call descend,sched_ext,clean) -+ - selftests_clean: - $(call descend,testing/$(@:_clean=),clean) - -@@ -214,6 +221,7 @@ clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_cl - mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ - freefall_clean build_clean libbpf_clean libsubcmd_clean \ - gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ -- intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean -+ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ -+ sched_ext_clean - - .PHONY: FORCE -diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore -new file mode 100644 -index 000000000..215ed36b2 ---- /dev/null -+++ b/tools/sched_ext/.gitignore -@@ -0,0 +1,10 @@ -+scx_simple -+scx_qmap -+scx_central -+scx_pair -+scx_flatcg -+scx_userland -+*.skel.h -+*.subskel.h -+/tools/ -+build/ -diff --git a/tools/sched_ext/Kconfig b/tools/sched_ext/Kconfig -new file mode 100644 -index 000000000..6543fcf19 ---- /dev/null -+++ b/tools/sched_ext/Kconfig -@@ -0,0 +1,9 @@ -+CONFIG_BPF=y -+CONFIG_SCHED_CLASS_EXT=y -+CONFIG_BPF_SYSCALL=y -+CONFIG_BPF_JIT=y -+CONFIG_DEBUG_INFO_BTF=y -+CONFIG_BPF_JIT_ALWAYS_ON=y -+CONFIG_BPF_JIT_DEFAULT_ON=y -+CONFIG_PAHOLE_HAS_SPLIT_BTF=y -+CONFIG_PAHOLE_HAS_BTF_TAG=y -diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile -new file mode 100644 -index 000000000..7ea754b7d ---- /dev/null -+++ b/tools/sched_ext/Makefile -@@ -0,0 +1,301 @@ -+# SPDX-License-Identifier: GPL-2.0 -+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+include ../build/Build.include -+include ../scripts/Makefile.arch -+include ../scripts/Makefile.include -+ -+all: all_targets -+ -+ifneq ($(LLVM),) -+ifneq ($(filter %/,$(LLVM)),) -+LLVM_PREFIX := $(LLVM) -+else ifneq ($(filter -%,$(LLVM)),) -+LLVM_SUFFIX := $(LLVM) -+endif -+ -+CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi -+CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu -+CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl -+CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu -+CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu -+CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu -+CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu -+CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu -+CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu -+CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) -+ -+ifeq ($(CROSS_COMPILE),) -+ifeq ($(CLANG_TARGET_FLAGS),) -+$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk -+else -+CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) -+endif # CLANG_TARGET_FLAGS -+else -+CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) -+endif # CROSS_COMPILE -+ -+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -+else -+CC := $(CROSS_COMPILE)gcc -+endif # LLVM -+ -+CURDIR := $(abspath .) -+TOOLSDIR := $(abspath ..) -+LIBDIR := $(TOOLSDIR)/lib -+BPFDIR := $(LIBDIR)/bpf -+TOOLSINCDIR := $(TOOLSDIR)/include -+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool -+APIDIR := $(TOOLSINCDIR)/uapi -+GENDIR := $(abspath ../../include/generated) -+GENHDR := $(GENDIR)/autoconf.h -+ -+ifeq ($(O),) -+OUTPUT_DIR := $(CURDIR)/build -+else -+OUTPUT_DIR := $(O)/build -+endif # O -+OBJ_DIR := $(OUTPUT_DIR)/obj -+INCLUDE_DIR := $(OUTPUT_DIR)/include -+BPFOBJ_DIR := $(OBJ_DIR)/libbpf -+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext -+BINDIR := $(OUTPUT_DIR)/bin -+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a -+ifneq ($(CROSS_COMPILE),) -+HOST_BUILD_DIR := $(OBJ_DIR)/host -+HOST_OUTPUT_DIR := host-tools -+HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include -+else -+HOST_BUILD_DIR := $(OBJ_DIR) -+HOST_OUTPUT_DIR := $(OUTPUT_DIR) -+HOST_INCLUDE_DIR := $(INCLUDE_DIR) -+endif -+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a -+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids -+DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool -+ -+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ -+ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ -+ ../../vmlinux \ -+ /sys/kernel/btf/vmlinux \ -+ /boot/vmlinux-$(shell uname -r) -+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -+ifeq ($(VMLINUX_BTF),) -+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -+endif -+ -+BPFTOOL ?= $(DEFAULT_BPFTOOL) -+ -+ifneq ($(wildcard $(GENHDR)),) -+ GENFLAGS := -DHAVE_GENHDR -+endif -+ -+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -+ -I$(TOOLSINCDIR) -I$(APIDIR) -+ -+CARGOFLAGS := --release --target-dir $(OUTPUT_DIR) -+ifneq ($(CARGO_OFFLINE),) -+CARGOFLAGS += --offline -+endif -+ -+# Silence some warnings when compiled with clang -+ifneq ($(LLVM),) -+CFLAGS += -Wno-unused-command-line-argument -+endif -+ -+LDFLAGS = -lelf -lz -lpthread -+ -+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ -+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -+$(shell $(1) -dM -E - $@ -+else -+ $(call msg,CP,,$@) -+ $(Q)cp "$(VMLINUX_H)" $@ -+endif -+ -+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h \ -+ user_exit_info.h ravg.bpf.h ravg_impl.bpf.h \ -+ | $(BPFOBJ) $(SCXOBJ_DIR) -+ $(call 
msg,CLNG-BPF,,$(notdir $@)) -+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -+ -+$(INCLUDE_DIR)/%.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) -+ $(eval sched=$(notdir $@)) -+ $(call msg,GEN-SKEL,,$(sched)) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) -+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) -+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $@ -+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -+ -+SCX_COMMON_DEPS := scx_common.h user_exit_info.h | $(BINDIR) -+ -+################ -+# C schedulers # -+################ -+c-sched-targets = scx_simple scx_qmap scx_central scx_pair scx_flatcg \ -+ scx_userland scx_nest -+ -+$(addprefix $(BINDIR)/,$(c-sched-targets)): \ -+ $(BINDIR)/%: \ -+ $(filter-out %.bpf.c,%.c) \ -+ $(INCLUDE_DIR)/%.skel.h \ -+ $(SCX_COMMON_DEPS) -+ $(eval sched=$(notdir $@)) -+ $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o -+ $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) -+$(c-sched-targets): %: $(BINDIR)/% -+ -+ -+################### -+# Rust schedulers # -+################### -+rust-sched-targets := scx_rusty scx_layered -+ -+# Separate build target that is available for build systems to use to fetch -+# dependencies in a separate step from building. This allows the scheduler -+# to be compiled without network access. -+# -+# If the regular rust scheduler Make target (e.g. scx_rusty) is invoked without -+# CARGO_OFFLINE=1 (e.g. if building locally), then cargo build will download -+# all of the necessary dependencies, and the deps target can be skipped. -+$(addsuffix _deps,$(rust-sched-targets)): -+ $(eval sched=$(@:_deps=)) -+ $(Q)cargo fetch --manifest-path=$(sched)/Cargo.toml -+ -+$(rust-sched-targets): %: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) -+ $(eval export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR)) -+ $(eval export SCX_RUST_CLANG = $(CLANG)) -+ $(eval export SCX_RUST_BPF_CFLAGS= $(BPF_CFLAGS)) -+ $(eval sched=$(notdir $@)) -+ $(Q)cargo build --manifest-path=$(sched)/Cargo.toml $(CARGOFLAGS) -+ $(Q)cp $(OUTPUT_DIR)/release/$(sched) $(BINDIR)/$@ -+ -+install: all -+ $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ -+ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ -+ -+clean: -+ $(foreach sched,$(rust-sched-targets),cargo clean --manifest-path=$(sched)/Cargo.toml;) -+ rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) -+ rm -f *.o *.bpf.o *.skel.h *.subskel.h -+ rm -f $(c-sched-targets) -+ -+help: -+ @echo 'Building targets' -+ @echo '================' -+ @echo '' -+ @echo ' all - Compile all schedulers' -+ @echo '' -+ @echo 'Alternatively, you may compile individual schedulers:' -+ @echo '' -+ @printf ' %s\n' $(c-sched-targets) $(rust-sched-targets) -+ @echo '' -+ @echo 'For any scheduler build target, you may specify an alternative' -+ @echo 'build output path with the O= environment variable. For example:' -+ @echo '' -+ @echo ' O=/tmp/sched_ext make all' -+ @echo '' -+ @echo 'will compile all schedulers, and emit the build artifacts to' -+ @echo '/tmp/sched_ext/build.' 
-+ @echo '' -+ @echo '' -+ @echo 'Rust scheduler targets' -+ @echo '======================' -+ @echo '' -+ @printf ' %s\n' $(rust-sched-targets) -+ @printf ' %s_deps\n' $(rust-sched-targets) -+ @echo '' -+ @echo 'For any rust schedulers built with cargo, you can specify' -+ @echo 'CARGO_OFFLINE=1 to ensure the build portion does not access the' -+ @echo 'network (e.g. if the scheduler is being packaged).' -+ @echo '' -+ @echo 'For such use cases, the build workflow will look something like this:' -+ @echo '' -+ @echo ' make scx_rusty_deps' -+ @echo ' CARGO_OFFLINE=1 make scx_rusty' -+ @echo '' -+ @echo 'If network access during build is allowed, you can just make scx_rusty' -+ @echo 'directly without CARGO_OFFLINE, and dependencies will be downloaded' -+ @echo 'during the build step.' -+ @echo '' -+ @echo '' -+ @echo 'Installing targets' -+ @echo '==================' -+ @echo '' -+ @echo ' install - Compile and install all schedulers to /usr/bin.' -+ @echo ' You may specify the DESTDIR= environment variable' -+ @echo ' to indicate a prefix for /usr/bin. For example:' -+ @echo '' -+ @echo ' DESTDIR=/tmp/sched_ext make install' -+ @echo '' -+ @echo ' will build the schedulers in CWD/build, and' -+ @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' -+ @echo '' -+ @echo '' -+ @echo 'Cleaning targets' -+ @echo '================' -+ @echo '' -+ @echo ' clean - Remove all generated files, including intermediate' -+ @echo ' rust files for rust schedulers.' -+ -+all_targets: $(c-sched-targets) $(rust-sched-targets) -+ -+.PHONY: all all_targets $(c-sched-targets) $(rust-sched-targets) clean help -+ -+# delete failed targets -+.DELETE_ON_ERROR: -+ -+# keep intermediate (.skel.h, .bpf.o, etc) targets -+.SECONDARY: -diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md -new file mode 100644 -index 000000000..8e7194ada ---- /dev/null -+++ b/tools/sched_ext/README.md -@@ -0,0 +1,403 @@ -+SCHED_EXT EXAMPLE SCHEDULERS -+============================ -+ -+# Introduction -+ -+This directory contains a number of example sched_ext schedulers. These -+schedulers are meant to provide examples of different types of schedulers -+that can be built using sched_ext, and illustrate how various features of -+sched_ext can be used. -+ -+Some of the examples are performant, production-ready schedulers. That is, for -+the correct workload and with the correct tuning, they may be deployed in a -+production environment with acceptable or possibly even improved performance. -+Others are just examples that in practice, would not provide acceptable -+performance (though they could be improved to get there). -+ -+This README will describe these example schedulers, including describing the -+types of workloads or scenarios they're designed to accommodate, and whether or -+not they're production ready. For more details on any of these schedulers, -+please see the header comment in their .bpf.c file. -+ -+ -+# Compiling the examples -+ -+There are a few toolchain dependencies for compiling the example schedulers. -+ -+## Toolchain dependencies -+ -+1. clang >= 16.0.0 -+ -+The schedulers are BPF programs, and therefore must be compiled with clang. gcc -+is actively working on adding a BPF backend compiler as well, but are still -+missing some features such as BTF type tags which are necessary for using -+kptrs. -+ -+2. pahole >= 1.25 -+ -+You may need pahole in order to generate BTF from DWARF. -+ -+3. rust >= 1.70.0 -+ -+Rust schedulers uses features present in the rust toolchain >= 1.70.0. 
You
-+should be able to use the stable build from rustup, but if that doesn't
-+work, try using the rustup nightly build.
-+
-+There are other requirements as well, such as make, but these are the main /
-+non-trivial ones.
-+
-+## Compiling the kernel
-+
-+In order to run a sched_ext scheduler, you'll have to run a kernel compiled
-+with the patches in this repository, and with a minimum set of necessary
-+Kconfig options:
-+
-+```
-+CONFIG_BPF=y
-+CONFIG_SCHED_CLASS_EXT=y
-+CONFIG_BPF_SYSCALL=y
-+CONFIG_BPF_JIT=y
-+CONFIG_DEBUG_INFO_BTF=y
-+```
-+
-+It's also recommended that you include the following Kconfig options:
-+
-+```
-+CONFIG_BPF_JIT_ALWAYS_ON=y
-+CONFIG_BPF_JIT_DEFAULT_ON=y
-+CONFIG_PAHOLE_HAS_SPLIT_BTF=y
-+CONFIG_PAHOLE_HAS_BTF_TAG=y
-+```
-+
-+There is a `Kconfig` file in this directory whose contents you can append to
-+your local `.config` file, as long as there are no conflicts with any existing
-+options in the file.
-+
-+## Getting a vmlinux.h file
-+
-+You may notice that most of the example schedulers include a "vmlinux.h" file.
-+This is a large, auto-generated header file that contains all of the types
-+defined in some vmlinux binary that was compiled with
-+[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig
-+options specified above).
-+
-+The header file is created using `bpftool`, by passing it a vmlinux binary
-+compiled with BTF as follows:
-+
-+```bash
-+$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h
-+```
-+
-+`bpftool` analyzes all of the BTF encodings in the binary, and produces a
-+header file that can be included by BPF programs to access those types. For
-+example, using vmlinux.h allows a scheduler to access fields defined directly
-+in vmlinux as follows:
-+
-+```c
-+#include "vmlinux.h"
-+// vmlinux.h is also implicitly included by scx_common.bpf.h.
-+#include "scx_common.bpf.h"
-+
-+/*
-+ * vmlinux.h provides definitions for struct task_struct and
-+ * struct scx_enable_args.
-+ */
-+void BPF_STRUCT_OPS(example_enable, struct task_struct *p,
-+		    struct scx_enable_args *args)
-+{
-+	bpf_printk("Task %s enabled in example scheduler", p->comm);
-+}
-+
-+// vmlinux.h provides the definition for struct sched_ext_ops.
-+SEC(".struct_ops.link")
-+struct sched_ext_ops example_ops = {
-+	.enable	= (void *)example_enable,
-+	.name	= "example",
-+};
-+```
-+
-+The scheduler build system will generate this vmlinux.h file as part of the
-+scheduler build pipeline. It looks for a vmlinux file in the following
-+dependency order:
-+
-+1. If the O= environment variable is defined, at `$O/vmlinux`
-+2. If the KBUILD_OUTPUT= environment variable is defined, at
-+   `$KBUILD_OUTPUT/vmlinux`
-+3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're
-+   compiling the schedulers)
-+4. `/sys/kernel/btf/vmlinux`
-+5. `/boot/vmlinux-$(uname -r)`
-+
-+In other words, if you have compiled a kernel in your local repo, its vmlinux
-+file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of
-+the kernel you're currently running on. This means that if you're running on a
-+kernel with sched_ext support, you may not need to compile a local kernel at
-+all.
-+
-+### Aside on CO-RE
-+
-+One of the cooler features of BPF is that it supports
-+[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run
-+Everywhere).
This feature allows you to reference fields inside of structs with -+types defined internal to the kernel, and not have to recompile if you load the -+BPF program on a different kernel with the field at a different offset. In our -+example above, we print out a task name with `p->comm`. CO-RE would perform -+relocations for that access when the program is loaded to ensure that it's -+referencing the correct offset for the currently running kernel. -+ -+## Compiling the schedulers -+ -+Once you have your toolchain setup, and a vmlinux that can be used to generate -+a full vmlinux.h file, you can compile the schedulers using `make`: -+ -+```bash -+$ make -j($nproc) -+``` -+ -+# Schedulers -+ -+This section lists, in alphabetical order, all of the current example -+schedulers. -+ -+-------------------------------------------------------------------------------- -+ -+## scx_simple -+ -+### Overview -+ -+A simple scheduler that provides an example of a minimal sched_ext -+scheduler. scx_simple can be run in either global weighted vtime mode, or -+FIFO mode. -+ -+### Typical Use Case -+ -+Though very simple, this scheduler should perform reasonably well on -+single-socket CPUs with a uniform L3 cache topology. Note that while running in -+global FIFO mode may work well for some workloads, saturating threads can -+easily drown out inactive ones. -+ -+### Production Ready? -+ -+This scheduler could be used in a production environment, assuming the hardware -+constraints enumerated above, and assuming the workload can accommodate a -+simple scheduling policy. -+ -+-------------------------------------------------------------------------------- -+ -+## scx_qmap -+ -+### Overview -+ -+Another simple, yet slightly more complex scheduler that provides an example of -+a basic weighted FIFO queuing policy. It also provides examples of some common -+useful BPF features, such as sleepable per-task storage allocation in the -+`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to -+enqueue tasks. It also illustrates how core-sched support could be implemented. -+ -+### Typical Use Case -+ -+Purely used to illustrate sched_ext features. -+ -+### Production Ready? -+ -+No -+ -+-------------------------------------------------------------------------------- -+ -+## scx_central -+ -+### Overview -+ -+A "central" scheduler where scheduling decisions are made from a single CPU. -+This scheduler illustrates how scheduling decisions can be dispatched from a -+single CPU, allowing other cores to run with infinite slices, without timer -+ticks, and without having to incur the overhead of making scheduling decisions. -+ -+### Typical Use Case -+ -+This scheduler could theoretically be useful for any workload that benefits -+from minimizing scheduling overhead and timer ticks. An example of where this -+could be particularly useful is running VMs, where running with infinite slices -+and no timer ticks allows the VM to avoid unnecessary expensive vmexits. -+ -+### Production Ready? -+ -+Not yet. While tasks are run with an infinite slice (SCX_SLICE_INF), they're -+preempted every 20ms in a timer callback. The scheduler also puts the core -+schedling logic inside of the central / scheduling CPU's ops.dispatch() path, -+and does not yet have any kind of priority mechanism. 
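A hedged sketch of the kick described in the scx_central overview above: a worker CPU with nothing to run nudges the central CPU so it makes the next round of decisions. `scx_bpf_kick_cpu()` and `SCX_KICK_PREEMPT` are the kfunc and flag added by this patch; the `central_cpu` variable and the `ops.dispatch()` signature are assumptions for illustration only.

```c
/* Hypothetical fragment of a central-style ops.dispatch(): non-central CPUs
 * do not make decisions themselves, they just kick the central CPU. */
static s32 central_cpu;	/* assumed to be set from userspace at load time */

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	if (cpu != central_cpu) {
		scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT);
		return;
	}
	/* the central CPU would pull tasks from the shared queue here */
}
```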
-+ -+-------------------------------------------------------------------------------- -+ -+## scx_pair -+ -+### Overview -+ -+A sibling scheduler which ensures that tasks will only ever be co-located on a -+physical core if they're in the same cgroup. It illustrates how a scheduling -+policy could be implemented to mitigate CPU bugs, such as L1TF, and also shows -+how some useful kfuncs such as `scx_bpf_kick_cpu()` can be utilized. -+ -+### Typical Use Case -+ -+While this scheduler is only meant to be used to illustrate certain sched_ext -+features, with a bit more work (e.g. by adding some form of priority handling -+inside and across cgroups), it could have been used as a way to quickly -+mitigate L1TF before core scheduling was implemented and rolled out. -+ -+### Production Ready? -+ -+No -+ -+-------------------------------------------------------------------------------- -+ -+## scx_flatcg -+ -+### Overview -+ -+A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical -+weight-based cgroup CPU control by flattening the cgroup hierarchy into a -+single layer, by compounding the active weight share at each level. The effect -+of this is a much more performant CPU controller, which does not need to -+descend down cgroup trees in order to properly compute a cgroup's share. -+ -+### Typical Use Case -+ -+This scheduler could be useful for any typical workload requiring a CPU -+controller, but which cannot tolerate the higher overheads of the fair CPU -+controller. -+ -+### Production Ready? -+ -+Yes, though the scheduler (currently) does not adequately accommodate -+thundering herds of cgroups. If, for example, many cgroups which are nested -+behind a low-priority cgroup were to wake up around the same time, they may be -+able to consume more CPU cycles than they are entitled to. -+ -+-------------------------------------------------------------------------------- -+ -+## scx_userland -+ -+### Overview -+ -+A simple weighted vtime scheduler where all scheduling decisions take place in -+user space. This is in contrast to Rusty, where load balancing lives in user -+space, but scheduling decisions are still made in the kernel. -+ -+### Typical Use Case -+ -+There are many advantages to writing schedulers in user space. For example, you -+can use a debugger, you can write the scheduler in Rust, and you can use data -+structures bundled with your favorite library. -+ -+On the other hand, user space scheduling can be hard to get right. You can -+potentially deadlock due to not scheduling a task that's required for the -+scheduler itself to make forward progress (though the sched_ext watchdog will -+protect the system by unloading your scheduler after a timeout if that -+happens). You also have to bootstrap some communication protocol between the -+kernel and user space. -+ -+A more robust solution to this would be building a user space scheduling -+framework that abstracts much of this complexity away from you. -+ -+### Production Ready? -+ -+No. This scheduler uses an ordered list for vtime scheduling, and is stricly -+less performant than just using something like `scx_simple`. It is purely -+meant to illustrate that it's possible to build a user space scheduler on -+top of sched_ext. -+ -+-------------------------------------------------------------------------------- -+ -+## scx_rusty -+ -+### Overview -+ -+A multi-domain, BPF / user space hybrid scheduler. 
The BPF portion of the -+scheduler does a simple round robin in each domain, and the user space portion -+(written in Rust) calculates the load factor of each domain, and informs BPF of -+how tasks should be load balanced accordingly. -+ -+### Typical Use Case -+ -+Rusty is designed to be flexible, and accommodate different architectures and -+workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), -+as well as how Rusty should partition the system into scheduling domains, can -+be tuned to achieve the optimal configuration for any given system or workload. -+ -+### Production Ready? -+ -+Yes. If tuned correctly, rusty should be performant across various CPU -+architectures and workloads. Rusty by default creates a separate scheduling -+domain per-LLC, so its default configuration may be performant as well. -+ -+That said, you may run into an issue with infeasible weights, where a task with -+a very high weight may cause the scheduler to incorrectly leave cores idle -+because it thinks they're necessary to accommodate the compute for a single -+task. This can also happen in CFS, and should soon be addressed for rusty. -+ -+-------------------------------------------------------------------------------- -+ -+# Troubleshooting -+ -+There are a number of common issues that you may run into when building the -+schedulers. We'll go over some of the common ones here. -+ -+## Build Failures -+ -+### Old version of clang -+ -+``` -+error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ ^~~~~~~~~~~~~~~~~~~~ -+1 error generated. -+``` -+ -+This means you built the kernel or the schedulers with an older version of -+clang than what's supported (i.e. older than 16.0.0). To remediate this: -+ -+1. `which clang` to make sure you're using a sufficiently new version of clang. -+ -+2. `make fullclean` in the root path of the repository, and rebuild the kernel -+ and schedulers. -+ -+3. Rebuild the kernel, and then your example schedulers. -+ -+The schedulers are also cleaned if you invoke `make mrproper` in the root -+directory of the tree. -+ -+### Stale kernel build / incomplete vmlinux.h file -+ -+As described above, you'll need a `vmlinux.h` file that was generated from a -+vmlinux built with BTF, and with sched_ext support enabled. If you don't, -+you'll see errors such as the following which indicate that a type being -+referenced in a scheduler is unknown: -+ -+``` -+/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' -+ -+const struct scx_exit_info *ei) -+ -+^ -+``` -+ -+In order to resolve this, please follow the steps above in -+[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your -+schedulers are using a vmlinux.h file that includes the requisite types. -+ -+## Misc -+ -+### llvm: [OFF] -+ -+You may see the following output when building the schedulers: -+ -+``` -+Auto-detecting system features: -+... clang-bpf-co-re: [ on ] -+... llvm: [ OFF ] -+... libcap: [ on ] -+... libbfd: [ on ] -+``` -+ -+Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. 
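To tie the build pipeline described above together, here is a minimal, hypothetical userspace loader for one of the generated skeletons. The `scx_simple__*()` functions follow libbpf's skeleton naming convention and `maps.simple_ops` is an assumed name for the struct_ops map; error handling in the real tools is more thorough.

```c
#include <bpf/libbpf.h>
#include "scx_simple.skel.h"

int main(void)
{
	struct scx_simple *skel = scx_simple__open_and_load();
	struct bpf_link *link;

	if (!skel)
		return 1;

	/* attaching the struct_ops map is what actually enables the scheduler */
	link = bpf_map__attach_struct_ops(skel->maps.simple_ops);
	if (!link) {
		scx_simple__destroy(skel);
		return 1;
	}

	/* ... run until told to stop ... */

	bpf_link__destroy(link);
	scx_simple__destroy(skel);
	return 0;
}
```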
-diff --git a/tools/sched_ext/gnu/stubs.h b/tools/sched_ext/gnu/stubs.h -new file mode 100644 -index 000000000..719225b16 ---- /dev/null -+++ b/tools/sched_ext/gnu/stubs.h -@@ -0,0 +1 @@ -+/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */ -diff --git a/tools/sched_ext/ravg.bpf.h b/tools/sched_ext/ravg.bpf.h -new file mode 100644 -index 000000000..a233d85d0 ---- /dev/null -+++ b/tools/sched_ext/ravg.bpf.h -@@ -0,0 +1,42 @@ -+#ifndef __SCX_RAVG_BPF_H__ -+#define __SCX_RAVG_BPF_H__ -+ -+/* -+ * Running average helpers to be used in BPF progs. Assumes vmlinux.h has -+ * already been included. -+ */ -+enum ravg_consts { -+ RAVG_VAL_BITS = 44, /* input values are 44bit */ -+ RAVG_FRAC_BITS = 20, /* 1048576 is 1.0 */ -+}; -+ -+/* -+ * Running avg mechanism. Accumulates values between 0 and RAVG_MAX_VAL in -+ * arbitrary time intervals. The accumulated values are halved every half_life -+ * with each period starting when the current time % half_life is 0. Zeroing is -+ * enough for initialization. -+ * -+ * See ravg_accumulate() and ravg_read() for more details. -+ */ -+struct ravg_data { -+ /* current value */ -+ u64 val; -+ -+ /* -+ * The timestamp of @val. The latest completed seq #: -+ * -+ * (val_at / half_life) - 1 -+ */ -+ u64 val_at; -+ -+ /* running avg as of the latest completed seq */ -+ u64 old; -+ -+ /* -+ * Accumulated value of the current period. Input value is 48bits and we -+ * normalize half-life to 16bit, so it should fit in an u64. -+ */ -+ u64 cur; -+}; -+ -+#endif /* __SCX_RAVG_BPF_H__ */ -diff --git a/tools/sched_ext/ravg_impl.bpf.h b/tools/sched_ext/ravg_impl.bpf.h -new file mode 100644 -index 000000000..4922a3e68 ---- /dev/null -+++ b/tools/sched_ext/ravg_impl.bpf.h -@@ -0,0 +1,358 @@ -+/* to be included in the main bpf.c file */ -+#include "ravg.bpf.h" -+ -+#define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) -+ -+static RAVG_FN_ATTRS void ravg_add(u64 *sum, u64 addend) -+{ -+ u64 new = *sum + addend; -+ -+ if (new >= *sum) -+ *sum = new; -+ else -+ *sum = -1; -+} -+ -+static RAVG_FN_ATTRS u64 ravg_decay(u64 v, u32 shift) -+{ -+ if (shift >= 64) -+ return 0; -+ else -+ return v >> shift; -+} -+ -+static RAVG_FN_ATTRS u32 ravg_normalize_dur(u32 dur, u32 half_life) -+{ -+ if (dur < half_life) -+ return (((u64)dur << RAVG_FRAC_BITS) + half_life - 1) / -+ half_life; -+ else -+ return 1 << RAVG_FRAC_BITS; -+} -+ -+/* -+ * Pre-computed decayed full-period values. This is quicker and keeps the bpf -+ * verifier happy by removing the need for looping. -+ * -+ * [0] = ravg_decay(1 << RAVG_FRAC_BITS, 1) -+ * [1] = [0] + ravg_decay(1 << RAVG_FRAC_BITS, 2) -+ * [2] = [1] + ravg_decay(1 << RAVG_FRAC_BITS, 3) -+ * ... -+ */ -+static u64 ravg_full_sum[] = { -+ 524288, 786432, 917504, 983040, -+ 1015808, 1032192, 1040384, 1044480, -+ 1046528, 1047552, 1048064, 1048320, -+ 1048448, 1048512, 1048544, 1048560, -+ 1048568, 1048572, 1048574, 1048575, -+ /* the same from here on */ -+}; -+ -+static const int ravg_full_sum_len = sizeof(ravg_full_sum) / sizeof(ravg_full_sum[0]); -+ -+/** -+ * ravg_accumulate - Accumulate a new value -+ * @rd: ravg_data to accumulate into -+ * @new_val: new value -+ * @now: current timestamp -+ * @half_life: decay period, must be the same across calls -+ * -+ * The current value is changing to @val at @now. Accumulate accordingly. 
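The precomputed ravg_full_sum[] table above is just the cumulative sum of one full-period value decayed by successive half-lives in RAVG_FRAC_BITS fixed point (1.0 == 1048576); a standalone userspace check of the first few entries:

```c
#include <assert.h>
#include <stdint.h>

#define RAVG_FRAC_BITS 20

int main(void)
{
	const uint64_t expect[] = { 524288, 786432, 917504, 983040 };
	uint64_t sum = 0, one = 1ULL << RAVG_FRAC_BITS;

	for (int n = 1; n <= 4; n++) {
		sum += one >> n;	/* contribution of a period n half-lives old */
		assert(sum == expect[n - 1]);
	}
	return 0;
}
```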
-+ */ -+static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, u64 new_val, u64 now, -+ u32 half_life) -+{ -+ u32 cur_seq, val_seq, seq_delta; -+ -+ /* -+ * It may be difficult for the caller to guarantee monotonic progress if -+ * multiple CPUs accumulate to the same ravg_data. Handle @now being in -+ * the past of @rd->val_at. -+ */ -+ if (now < rd->val_at) -+ now = rd->val_at; -+ -+ cur_seq = now / half_life; -+ val_seq = rd->val_at / half_life; -+ seq_delta = cur_seq - val_seq; -+ -+ /* -+ * Decay ->old and fold ->cur into it. -+ * -+ * @end -+ * v -+ * timeline |---------|---------|---------|---------|---------| -+ * seq delta 4 3 2 1 0 -+ * seq ->seq cur_seq -+ * val ->old ->cur ^ -+ * | | | -+ * \---------+------------------/ -+ */ -+ if (seq_delta > 0) { -+ /* decay ->old to bring it upto the cur_seq - 1 */ -+ rd->old = ravg_decay(rd->old, seq_delta); -+ /* non-zero ->cur must be from val_seq, calc and fold */ -+ ravg_add(&rd->old, ravg_decay(rd->cur, seq_delta)); -+ /* clear */ -+ rd->cur = 0; -+ } -+ -+ if (!rd->val) -+ goto out; -+ -+ /* -+ * Accumulate @rd->val between @rd->val_at and @now. -+ * -+ * @rd->val_at @now -+ * v v -+ * timeline |---------|---------|---------|---------|---------| -+ * seq delta [ 3 | 2 | 1 | 0 ] -+ */ -+ if (seq_delta > 0) { -+ u32 dur; -+ -+ /* fold the oldest period which may be partial */ -+ dur = ravg_normalize_dur(half_life - rd->val_at % half_life, half_life); -+ ravg_add(&rd->old, rd->val * ravg_decay(dur, seq_delta)); -+ -+ /* fold the full periods in the middle with precomputed vals */ -+ if (seq_delta > 1) { -+ u32 idx = seq_delta - 2; -+ -+ if (idx >= ravg_full_sum_len) -+ idx = ravg_full_sum_len - 1; -+ -+ ravg_add(&rd->old, rd->val * ravg_full_sum[idx]); -+ } -+ -+ /* accumulate the current period duration into ->cur */ -+ rd->cur += rd->val * ravg_normalize_dur(now % half_life, -+ half_life); -+ } else { -+ rd->cur += rd->val * ravg_normalize_dur(now - rd->val_at, -+ half_life); -+ } -+out: -+ if (new_val >= 1LLU << RAVG_VAL_BITS) -+ rd->val = (1LLU << RAVG_VAL_BITS) - 1; -+ else -+ rd->val = new_val; -+ rd->val_at = now; -+} -+ -+/** -+ * ravg_transfer - Transfer in or out a component running avg -+ * @base: ravg_data to transfer @xfer into or out of -+ * @base_new_val: new value for @base -+ * @xfer: ravg_data to transfer -+ * @xfer_new_val: new value for @xfer -+ * @is_xfer_in: transfer direction -+ * -+ * An ravg may be a sum of component ravgs. For example, a scheduling domain's -+ * load is the sum of the load values of all member tasks. If a task is migrated -+ * to a different domain, its contribution should be subtracted from the source -+ * ravg and added to the destination one. -+ * -+ * This function can be used for such component transfers. Both @base and @xfer -+ * must have been accumulated at the same timestamp. @xfer's contribution is -+ * subtracted if @is_fer_in is %false and added if %true. 
-+ */ -+static RAVG_FN_ATTRS void ravg_transfer(struct ravg_data *base, u64 base_new_val, -+ struct ravg_data *xfer, u64 xfer_new_val, -+ u32 half_life, bool is_xfer_in) -+{ -+ /* synchronize @base and @xfer */ -+ if ((s64)(base->val_at - xfer->val_at) < 0) -+ ravg_accumulate(base, base_new_val, xfer->val_at, half_life); -+ else if ((s64)(base->val_at - xfer->val_at) > 0) -+ ravg_accumulate(xfer, xfer_new_val, base->val_at, half_life); -+ -+ /* transfer */ -+ if (is_xfer_in) { -+ base->old += xfer->old; -+ base->cur += xfer->cur; -+ } else { -+ if (base->old > xfer->old) -+ base->old -= xfer->old; -+ else -+ base->old = 0; -+ -+ if (base->cur > xfer->cur) -+ base->cur -= xfer->cur; -+ else -+ base->cur = 0; -+ } -+} -+ -+/** -+ * u64_x_u32_rshift - Calculate ((u64 * u32) >> rshift) -+ * @a: multiplicand -+ * @b: multiplier -+ * @rshift: number of bits to shift right -+ * -+ * Poor man's 128bit arithmetic. Calculate ((@a * @b) >> @rshift) where @a is -+ * u64 and @b is u32 and (@a * @b) may be bigger than #U64_MAX. The caller must -+ * ensure that the final shifted result fits in u64. -+ */ -+static inline __attribute__((always_inline)) -+u64 u64_x_u32_rshift(u64 a, u32 b, u32 rshift) -+{ -+ const u64 mask32 = (u32)-1; -+ u64 al = a & mask32; -+ u64 ah = (a & (mask32 << 32)) >> 32; -+ -+ /* -+ * ah: high 32 al: low 32 -+ * a |--------------||--------------| -+ * -+ * ah * b |--------------||--------------| -+ * al * b |--------------||--------------| -+ */ -+ al *= b; -+ ah *= b; -+ -+ /* -+ * (ah * b) >> rshift |--------------||--------------| -+ * (al * b) >> rshift |--------------||--------| -+ * <--------> -+ * 32 - rshift -+ */ -+ al >>= rshift; -+ if (rshift <= 32) -+ ah <<= 32 - rshift; -+ else -+ ah >>= rshift - 32; -+ -+ return al + ah; -+} -+ -+/** -+ * ravg_scale - Scale a running avg -+ * @rd: ravg_data to scale -+ * @mult: multipler -+ * @rshift: right shift amount -+ * -+ * Scale @rd by multiplying the tracked values by @mult and shifting right by -+ * @rshift. -+ */ -+static RAVG_FN_ATTRS void ravg_scale(struct ravg_data *rd, u32 mult, u32 rshift) -+{ -+ rd->val = u64_x_u32_rshift(rd->val, mult, rshift); -+ rd->old = u64_x_u32_rshift(rd->old, mult, rshift); -+ rd->cur = u64_x_u32_rshift(rd->cur, mult, rshift); -+} -+ -+/** -+ * ravg_read - Read the current running avg -+ * @rd: ravg_data to read from -+ * @now: timestamp as of which to read the running avg -+ * @half_life: decay period, must match ravg_accumulate()'s -+ * -+ * Read running avg from @rd as of @now. -+ */ -+static RAVG_FN_ATTRS u64 ravg_read(struct ravg_data *rd, u64 now, u64 half_life) -+{ -+ struct ravg_data trd; -+ u32 elapsed; -+ -+ /* -+ * It may be difficult for the caller to guarantee monotonic progress if -+ * multiple CPUs accumulate to the same ravg_data. Handle @now being in -+ * the past of @rd->val_at. -+ */ -+ if (now < rd->val_at) -+ now = rd->val_at; -+ -+ elapsed = now % half_life; -+ -+ /* -+ * Accumulate the ongoing period into a temporary copy. This allows -+ * external readers to access up-to-date avg without strongly -+ * synchronizing with the updater (we need to add a seq lock tho). -+ */ -+ trd = *rd; -+ rd = &trd; -+ ravg_accumulate(rd, 0, now, half_life); -+ -+ /* -+ * At the beginning of a new half_life period, the running avg is the -+ * same as @rd->old. At the beginning of the next, it'd be old load / 2 -+ * + current load / 2. Inbetween, we blend the two linearly. 
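u64_x_u32_rshift() above splits the 64-bit multiplicand so that neither partial product can overflow before the shift; for the shifts used here (rshift <= 32) the result matches full 128-bit arithmetic exactly. A standalone userspace sanity sketch (unsigned __int128 is a GCC/Clang extension):

```c
#include <assert.h>
#include <stdint.h>

static uint64_t u64_x_u32_rshift(uint64_t a, uint32_t b, uint32_t rshift)
{
	uint64_t al = (a & 0xffffffffULL) * b;	/* low 32 bits times b */
	uint64_t ah = (a >> 32) * b;		/* high 32 bits times b */

	al >>= rshift;
	if (rshift <= 32)
		ah <<= 32 - rshift;
	else
		ah >>= rshift - 32;
	return al + ah;
}

int main(void)
{
	uint64_t a = (1ULL << 44) - 1;	/* max 44-bit ravg value */
	uint32_t b = 1u << 20;		/* RAVG_FRAC_BITS scale factor */

	/* the full product overflows u64, the shifted result does not */
	assert(u64_x_u32_rshift(a, b, 20) ==
	       (uint64_t)(((unsigned __int128)a * b) >> 20));
	return 0;
}
```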
-+ */ -+ if (elapsed) { -+ u32 progress = ravg_normalize_dur(elapsed, half_life); -+ /* -+ * `H` is the duration of the half-life window, and `E` is how -+ * much time has elapsed in this window. `P` is [0.0, 1.0] -+ * representing how much the current window has progressed: -+ * -+ * P = E / H -+ * -+ * If `old` is @rd->old, we would want to calculate the -+ * following for blending: -+ * -+ * old * (1.0 - P / 2) -+ * -+ * Because @progress is [0, 1 << RAVG_FRAC_BITS], let's multiply -+ * and then divide by 1 << RAVG_FRAC_BITS: -+ * -+ * (1 << RAVG_FRAC_BITS) - (1 << RAVG_FRAC_BITS) * P / 2 -+ * old * ----------------------------------------------------- -+ * 1 << RAVG_FRAC_BITS -+ * -+ * As @progress is (1 << RAVG_FRAC_BITS) * P: -+ * -+ * (1 << RAVG_FRAC_BITS) - progress / 2 -+ * old * ------------------------------------ -+ * 1 << RAVG_FRAC_BITS -+ * -+ * As @rd->old uses full 64bit, the multiplication can overflow, -+ * but we also know that the final result is gonna be smaller -+ * than @rd->old and thus fit. Use u64_x_u32_rshift() to handle -+ * the interim multiplication correctly. -+ */ -+ u64 old = u64_x_u32_rshift(rd->old, -+ (1 << RAVG_FRAC_BITS) - progress / 2, -+ RAVG_FRAC_BITS); -+ /* -+ * If `S` is the Sum(val * duration) for this half-life window, -+ * the avg for this window is: -+ * -+ * S / E -+ * -+ * We would want to calculate the following for blending: -+ * -+ * S / E * (P / 2) -+ * -+ * As P = E / H, -+ * -+ * S / E * (E / H / 2) -+ * S / H / 2 -+ * -+ * Expanding S, the above becomes: -+ * -+ * Sum(val * duration) / H / 2 -+ * Sum(val * (duration / H)) / 2 -+ * -+ * As we use RAVG_FRAC_BITS bits for fixed point arithmetic, -+ * let's multiply the whole result accordingly: -+ * -+ * (Sum(val * (duration / H)) / 2) * (1 << RAVG_FRAC_BITS) -+ * -+ * duration * (1 << RAVG_FRAC_BITS) -+ * Sum(val * --------------------------------) / 2 -+ * H -+ * -+ * The righthand multiplier inside Sum() is the normalized -+ * duration returned from ravg_normalize_dur(), so, the whole -+ * Sum term equals @rd->cur. -+ * -+ * rd->cur / 2 -+ */ -+ u64 cur = rd->cur / 2; -+ -+ return old + cur; -+ } else { -+ return rd->old; -+ } -+} -diff --git a/tools/sched_ext/ravg_read.rs.h b/tools/sched_ext/ravg_read.rs.h -new file mode 100644 -index 000000000..4efaa2390 ---- /dev/null -+++ b/tools/sched_ext/ravg_read.rs.h -@@ -0,0 +1,82 @@ -+/// ravg_read() implementation for rust userland. See ravg_read() in -+/// ravg_impl.bpf.h. We don't yet have a good mechanism to share BPF and -+/// matching rust code across multiple schedulers. For now, include both BPF -+/// and rust code from scheduler implementations. -+fn ravg_read( -+ val: u64, -+ val_at: u64, -+ old: u64, -+ cur: u64, -+ now: u64, -+ half_life: u32, -+ frac_bits: u32, -+) -> f64 { -+ let ravg_1: f64 = (1 << frac_bits) as f64; -+ let half_life = half_life as u64; -+ let val = val as f64; -+ let mut old = old as f64 / ravg_1; -+ let mut cur = cur as f64 / ravg_1; -+ -+ let now = now.max(val_at); -+ let normalized_dur = |dur| dur as f64 / half_life as f64; -+ -+ // -+ // The following is f64 implementation of BPF ravg_accumulate(). -+ // -+ let cur_seq = (now / half_life) as i64; -+ let val_seq = (val_at / half_life) as i64; -+ let seq_delta = (cur_seq - val_seq) as i32; -+ -+ if seq_delta > 0 { -+ let full_decay = 2f64.powi(seq_delta); -+ -+ // Decay $old and fold $cur into it. -+ old /= full_decay; -+ old += cur / full_decay; -+ cur = 0.0; -+ -+ // Fold the oldest period whicy may be partial. 
-+ old += val * normalized_dur(half_life - val_at % half_life) / full_decay; -+ -+ // Pre-computed decayed full-period values. -+ const FULL_SUMS: [f64; 20] = [ -+ 0.5, -+ 0.75, -+ 0.875, -+ 0.9375, -+ 0.96875, -+ 0.984375, -+ 0.9921875, -+ 0.99609375, -+ 0.998046875, -+ 0.9990234375, -+ 0.99951171875, -+ 0.999755859375, -+ 0.9998779296875, -+ 0.99993896484375, -+ 0.999969482421875, -+ 0.9999847412109375, -+ 0.9999923706054688, -+ 0.9999961853027344, -+ 0.9999980926513672, -+ 0.9999990463256836, -+ // Use the same value beyond this point. -+ ]; -+ -+ // Fold the full periods in the middle. -+ if seq_delta >= 2 { -+ let idx = ((seq_delta - 2) as usize).min(FULL_SUMS.len() - 1); -+ old += val * FULL_SUMS[idx]; -+ } -+ -+ // Accumulate the current period duration into @cur. -+ cur += val * normalized_dur(now % half_life); -+ } else { -+ cur += val * normalized_dur(now - val_at); -+ } -+ -+ // -+ // The following is the blending part of BPF ravg_read(). -+ // -+ old * (1.0 - normalized_dur(now % half_life) / 2.0) + cur / 2.0 -+} -diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c -new file mode 100644 -index 000000000..890e97e22 ---- /dev/null -+++ b/tools/sched_ext/scx_central.bpf.c -@@ -0,0 +1,346 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A central FIFO sched_ext scheduler which demonstrates the followings: -+ * -+ * a. Making all scheduling decisions from one CPU: -+ * -+ * The central CPU is the only one making scheduling decisions. All other -+ * CPUs kick the central CPU when they run out of tasks to run. -+ * -+ * There is one global BPF queue and the central CPU schedules all CPUs by -+ * dispatching from the global queue to each CPU's local dsq from dispatch(). -+ * This isn't the most straightforward. e.g. It'd be easier to bounce -+ * through per-CPU BPF queues. The current design is chosen to maximally -+ * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. -+ * -+ * b. Tickless operation -+ * -+ * All tasks are dispatched with the infinite slice which allows stopping the -+ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full -+ * parameter. The tickless operation can be observed through -+ * /proc/interrupts. -+ * -+ * Periodic switching is enforced by a periodic timer checking all CPUs and -+ * preempting them as necessary. Unfortunately, BPF timer currently doesn't -+ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to -+ * the central CPU. -+ * -+ * c. Preemption -+ * -+ * Kthreads are unconditionally queued to the head of a matching local dsq -+ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always -+ * prioritized over user threads, which is required for ensuring forward -+ * progress as e.g. the periodic timer may run on a ksoftirqd and if the -+ * ksoftirqd gets starved by a user thread, there may not be anything else to -+ * vacate that user thread. -+ * -+ * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the -+ * next tasks. -+ * -+ * This scheduler is designed to maximize usage of various SCX mechanisms. A -+ * more practical implementation would likely put the scheduling loop outside -+ * the central CPU's dispatch() path and add some form of priority mechanism. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include "scx_common.bpf.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+enum { -+ FALLBACK_DSQ_ID = 0, -+ MS_TO_NS = 1000LLU * 1000, -+ TIMER_INTERVAL_NS = 1 * MS_TO_NS, -+}; -+ -+const volatile bool switch_partial; -+const volatile s32 central_cpu; -+const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+ -+u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; -+u64 nr_overflows; -+ -+struct user_exit_info uei; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, s32); -+} central_q SEC(".maps"); -+ -+/* can't use percpu map due to bad lookups */ -+bool RESIZABLE_ARRAY(data, cpu_gimme_task); -+u64 RESIZABLE_ARRAY(data, cpu_started_at); -+ -+struct central_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct central_timer); -+} central_timer SEC(".maps"); -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* -+ * Steer wakeups to the central CPU as much as possible to avoid -+ * disturbing other CPUs. It's safe to blindly return the central cpu as -+ * select_cpu() is a hint and if @p can't be on it, the kernel will -+ * automatically pick a fallback CPU. -+ */ -+ return central_cpu; -+} -+ -+void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ s32 pid = p->pid; -+ -+ __sync_fetch_and_add(&nr_total, 1); -+ -+ /* -+ * Push per-cpu kthreads at the head of local dsq's and preempt the -+ * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked -+ * behind other threads which is necessary for forward progress -+ * guarantee as we depend on the BPF timer which may run from ksoftirqd. -+ */ -+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { -+ __sync_fetch_and_add(&nr_locals, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, -+ enq_flags | SCX_ENQ_PREEMPT); -+ return; -+ } -+ -+ if (bpf_map_push_elem(¢ral_q, &pid, 0)) { -+ __sync_fetch_and_add(&nr_overflows, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_queued, 1); -+ -+ if (!scx_bpf_task_running(p)) -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+} -+ -+static bool dispatch_to_cpu(s32 cpu) -+{ -+ struct task_struct *p; -+ s32 pid; -+ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (bpf_map_pop_elem(¢ral_q, &pid)) -+ break; -+ -+ __sync_fetch_and_sub(&nr_queued, 1); -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) { -+ __sync_fetch_and_add(&nr_lost_pids, 1); -+ continue; -+ } -+ -+ /* -+ * If we can't run the task at the top, do the dumb thing and -+ * bounce it to the fallback dsq. 
-+ */ -+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ __sync_fetch_and_add(&nr_mismatches, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); -+ bpf_task_release(p); -+ continue; -+ } -+ -+ /* dispatch to local and mark that @cpu doesn't need more */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); -+ -+ if (cpu != central_cpu) -+ scx_bpf_kick_cpu(cpu, 0); -+ -+ bpf_task_release(p); -+ return true; -+ } -+ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ if (cpu == central_cpu) { -+ /* dispatch for all other CPUs first */ -+ __sync_fetch_and_add(&nr_dispatches, 1); -+ -+ bpf_for(cpu, 0, nr_cpu_ids) { -+ bool *gimme; -+ -+ if (!scx_bpf_dispatch_nr_slots()) -+ break; -+ -+ /* central's gimme is never set */ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme && !*gimme) -+ continue; -+ -+ if (dispatch_to_cpu(cpu)) -+ *gimme = false; -+ } -+ -+ /* -+ * Retry if we ran out of dispatch buffer slots as we might have -+ * skipped some CPUs and also need to dispatch for self. The ext -+ * core automatically retries if the local dsq is empty but we -+ * can't rely on that as we're dispatching for other CPUs too. -+ * Kick self explicitly to retry. -+ */ -+ if (!scx_bpf_dispatch_nr_slots()) { -+ __sync_fetch_and_add(&nr_retries, 1); -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ return; -+ } -+ -+ /* look for a task to run on the central CPU */ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ dispatch_to_cpu(central_cpu); -+ } else { -+ bool *gimme; -+ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme) -+ *gimme = true; -+ -+ /* -+ * Force dispatch on the scheduling CPU so that it finds a task -+ * to run for us. -+ */ -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ } -+} -+ -+void BPF_STRUCT_OPS(central_running, struct task_struct *p) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ -+} -+ -+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = 0; -+} -+ -+static int central_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ u64 nr_to_kick = nr_queued; -+ s32 i, curr_cpu; -+ -+ curr_cpu = bpf_get_smp_processor_id(); -+ /* -+ * XXX BACKPORT NOTE - BPF_F_TIMER_CPU_PIN is not available in v6.6 and -+ * we can't guarantee that the central timer runs on the central CPU. 
-+ */ -+ /*if (curr_cpu != central_cpu) { -+ scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", -+ curr_cpu, central_cpu); -+ return 0; -+ }*/ -+ -+ bpf_for(i, 0, nr_cpu_ids) { -+ s32 cpu = (nr_timers + i) % nr_cpu_ids; -+ u64 *started_at; -+ -+ if (cpu == central_cpu) -+ continue; -+ -+ /* kick iff the current one exhausted its slice */ -+ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at && *started_at && -+ vtime_before(now, *started_at + slice_ns)) -+ continue; -+ -+ /* and there's something pending */ -+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || -+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) -+ ; -+ else if (nr_to_kick) -+ nr_to_kick--; -+ else -+ continue; -+ -+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); -+ } -+ -+ bpf_timer_start(timer, TIMER_INTERVAL_NS, 0 /*BPF_F_TIMER_CPU_PIN*/); -+ __sync_fetch_and_add(&nr_timers, 1); -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(central_init) -+{ -+ u32 key = 0; -+ struct bpf_timer *timer; -+ int ret; -+ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ -+ ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); -+ if (ret) -+ return ret; -+ -+ timer = bpf_map_lookup_elem(¢ral_timer, &key); -+ if (!timer) -+ return -ESRCH; -+ -+ if (bpf_get_smp_processor_id() != central_cpu) -+ return -EINVAL; -+ -+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); -+ bpf_timer_set_callback(timer, central_timerfn); -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0 /*BPF_F_TIMER_CPU_PIN*/); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops central_ops = { -+ /* -+ * We are offloading all scheduling decisions to the central CPU and -+ * thus being the last task on a given CPU doesn't mean anything -+ * special. Enqueue the last tasks like any other tasks. -+ */ -+ .flags = SCX_OPS_ENQ_LAST, -+ -+ .select_cpu = (void *)central_select_cpu, -+ .enqueue = (void *)central_enqueue, -+ .dispatch = (void *)central_dispatch, -+ .running = (void *)central_running, -+ .stopping = (void *)central_stopping, -+ .init = (void *)central_init, -+ .exit = (void *)central_exit, -+ .name = "central", -+}; -diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c -new file mode 100644 -index 000000000..1e2985900 ---- /dev/null -+++ b/tools/sched_ext/scx_central.c -@@ -0,0 +1,123 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define _GNU_SOURCE -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_central.skel.h" -+ -+const char help_fmt[] = -+"A central FIFO sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-c CPU] [-p]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -c CPU Override the central CPU (default: 0)\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_central *skel; -+ struct bpf_link *link; -+ __u64 seq = 0; -+ __s32 opt; -+ cpu_set_t *cpuset; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_central__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->central_cpu = 0; -+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:c:ph")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'c': -+ skel->rodata->central_cpu = strtoul(optarg, NULL, 0); -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ /* Resize arrays so their element count is equal to cpu count. */ -+ RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids); -+ -+ SCX_BUG_ON(scx_central__load(skel), "Failed to load skel"); -+ -+ /* -+ * Affinitize the loading thread to the central CPU, as: -+ * - That's where the BPF timer is first invoked in the BPF program. -+ * - We probably don't want this user space component to take up a core -+ * from a task that would benefit from avoiding preemption on one of -+ * the tickless cores. -+ * -+ * Until BPF supports pinning the timer, it's not guaranteed that it -+ * will always be invoked on the central CPU. In practice, this -+ * suffices the majority of the time. 
-+ */ -+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); -+ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); -+ CPU_ZERO(cpuset); -+ CPU_SET(skel->rodata->central_cpu, cpuset); -+ SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), -+ "Failed to affinitize to central CPU %d (max %d)", -+ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); -+ CPU_FREE(cpuset); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.central_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ printf("[SEQ %llu]\n", seq++); -+ printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n", -+ skel->bss->nr_total, -+ skel->bss->nr_locals, -+ skel->bss->nr_queued, -+ skel->bss->nr_lost_pids); -+ printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n", -+ skel->bss->nr_timers, -+ skel->bss->nr_dispatches, -+ skel->bss->nr_mismatches, -+ skel->bss->nr_retries); -+ printf("overflow:%10lu\n", -+ skel->bss->nr_overflows); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_central__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h -new file mode 100644 -index 000000000..5c503c235 ---- /dev/null -+++ b/tools/sched_ext/scx_common.bpf.h -@@ -0,0 +1,244 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __SCHED_EXT_COMMON_BPF_H -+#define __SCHED_EXT_COMMON_BPF_H -+ -+#include "vmlinux.h" -+#include -+#include -+#include -+#include "user_exit_info.h" -+ -+#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ -+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ -+#define PF_EXITING 0x00000004 -+#define CLOCK_MONOTONIC 1 -+ -+/* -+ * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can -+ * lead to really confusing misbehaviors. Let's trigger a build failure. -+ */ -+static inline void ___vmlinux_h_sanity_check___(void) -+{ -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); -+} -+ -+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; -+ -+static inline __attribute__((format(printf, 1, 2))) -+void ___scx_bpf_error_format_checker(const char *fmt, ...) {} -+ -+/* -+ * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments -+ * instead of an array of u64. Note that __param[] must have at least one -+ * element to keep the verifier happy. -+ */ -+#define scx_bpf_error(fmt, args...) 
\ -+({ \ -+ static char ___fmt[] = fmt; \ -+ unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ -+ \ -+ _Pragma("GCC diagnostic push") \ -+ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ -+ ___bpf_fill(___param, args); \ -+ _Pragma("GCC diagnostic pop") \ -+ \ -+ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ -+ \ -+ ___scx_bpf_error_format_checker(fmt, ##args); \ -+}) -+ -+void scx_bpf_switch_all(void) __ksym; -+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; -+bool scx_bpf_consume(u64 dsq_id) __ksym; -+u32 scx_bpf_dispatch_nr_slots(void) __ksym; -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; -+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; -+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; -+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; -+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; -+s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; -+const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; -+void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; -+void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; -+bool scx_bpf_task_running(const struct task_struct *p) __ksym; -+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; -+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; -+u32 scx_bpf_reenqueue_local(void) __ksym; -+ -+#define BPF_STRUCT_OPS(name, args...) \ -+SEC("struct_ops/"#name) \ -+BPF_PROG(name, ##args) -+ -+#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ -+SEC("struct_ops.s/"#name) \ -+BPF_PROG(name, ##args) -+ -+/** -+ * RESIZABLE_ARRAY - Generates annotations for an array that may be resized -+ * @elfsec: the data section of the BPF program in which to place the array -+ * @arr: the name of the array -+ * -+ * libbpf has an API for setting map value sizes. Since data sections (i.e. -+ * bss, data, rodata) themselves are maps, a data section can be resized. If -+ * a data section has an array as its last element, the BTF info for that -+ * array will be adjusted so that length of the array is extended to meet the -+ * new length of the data section. This macro annotates an array to have an -+ * element count of one with the assumption that this array can be resized -+ * within the userspace program. It also annotates the section specifier so -+ * this array exists in a custom sub data section which can be resized -+ * independently. -+ * -+ * See RESIZE_ARRAY() for the userspace convenience macro for resizing an -+ * array declared with RESIZABLE_ARRAY(). -+ */ -+#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) -+ -+/** -+ * MEMBER_VPTR - Obtain the verified pointer to a struct or array member -+ * @base: struct or array to index -+ * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) -+ * -+ * The verifier often gets confused by the instruction sequence the compiler -+ * generates for indexing struct fields or arrays. This macro forces the -+ * compiler to generate a code sequence which first calculates the byte offset, -+ * checks it against the struct or array size and add that byte offset to -+ * generate the pointer to the member to help the verifier. -+ * -+ * Ideally, we want to abort if the calculated offset is out-of-bounds. 
However, -+ * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller -+ * must check for %NULL and take appropriate action to appease the verifier. To -+ * avoid confusing the verifier, it's best to check for %NULL and dereference -+ * immediately. -+ * -+ * vptr = MEMBER_VPTR(my_array, [i][j]); -+ * if (!vptr) -+ * return error; -+ * *vptr = new_value; -+ * -+ * sizeof(@base) should encompass the memory area to be accessed and thus can't -+ * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of -+ * `MEMBER_VPTR(ptr, ->member)`. -+ */ -+#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ -+ u64 __base = (u64)&(base); \ -+ u64 __addr = (u64)&((base) member) - __base; \ -+ _Static_assert(sizeof(base) >= sizeof((base) member), \ -+ "@base is smaller than @member, is @base a pointer?"); \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"i"(sizeof(base) - sizeof((base) member))); \ -+ __addr; \ -+}) -+ -+/** -+ * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element -+ * @arr: array to index into -+ * @i: array index -+ * @n: number of elements in array -+ * -+ * Similar to MEMBER_VPTR() but is intended for use with arrays where the -+ * element count needs to be explicit. -+ * It can be used in cases where a global array is defined with an initial -+ * size but is intended to be be resized before loading the BPF program. -+ * Without this version of the macro, MEMBER_VPTR() will use the compile time -+ * size of the array to compute the max, which will result in rejection by -+ * the verifier. -+ */ -+#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({ \ -+ u64 __base = (u64)arr; \ -+ u64 __addr = (u64)&(arr[i]) - __base; \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ -+ __addr; \ -+}) -+ -+/* -+ * BPF core and other generic helpers -+ */ -+ -+/* list and rbtree */ -+#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) -+#define private(name) SEC(".data." 
#name) __hidden __attribute__((aligned(8))) -+ -+void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; -+void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; -+ -+#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) -+#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) -+ -+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; -+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; -+struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, -+ struct bpf_rb_node *node) __ksym; -+int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, -+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), -+ void *meta, __u64 off) __ksym; -+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) -+ -+struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; -+ -+/* task */ -+struct task_struct *bpf_task_from_pid(s32 pid) __ksym; -+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; -+void bpf_task_release(struct task_struct *p) __ksym; -+ -+/* cgroup */ -+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; -+void bpf_cgroup_release(struct cgroup *cgrp) __ksym; -+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; -+ -+/* cpumask */ -+struct bpf_cpumask *bpf_cpumask_create(void) __ksym; -+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; -+u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+ -+/* rcu */ -+void bpf_rcu_read_lock(void) __ksym; -+void bpf_rcu_read_unlock(void) __ksym; -+ -+#endif /* __SCHED_EXT_COMMON_BPF_H */ -diff --git 
a/tools/sched_ext/scx_common.h b/tools/sched_ext/scx_common.h -new file mode 100644 -index 000000000..0e93d6b69 ---- /dev/null -+++ b/tools/sched_ext/scx_common.h -@@ -0,0 +1,59 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#ifndef __SCHED_EXT_COMMON_H -+#define __SCHED_EXT_COMMON_H -+ -+#include -+#include -+#include -+ -+#include "user_exit_info.h" -+ -+#ifdef __KERNEL__ -+#error "Should not be included by BPF programs" -+#endif -+ -+#define SCX_BUG(__fmt, ...) \ -+ do { \ -+ fprintf(stderr, "%s:%d [scx panic]: %s\n", __FILE__, __LINE__, \ -+ strerror(errno)); \ -+ fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ -+ fprintf(stderr, "\n"); \ -+ \ -+ exit(EXIT_FAILURE); \ -+ } while (0) -+ -+#define SCX_BUG_ON(__cond, __fmt, ...) \ -+ do { \ -+ if (__cond) \ -+ SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ -+ } while (0) -+ -+/** -+ * RESIZE_ARRAY - Convenience macro for resizing a BPF array -+ * @elfsec: the data section of the BPF program in which to the array exists -+ * @arr: the name of the array -+ * @n: the desired array element count -+ * -+ * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two -+ * operations. It resizes the map which corresponds to the custom data -+ * section that contains the target array. As a side effect, the BTF info for -+ * the array is adjusted so that the array length is sized to cover the new -+ * data section size. The second operation is reassigning the skeleton pointer -+ * for that custom data section so that it points to the newly memory mapped -+ * region. -+ */ -+#define RESIZE_ARRAY(elfsec, arr, n) \ -+ do { \ -+ size_t __sz; \ -+ bpf_map__set_value_size(skel->maps.elfsec##_##arr, \ -+ sizeof(skel->elfsec##_##arr->arr[0]) * (n)); \ -+ skel->elfsec##_##arr = \ -+ bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ -+ } while (0) -+ -+#endif /* __SCHED_EXT_COMMON_H */ -diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c -new file mode 100644 -index 000000000..2db3d8d45 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.bpf.c -@@ -0,0 +1,912 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements -+ * hierarchical weight-based cgroup CPU control by flattening the cgroup -+ * hierarchy into a single layer by compounding the active weight share at each -+ * level. Consider the following hierarchy with weights in parentheses: -+ * -+ * R + A (100) + B (100) -+ * | \ C (100) -+ * \ D (200) -+ * -+ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. -+ * Let's say all three have runnable tasks. The total share that each of these -+ * three cgroups is entitled to can be calculated by compounding its share at -+ * each level. -+ * -+ * For example, B is competing against C and in that competition its share is -+ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's -+ * share in that competition is 200/(200+100) == 1/3. B's eventual share in the -+ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's -+ * eventual shaer is the same at 1/6. D is only competing at the top level and -+ * its share is 200/(100+200) == 2/3. 
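The same numbers can be reproduced with a few lines of standalone C (illustrative only, not part of the scheduler):

#include <stdio.h>

int main(void)
{
	/* weights from the example hierarchy: A=100 vs D=200, B=100 vs C=100 */
	double a_at_root = 100.0 / (100.0 + 200.0);	/* A vs D: 1/3 */
	double b_in_a    = 100.0 / (100.0 + 100.0);	/* B vs C: 1/2 */

	printf("B = %.4f\n", a_at_root * b_in_a);	/* 1/6 ~= 0.1667 */
	printf("C = %.4f\n", a_at_root * b_in_a);	/* same as B */
	printf("D = %.4f\n", 200.0 / (100.0 + 200.0));	/* 2/3 ~= 0.6667 */
	return 0;
}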
-+ * -+ * So, instead of hierarchically scheduling level-by-level, we can consider it -+ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3 -+ * and keep updating the eventual shares as the cgroups' runnable states change. -+ * -+ * This flattening of hierarchy can bring a substantial performance gain when -+ * the cgroup hierarchy is nested multiple levels. in a simple benchmark using -+ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it -+ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two -+ * apache instances competing with 2:1 weight ratio nested four level deep. -+ * -+ * However, the gain comes at the cost of not being able to properly handle -+ * thundering herd of cgroups. For example, if many cgroups which are nested -+ * behind a low priority parent cgroup wake up around the same time, they may be -+ * able to consume more CPU cycles than they are entitled to. In many use cases, -+ * this isn't a real concern especially given the performance gain. Also, there -+ * are ways to mitigate the problem further by e.g. introducing an extra -+ * scheduling layer on cgroup delegation boundaries. -+ * -+ * The scheduler first picks the cgroup to run and then schedule the tasks -+ * within by using nested weighted vtime scheduling by default. The -+ * cgroup-internal scheduling can be switched to FIFO with the -f option. -+ */ -+#include "scx_common.bpf.h" -+#include "user_exit_info.h" -+#include "scx_flatcg.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ -+const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; -+const volatile bool fifo_sched; -+const volatile bool switch_partial; -+ -+u64 cvtime_now; -+struct user_exit_info uei; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, u64); -+ __uint(max_entries, FCG_NR_STATS); -+} stats SEC(".maps"); -+ -+static void stat_inc(enum fcg_stat_idx idx) -+{ -+ u32 idx_v = idx; -+ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+struct fcg_cpu_ctx { -+ u64 cur_cgid; -+ u64 cur_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, struct fcg_cpu_ctx); -+ __uint(max_entries, 1); -+} cpu_ctx SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_cgrp_ctx); -+} cgrp_ctx SEC(".maps"); -+ -+struct cgv_node { -+ struct bpf_rb_node rb_node; -+ __u64 cvtime; -+ __u64 cgid; -+}; -+ -+private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; -+private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); -+ -+struct cgv_node_stash { -+ struct cgv_node __kptr *node; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __uint(max_entries, 16384); -+ __type(key, __u64); -+ __type(value, struct cgv_node_stash); -+} cgv_node_stash SEC(".maps"); -+ -+struct fcg_task_ctx { -+ u64 bypassed_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_task_ctx); -+} task_ctx SEC(".maps"); -+ -+/* gets inc'd on weight tree changes to expire the cached hweights */ -+unsigned long hweight_gen = 1; -+ -+static u64 div_round_up(u64 dividend, u64 divisor) -+{ -+ return (dividend + divisor - 1) / divisor; -+} -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - 
b) < 0; -+} -+ -+static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) -+{ -+ struct cgv_node *cgc_a, *cgc_b; -+ -+ cgc_a = container_of(a, struct cgv_node, rb_node); -+ cgc_b = container_of(b, struct cgv_node, rb_node); -+ -+ return cgc_a->cvtime < cgc_b->cvtime; -+} -+ -+static struct fcg_cpu_ctx *find_cpu_ctx(void) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ u32 idx = 0; -+ -+ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); -+ if (!cpuc) { -+ scx_bpf_error("cpu_ctx lookup failed"); -+ return NULL; -+ } -+ return cpuc; -+} -+ -+static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); -+ return NULL; -+ } -+ return cgc; -+} -+ -+static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgrp = bpf_cgroup_ancestor(cgrp, level); -+ if (!cgrp) { -+ scx_bpf_error("ancestor cgroup lookup failed"); -+ return NULL; -+ } -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ scx_bpf_error("ancestor cgrp_ctx lookup failed"); -+ bpf_cgroup_release(cgrp); -+ return cgc; -+} -+ -+static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ int level; -+ -+ if (!cgc->nr_active) { -+ stat_inc(FCG_STAT_HWT_SKIP); -+ return; -+ } -+ -+ if (cgc->hweight_gen == hweight_gen) { -+ stat_inc(FCG_STAT_HWT_CACHE); -+ return; -+ } -+ -+ stat_inc(FCG_STAT_HWT_UPDATES); -+ bpf_for(level, 0, cgrp->level + 1) { -+ struct fcg_cgrp_ctx *cgc; -+ bool is_active; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ -+ if (!level) { -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ cgc->hweight_gen = hweight_gen; -+ } else { -+ struct fcg_cgrp_ctx *pcgc; -+ -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ -+ /* -+ * We can be oppotunistic here and not grab the -+ * cgv_tree_lock and deal with the occasional races. -+ * However, hweight updates are already cached and -+ * relatively low-frequency. Let's just do the -+ * straightforward thing. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ is_active = cgc->nr_active; -+ if (is_active) { -+ cgc->hweight_gen = pcgc->hweight_gen; -+ cgc->hweight = -+ div_round_up(pcgc->hweight * cgc->weight, -+ pcgc->child_weight_sum); -+ } -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!is_active) { -+ stat_inc(FCG_STAT_HWT_RACE); -+ break; -+ } -+ } -+ } -+} -+ -+static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) -+{ -+ u64 delta, cvtime, max_budget; -+ -+ /* -+ * A node which is on the rbtree can't be pointed to from elsewhere yet -+ * and thus can't be updated and repositioned. Instead, we collect the -+ * vtime deltas separately and apply it asynchronously here. -+ */ -+ delta = cgc->cvtime_delta; -+ __sync_fetch_and_sub(&cgc->cvtime_delta, delta); -+ cvtime = cgv_node->cvtime + delta; -+ -+ /* -+ * Allow a cgroup to carry the maximum budget proportional to its -+ * hweight such that a full-hweight cgroup can immediately take up half -+ * of the CPUs at the most while staying at the front of the rbtree. 
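To get a feel for the cap computed below, assume for illustration cgrp_slice_ns = 20ms and nr_cpus = 8 (numbers picked for the example, not defaults taken from this patch):

	hweight == FCG_HWEIGHT_ONE:      max_budget = 20ms * 8 / 2 = 80ms
	hweight == FCG_HWEIGHT_ONE / 2:  max_budget = 40ms

i.e. a full-hweight cgroup can bank at most one slice's worth of vtime across half of the 8 CPUs, and the allowance scales down with hweight.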
-+ */ -+ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / -+ (2 * FCG_HWEIGHT_ONE); -+ if (vtime_before(cvtime, cvtime_now - max_budget)) -+ cvtime = cvtime_now - max_budget; -+ -+ cgv_node->cvtime = cvtime; -+} -+ -+static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ struct cgv_node_stash *stash; -+ struct cgv_node *cgv_node; -+ u64 cgid = cgrp->kn->id; -+ -+ /* paired with cmpxchg in try_pick_next_cgroup() */ -+ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { -+ stat_inc(FCG_STAT_ENQ_SKIP); -+ return; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); -+ return; -+ } -+ -+ /* NULL if the node is already on the rbtree */ -+ cgv_node = bpf_kptr_xchg(&stash->node, NULL); -+ if (!cgv_node) { -+ stat_inc(FCG_STAT_ENQ_RACE); -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * If select_cpu_dfl() is recommending local enqueue, the target CPU is -+ * idle. Follow it and charge the cgroup later in fcg_stopping() after -+ * the fact. Use the same mechanism to deal with tasks with custom -+ * affinities so that we don't have to worry about per-cgroup dq's -+ * containing tasks that can't be executed from some CPUs. -+ */ -+ if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) { -+ /* -+ * Tell fcg_stopping() that this bypassed the regular scheduling -+ * path and should be force charged to the cgroup. 0 is used to -+ * indicate that the task isn't bypassing, so if the current -+ * runtime is 0, go back by one nanosecond. -+ */ -+ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; -+ -+ /* -+ * The global dq is deprioritized as we don't want to let tasks -+ * to boost themselves by constraining its cpumask. The -+ * deprioritization is rather severe, so let's not apply that to -+ * per-cpu kernel threads. This is ham-fisted. We probably wanna -+ * implement per-cgroup fallback dq's instead so that we have -+ * more control over when tasks with custom cpumask get issued. -+ */ -+ if ((enq_flags & SCX_ENQ_LOCAL) || -+ (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) { -+ stat_inc(FCG_STAT_LOCAL); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ stat_inc(FCG_STAT_GLOBAL); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ return; -+ } -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ goto out_release; -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 tvtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) -+ tvtime = cgc->tvtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, -+ tvtime, enq_flags); -+ } -+ -+ cgrp_enqueued(cgrp, cgc); -+out_release: -+ bpf_cgroup_release(cgrp); -+} -+ -+/* -+ * Walk the cgroup tree to update the active weight sums as tasks wake up and -+ * sleep. 
The weight sums are used as the base when calculating the proportion a -+ * given cgroup or task is entitled to at each level. -+ */ -+static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ bool updated = false; -+ int idx; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ /* -+ * In most cases, a hot cgroup would have multiple threads going to -+ * sleep and waking up while the whole cgroup stays active. In leaf -+ * cgroups, ->nr_runnable which is updated with __sync operations gates -+ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock -+ * repeatedly for a busy cgroup which is staying active. -+ */ -+ if (runnable) { -+ if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_ACT); -+ } else { -+ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_DEACT); -+ } -+ -+ /* -+ * If @cgrp is becoming runnable, its hweight should be refreshed after -+ * it's added to the weight tree so that enqueue has the up-to-date -+ * value. If @cgrp is becoming quiescent, the hweight should be -+ * refreshed before it's removed from the weight tree so that the usage -+ * charging which happens afterwards has access to the latest value. -+ */ -+ if (!runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ /* propagate upwards */ -+ bpf_for(idx, 0, cgrp->level) { -+ int level = cgrp->level - idx; -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ bool propagate = false; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ if (level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ } -+ -+ /* -+ * We need the propagation protected by a lock to synchronize -+ * against weight changes. There's no reason to drop the lock at -+ * each level but bpf_spin_lock() doesn't want any function -+ * calls while locked. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ if (runnable) { -+ if (!cgc->nr_active++) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum += cgc->weight; -+ } -+ } -+ } else { -+ if (!--cgc->nr_active) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum -= cgc->weight; -+ } -+ } -+ } -+ -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!propagate) -+ break; -+ } -+ -+ if (updated) -+ __sync_fetch_and_add(&hweight_gen, 1); -+ -+ if (runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+} -+ -+void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, true); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) -+{ -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ if (fifo_sched) -+ return; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ /* -+ * @cgc->tvtime_now always progresses forward as tasks start -+ * executing. The test and update can be performed concurrently -+ * from multiple CPUs and thus racy. Any error should be -+ * contained and temporary. Let's just live with it. -+ */ -+ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) -+ cgc->tvtime_now = p->scx.dsq_vtime; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. 
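As a quick illustration of the charge below (assuming the nice-0 weight of 100 implied by the constant in the formula):

	weight 200, whole default slice consumed: (SCX_SLICE_DFL - 0) * 100 / 200 = half a slice of vtime
	weight  50, whole default slice consumed: (SCX_SLICE_DFL - 0) * 100 /  50 = two slices of vtime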
-+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. -+ */ -+ if (!fifo_sched) -+ p->scx.dsq_vtime += -+ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ if (!taskc->bypassed_at) -+ return; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ p->se.sum_exec_runtime - taskc->bypassed_at); -+ taskc->bypassed_at = 0; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, false); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) -+{ -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ if (cgrp->level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); -+ if (!pcgc) -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ if (pcgc && cgc->nr_active) -+ pcgc->child_weight_sum += (s64)weight - cgc->weight; -+ cgc->weight = weight; -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+static bool try_pick_next_cgroup(u64 *cgidp) -+{ -+ struct bpf_rb_node *rb_node; -+ struct cgv_node_stash *stash; -+ struct cgv_node *cgv_node; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 cgid; -+ -+ /* pop the front cgroup and wind cvtime_now accordingly */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ rb_node = bpf_rbtree_first(&cgv_tree); -+ if (!rb_node) { -+ bpf_spin_unlock(&cgv_tree_lock); -+ stat_inc(FCG_STAT_PNC_NO_CGRP); -+ *cgidp = 0; -+ return true; -+ } -+ -+ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!rb_node) { -+ /* -+ * This should never happen. bpf_rbtree_first() was called -+ * above while the tree lock was held, so the node should -+ * always be present. -+ */ -+ scx_bpf_error("node could not be removed"); -+ return true; -+ } -+ -+ cgv_node = container_of(rb_node, struct cgv_node, rb_node); -+ cgid = cgv_node->cgid; -+ -+ if (vtime_before(cvtime_now, cgv_node->cvtime)) -+ cvtime_now = cgv_node->cvtime; -+ -+ /* -+ * If lookup fails, the cgroup's gone. Free and move on. See -+ * fcg_cgroup_exit(). -+ */ -+ cgrp = bpf_cgroup_from_id(cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ if (!scx_bpf_consume(cgid)) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_EMPTY); -+ goto out_stash; -+ } -+ -+ /* -+ * Successfully consumed from the cgroup. This will be our current -+ * cgroup for the new slice. Refresh its hweight. -+ */ -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ bpf_cgroup_release(cgrp); -+ -+ /* -+ * As the cgroup may have more tasks, add it back to the rbtree. Note -+ * that here we charge the full slice upfront and then exact later -+ * according to the actual consumption. This prevents lowpri thundering -+ * herd from saturating the machine. 
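For example, following the charge below, a cgroup whose hweight is a quarter of FCG_HWEIGHT_ONE advances its cvtime by 4 * cgrp_slice_ns each time it is picked, so under contention it ends up being picked roughly a quarter as often as a full-hweight sibling.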
-+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ *cgidp = cgid; -+ stat_inc(FCG_STAT_PNC_NEXT); -+ return true; -+ -+out_stash: -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ /* -+ * Paired with cmpxchg in cgrp_enqueued(). If they see the following -+ * transition, they'll enqueue the cgroup. If they are earlier, we'll -+ * see their task in the dq below and requeue the cgroup. -+ */ -+ __sync_val_compare_and_swap(&cgc->queued, 1, 0); -+ -+ if (scx_bpf_dsq_nr_queued(cgid)) { -+ bpf_spin_lock(&cgv_tree_lock); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ } else { -+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); -+ if (cgv_node) { -+ scx_bpf_error("unexpected !NULL cgv_node stash"); -+ goto out_free; -+ } -+ } -+ -+ return false; -+ -+out_free: -+ bpf_obj_drop(cgv_node); -+ return false; -+} -+ -+void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 now = bpf_ktime_get_ns(); -+ -+ cpuc = find_cpu_ctx(); -+ if (!cpuc) -+ return; -+ -+ if (!cpuc->cur_cgid) -+ goto pick_next_cgroup; -+ -+ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { -+ if (scx_bpf_consume(cpuc->cur_cgid)) { -+ stat_inc(FCG_STAT_CNS_KEEP); -+ return; -+ } -+ stat_inc(FCG_STAT_CNS_EMPTY); -+ } else { -+ stat_inc(FCG_STAT_CNS_EXPIRE); -+ } -+ -+ /* -+ * The current cgroup is expiring. It was already charged a full slice. -+ * Calculate the actual usage and accumulate the delta. -+ */ -+ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_CNS_GONE); -+ goto pick_next_cgroup; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (cgc) { -+ /* -+ * We want to update the vtime delta and then look for the next -+ * cgroup to execute but the latter needs to be done in a loop -+ * and we can't keep the lock held. Oh well... -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ (cpuc->cur_at + cgrp_slice_ns - now) * -+ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); -+ bpf_spin_unlock(&cgv_tree_lock); -+ } else { -+ stat_inc(FCG_STAT_CNS_GONE); -+ } -+ -+ bpf_cgroup_release(cgrp); -+ -+pick_next_cgroup: -+ cpuc->cur_at = now; -+ -+ if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { -+ cpuc->cur_cgid = 0; -+ return; -+ } -+ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (try_pick_next_cgroup(&cpuc->cur_cgid)) -+ break; -+ } -+} -+ -+s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ struct fcg_task_ctx *taskc; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. 
-+ */ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!taskc) -+ return -ENOMEM; -+ -+ taskc->bypassed_at = 0; -+ -+ if (!(cgc = find_cgrp_ctx(args->cgroup))) -+ return -ENOENT; -+ -+ p->scx.dsq_vtime = cgc->tvtime_now; -+ -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ struct cgv_node *cgv_node; -+ struct cgv_node_stash empty_stash = {}, *stash; -+ u64 cgid = cgrp->kn->id; -+ int ret; -+ -+ /* -+ * Technically incorrect as cgroup ID is full 64bit while dq ID is -+ * 63bit. Should not be a problem in practice and easy to spot in the -+ * unlikely case that it breaks. -+ */ -+ ret = scx_bpf_create_dsq(cgid, -1); -+ if (ret) -+ return ret; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!cgc) { -+ ret = -ENOMEM; -+ goto err_destroy_dsq; -+ } -+ -+ cgc->weight = args->weight; -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ -+ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, -+ BPF_NOEXIST); -+ if (ret) { -+ if (ret != -ENOMEM) -+ scx_bpf_error("unexpected stash creation error (%d)", -+ ret); -+ goto err_destroy_dsq; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ scx_bpf_error("unexpected cgv_node stash lookup failure"); -+ ret = -ENOENT; -+ goto err_destroy_dsq; -+ } -+ -+ cgv_node = bpf_obj_new(struct cgv_node); -+ if (!cgv_node) { -+ ret = -ENOMEM; -+ goto err_del_cgv_node; -+ } -+ -+ cgv_node->cgid = cgid; -+ cgv_node->cvtime = cvtime_now; -+ -+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); -+ if (cgv_node) { -+ scx_bpf_error("unexpected !NULL cgv_node stash"); -+ ret = -EBUSY; -+ goto err_drop; -+ } -+ -+ return 0; -+ -+err_drop: -+ bpf_obj_drop(cgv_node); -+err_del_cgv_node: -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+err_destroy_dsq: -+ scx_bpf_destroy_dsq(cgid); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ -+ /* -+ * For now, there's no way find and remove the cgv_node if it's on the -+ * cgv_tree. Let's drain them in the dispatch path as they get popped -+ * off the front of the tree. 
-+ */ -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+ scx_bpf_destroy_dsq(cgid); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{ -+ struct fcg_cgrp_ctx *from_cgc, *to_cgc; -+ s64 vtime_delta; -+ -+ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ -+ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) -+ return; -+ -+ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; -+ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; -+} -+ -+s32 BPF_STRUCT_OPS(fcg_init) -+{ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops flatcg_ops = { -+ .enqueue = (void *)fcg_enqueue, -+ .dispatch = (void *)fcg_dispatch, -+ .runnable = (void *)fcg_runnable, -+ .running = (void *)fcg_running, -+ .stopping = (void *)fcg_stopping, -+ .quiescent = (void *)fcg_quiescent, -+ .prep_enable = (void *)fcg_prep_enable, -+ .cgroup_set_weight = (void *)fcg_cgroup_set_weight, -+ .cgroup_init = (void *)fcg_cgroup_init, -+ .cgroup_exit = (void *)fcg_cgroup_exit, -+ .cgroup_move = (void *)fcg_cgroup_move, -+ .init = (void *)fcg_init, -+ .exit = (void *)fcg_exit, -+ .flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING, -+ .name = "flatcg", -+}; -diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c -new file mode 100644 -index 000000000..f824c4b34 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.c -@@ -0,0 +1,221 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_flatcg.h" -+#include "scx_flatcg.skel.h" -+ -+#ifndef FILEID_KERNFS -+#define FILEID_KERNFS 0xfe -+#endif -+ -+const char help_fmt[] = -+"A flattened cgroup hierarchy sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-p]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -i INTERVAL Report interval\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) -+{ -+ FILE *fp; -+ char buf[4096]; -+ char *line, *cur = NULL, *tok; -+ __u64 sum = 0, idle = 0; -+ __u64 delta_sum, delta_idle; -+ int idx; -+ -+ fp = fopen("/proc/stat", "r"); -+ if (!fp) { -+ perror("fopen(\"/proc/stat\")"); -+ return 0.0; -+ } -+ -+ if (!fgets(buf, sizeof(buf), fp)) { -+ perror("fgets(\"/proc/stat\")"); -+ fclose(fp); -+ return 0.0; -+ } -+ fclose(fp); -+ -+ line = buf; -+ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { -+ char *endp = NULL; -+ __u64 v; -+ -+ if (idx == 0) { -+ line = NULL; -+ continue; -+ } -+ v = strtoull(tok, &endp, 0); -+ if (!endp || *endp != '\0') { -+ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", -+ idx, tok); -+ continue; -+ } -+ sum += v; -+ if (idx == 4) -+ idle = v; -+ } -+ -+ delta_sum = sum - *last_sum; -+ delta_idle = idle - *last_idle; -+ *last_sum = sum; -+ *last_idle = idle; -+ -+ return delta_sum ? 
(float)(delta_sum - delta_idle) / delta_sum : 0.0; -+} -+ -+static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) -+{ -+ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); -+ -+ for (idx = 0; idx < FCG_NR_STATS; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_flatcg *skel; -+ struct bpf_link *link; -+ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; -+ bool dump_cgrps = false; -+ __u64 last_cpu_sum = 0, last_cpu_idle = 0; -+ __u64 last_stats[FCG_NR_STATS] = {}; -+ unsigned long seq = 0; -+ __s32 opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_flatcg__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:i:dfph")) != -1) { -+ double v; -+ -+ switch (opt) { -+ case 's': -+ v = strtod(optarg, NULL); -+ skel->rodata->cgrp_slice_ns = v * 1000; -+ break; -+ case 'i': -+ v = strtod(optarg, NULL); -+ intv_ts.tv_sec = v; -+ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; -+ break; -+ case 'd': -+ dump_cgrps = true; -+ break; -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ case 'h': -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", -+ (double)skel->rodata->cgrp_slice_ns / 1000000.0, -+ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, -+ dump_cgrps); -+ -+ SCX_BUG_ON(scx_flatcg__load(skel), "Failed to load skel"); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.flatcg_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ __u64 acc_stats[FCG_NR_STATS]; -+ __u64 stats[FCG_NR_STATS]; -+ float cpu_util; -+ int i; -+ -+ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); -+ -+ fcg_read_stats(skel, acc_stats); -+ for (i = 0; i < FCG_NR_STATS; i++) -+ stats[i] = acc_stats[i] - last_stats[i]; -+ -+ memcpy(last_stats, acc_stats, sizeof(acc_stats)); -+ -+ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n", -+ seq++, cpu_util * 100.0, skel->data->hweight_gen); -+ printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n", -+ stats[FCG_STAT_ACT], -+ stats[FCG_STAT_DEACT], -+ stats[FCG_STAT_LOCAL], -+ stats[FCG_STAT_GLOBAL]); -+ printf("HWT skip:%6llu race:%6llu cache:%6llu update:%6llu\n", -+ stats[FCG_STAT_HWT_SKIP], -+ stats[FCG_STAT_HWT_RACE], -+ stats[FCG_STAT_HWT_CACHE], -+ stats[FCG_STAT_HWT_UPDATES]); -+ printf("ENQ skip:%6llu race:%6llu\n", -+ stats[FCG_STAT_ENQ_SKIP], -+ stats[FCG_STAT_ENQ_RACE]); -+ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", -+ stats[FCG_STAT_CNS_KEEP], -+ stats[FCG_STAT_CNS_EXPIRE], -+ stats[FCG_STAT_CNS_EMPTY], -+ stats[FCG_STAT_CNS_GONE]); -+ printf("PNC nocgrp:%6llu next:%6llu empty:%6llu gone:%6llu\n", -+ stats[FCG_STAT_PNC_NO_CGRP], -+ stats[FCG_STAT_PNC_NEXT], -+ stats[FCG_STAT_PNC_EMPTY], -+ stats[FCG_STAT_PNC_GONE]); -+ printf("BAD remove:%6llu\n", -+ acc_stats[FCG_STAT_BAD_REMOVAL]); -+ -+ nanosleep(&intv_ts, NULL); -+ } -+ -+ bpf_link__destroy(link); -+ 
uei_print(&skel->bss->uei); -+ scx_flatcg__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h -new file mode 100644 -index 000000000..490758ed4 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.h -@@ -0,0 +1,49 @@ -+#ifndef __SCX_EXAMPLE_FLATCG_H -+#define __SCX_EXAMPLE_FLATCG_H -+ -+enum { -+ FCG_HWEIGHT_ONE = 1LLU << 16, -+}; -+ -+enum fcg_stat_idx { -+ FCG_STAT_ACT, -+ FCG_STAT_DEACT, -+ FCG_STAT_LOCAL, -+ FCG_STAT_GLOBAL, -+ -+ FCG_STAT_HWT_UPDATES, -+ FCG_STAT_HWT_CACHE, -+ FCG_STAT_HWT_SKIP, -+ FCG_STAT_HWT_RACE, -+ -+ FCG_STAT_ENQ_SKIP, -+ FCG_STAT_ENQ_RACE, -+ -+ FCG_STAT_CNS_KEEP, -+ FCG_STAT_CNS_EXPIRE, -+ FCG_STAT_CNS_EMPTY, -+ FCG_STAT_CNS_GONE, -+ -+ FCG_STAT_PNC_NO_CGRP, -+ FCG_STAT_PNC_NEXT, -+ FCG_STAT_PNC_EMPTY, -+ FCG_STAT_PNC_GONE, -+ -+ FCG_STAT_BAD_REMOVAL, -+ -+ FCG_NR_STATS, -+}; -+ -+struct fcg_cgrp_ctx { -+ u32 nr_active; -+ u32 nr_runnable; -+ u32 queued; -+ u32 weight; -+ u32 hweight; -+ u64 child_weight_sum; -+ u64 hweight_gen; -+ s64 cvtime_delta; -+ u64 tvtime_now; -+}; -+ -+#endif /* __SCX_EXAMPLE_FLATCG_H */ -diff --git a/tools/sched_ext/scx_layered/.gitignore b/tools/sched_ext/scx_layered/.gitignore -new file mode 100644 -index 000000000..186dba259 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/.gitignore -@@ -0,0 +1,3 @@ -+src/bpf/.output -+Cargo.lock -+target -diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml -new file mode 100644 -index 000000000..6ba1b98d2 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/Cargo.toml -@@ -0,0 +1,30 @@ -+[package] -+name = "scx_layered" -+version = "0.0.1" -+authors = ["Tejun Heo ", "Meta"] -+edition = "2021" -+description = "Userspace scheduling with BPF for Ads" -+license = "GPL-2.0-only" -+ -+[dependencies] -+anyhow = "1.0" -+bitvec = "1.0" -+clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } -+ctrlc = { version = "3.1", features = ["termination"] } -+fb_procfs = "0.7" -+lazy_static = "1.4" -+libbpf-rs = "0.21" -+libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } -+libc = "0.2" -+log = "0.4" -+serde = { version = "1.0", features = ["derive"] } -+serde_json = "1.0" -+simplelog = "0.12" -+ -+[build-dependencies] -+bindgen = { version = "0.61" } -+libbpf-cargo = "0.21" -+glob = "0.3" -+ -+[features] -+enable_backtrace = [] -diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs -new file mode 100644 -index 000000000..ea0bbd48a ---- /dev/null -+++ b/tools/sched_ext/scx_layered/build.rs -@@ -0,0 +1,77 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+extern crate bindgen; -+ -+use std::env; -+use std::fs::create_dir_all; -+use std::path::Path; -+use std::path::PathBuf; -+ -+use glob::glob; -+use libbpf_cargo::SkeletonBuilder; -+ -+const HEADER_PATH: &str = "src/bpf/layered.h"; -+ -+fn bindgen_layered() { -+ // Tell cargo to invalidate the built crate whenever the wrapper changes -+ println!("cargo:rerun-if-changed={}", HEADER_PATH); -+ -+ // The bindgen::Builder is the main entry point -+ // to bindgen, and lets you build up options for -+ // the resulting bindings. -+ let bindings = bindgen::Builder::default() -+ // The input header we would like to generate -+ // bindings for. -+ .header(HEADER_PATH) -+ // Tell cargo to invalidate the built crate whenever any of the -+ // included header files changed. 
-+ .parse_callbacks(Box::new(bindgen::CargoCallbacks)) -+ // Finish the builder and generate the bindings. -+ .generate() -+ // Unwrap the Result and panic on failure. -+ .expect("Unable to generate bindings"); -+ -+ // Write the bindings to the $OUT_DIR/bindings.rs file. -+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); -+ bindings -+ .write_to_file(out_path.join("layered_sys.rs")) -+ .expect("Couldn't write bindings!"); -+} -+ -+fn gen_bpf_sched(name: &str) { -+ let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); -+ let clang = env::var("SCX_RUST_CLANG").unwrap(); -+ eprintln!("{}", clang); -+ let outpath = format!("./src/bpf/.output/{}.skel.rs", name); -+ let skel = Path::new(&outpath); -+ let src = format!("./src/bpf/{}.bpf.c", name); -+ let obj = format!("./src/bpf/.output/{}.bpf.o", name); -+ SkeletonBuilder::new() -+ .source(src.clone()) -+ .obj(obj) -+ .clang(clang) -+ .clang_args(bpf_cflags) -+ .build_and_generate(skel) -+ .unwrap(); -+ -+ // Trigger rebuild if any .[hc] files are changed in the directory. -+ for path in glob("./src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { -+ println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); -+ } -+} -+ -+fn main() { -+ bindgen_layered(); -+ // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. -+ // Reasons are because the generated skeleton contains compiler attributes -+ // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` -+ // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside -+ // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). -+ // -+ // However, there is hope! When the above feature stabilizes we can clean this -+ // all up. -+ create_dir_all("./src/bpf/.output").unwrap(); -+ gen_bpf_sched("layered"); -+} -diff --git a/tools/sched_ext/scx_layered/rustfmt.toml b/tools/sched_ext/scx_layered/rustfmt.toml -new file mode 100644 -index 000000000..b7258ed0a ---- /dev/null -+++ b/tools/sched_ext/scx_layered/rustfmt.toml -@@ -0,0 +1,8 @@ -+# Get help on options with `rustfmt --help=config` -+# Please keep these in alphabetical order. -+edition = "2021" -+group_imports = "StdExternalCrate" -+imports_granularity = "Item" -+merge_derives = false -+use_field_init_shorthand = true -+version = "Two" -diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c -new file mode 100644 -index 000000000..b0a27f3c7 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c -@@ -0,0 +1,974 @@ -+/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -+#include "../../../scx_common.bpf.h" -+#include "layered.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u32 debug = 0; -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile u32 nr_possible_cpus = 1; -+const volatile u32 nr_layers = 1; -+const volatile bool smt_enabled = true; -+const volatile unsigned char all_cpus[MAX_CPUS_U8]; -+ -+private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; -+struct layer layers[MAX_LAYERS]; -+u32 fallback_cpu; -+static u32 preempt_cursor; -+ -+#define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0) -+#define trace(fmt, args...) 
do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) -+ -+#include "util.bpf.c" -+#include "../../../ravg_impl.bpf.h" -+ -+struct user_exit_info uei; -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, struct cpu_ctx); -+ __uint(max_entries, 1); -+} cpu_ctxs SEC(".maps"); -+ -+static struct cpu_ctx *lookup_cpu_ctx(int cpu) -+{ -+ struct cpu_ctx *cctx; -+ u32 zero = 0; -+ -+ if (cpu < 0) -+ cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero); -+ else -+ cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); -+ -+ if (!cctx) { -+ scx_bpf_error("no cpu_ctx for cpu %d", cpu); -+ return NULL; -+ } -+ -+ return cctx; -+} -+ -+static void gstat_inc(enum global_stat_idx idx, struct cpu_ctx *cctx) -+{ -+ if (idx < 0 || idx >= NR_GSTATS) { -+ scx_bpf_error("invalid global stat idx %d", idx); -+ return; -+ } -+ -+ cctx->gstats[idx]++; -+} -+ -+static void lstat_inc(enum layer_stat_idx idx, struct layer *layer, struct cpu_ctx *cctx) -+{ -+ u64 *vptr; -+ -+ if ((vptr = MEMBER_VPTR(*cctx, .lstats[layer->idx][idx]))) -+ (*vptr)++; -+ else -+ scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx); -+} -+ -+struct lock_wrapper { -+ struct bpf_spin_lock lock; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct lock_wrapper); -+ __uint(max_entries, MAX_LAYERS); -+ __uint(map_flags, 0); -+} layer_load_locks SEC(".maps"); -+ -+static void adj_load(u32 layer_idx, s64 adj, u64 now) -+{ -+ struct layer *layer; -+ struct lock_wrapper *lockw; -+ -+ layer = MEMBER_VPTR(layers, [layer_idx]); -+ lockw = bpf_map_lookup_elem(&layer_load_locks, &layer_idx); -+ -+ if (!layer || !lockw) { -+ scx_bpf_error("Can't access layer%d or its load_lock", layer_idx); -+ return; -+ } -+ -+ bpf_spin_lock(&lockw->lock); -+ layer->load += adj; -+ ravg_accumulate(&layer->load_rd, layer->load, now, USAGE_HALF_LIFE); -+ bpf_spin_unlock(&lockw->lock); -+ -+ if (debug && adj < 0 && (s64)layer->load < 0) -+ scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)", -+ bpf_get_smp_processor_id(), layer_idx, layer->load, adj); -+} -+ -+struct layer_cpumask_wrapper { -+ struct bpf_cpumask __kptr *cpumask; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct layer_cpumask_wrapper); -+ __uint(max_entries, MAX_LAYERS); -+ __uint(map_flags, 0); -+} layer_cpumasks SEC(".maps"); -+ -+static struct cpumask *lookup_layer_cpumask(int idx) -+{ -+ struct layer_cpumask_wrapper *cpumaskw; -+ -+ if ((cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx))) { -+ return (struct cpumask *)cpumaskw->cpumask; -+ } else { -+ scx_bpf_error("no layer_cpumask"); -+ return NULL; -+ } -+} -+ -+static void refresh_cpumasks(int idx) -+{ -+ struct layer_cpumask_wrapper *cpumaskw; -+ struct layer *layer; -+ int cpu, total = 0; -+ -+ if (!__sync_val_compare_and_swap(&layers[idx].refresh_cpus, 1, 0)) -+ return; -+ -+ cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx); -+ -+ bpf_for(cpu, 0, nr_possible_cpus) { -+ u8 *u8_ptr; -+ -+ if ((u8_ptr = MEMBER_VPTR(layers, [idx].cpus[cpu / 8]))) { -+ /* -+ * XXX - The following test should be outside the loop -+ * but that makes the verifier think that -+ * cpumaskw->cpumask might be NULL in the loop. 
-+ */ -+ barrier_var(cpumaskw); -+ if (!cpumaskw || !cpumaskw->cpumask) { -+ scx_bpf_error("can't happen"); -+ return; -+ } -+ -+ if (*u8_ptr & (1 << (cpu % 8))) { -+ bpf_cpumask_set_cpu(cpu, cpumaskw->cpumask); -+ total++; -+ } else { -+ bpf_cpumask_clear_cpu(cpu, cpumaskw->cpumask); -+ } -+ } else { -+ scx_bpf_error("can't happen"); -+ } -+ } -+ -+ // XXX - shouldn't be necessary -+ layer = MEMBER_VPTR(layers, [idx]); -+ if (!layer) { -+ scx_bpf_error("can't happen"); -+ return; -+ } -+ -+ layer->nr_cpus = total; -+ __sync_fetch_and_add(&layer->cpus_seq, 1); -+ trace("LAYER[%d] now has %d cpus, seq=%llu", idx, layer->nr_cpus, layer->cpus_seq); -+} -+ -+SEC("fentry/scheduler_tick") -+int scheduler_tick_fentry(const void *ctx) -+{ -+ int idx; -+ -+ if (bpf_get_smp_processor_id() == 0) -+ bpf_for(idx, 0, nr_layers) -+ refresh_cpumasks(idx); -+ return 0; -+} -+ -+struct task_ctx { -+ int pid; -+ -+ int layer; -+ bool refresh_layer; -+ u64 layer_cpus_seq; -+ struct bpf_cpumask __kptr *layered_cpumask; -+ -+ bool all_cpus_allowed; -+ bool dispatch_local; -+ u64 started_running_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __type(key, pid_t); -+ __type(value, struct task_ctx); -+ __uint(max_entries, MAX_TASKS); -+ __uint(map_flags, 0); -+} task_ctxs SEC(".maps"); -+ -+struct task_ctx *lookup_task_ctx_may_fail(struct task_struct *p) -+{ -+ s32 pid = p->pid; -+ -+ return bpf_map_lookup_elem(&task_ctxs, &pid); -+} -+ -+struct task_ctx *lookup_task_ctx(struct task_struct *p) -+{ -+ struct task_ctx *tctx; -+ s32 pid = p->pid; -+ -+ if ((tctx = bpf_map_lookup_elem(&task_ctxs, &pid))) { -+ return tctx; -+ } else { -+ scx_bpf_error("task_ctx lookup failed"); -+ return NULL; -+ } -+} -+ -+struct layer *lookup_layer(int idx) -+{ -+ if (idx < 0 || idx >= nr_layers) { -+ scx_bpf_error("invalid layer %d", idx); -+ return NULL; -+ } -+ return &layers[idx]; -+} -+ -+SEC("tp_btf/cgroup_attach_task") -+int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path, -+ struct task_struct *leader, bool threadgroup) -+{ -+ struct task_struct *next; -+ struct task_ctx *tctx; -+ int leader_pid = leader->pid; -+ -+ if (!(tctx = lookup_task_ctx_may_fail(leader))) -+ return 0; -+ tctx->refresh_layer = true; -+ -+ if (!threadgroup) -+ return 0; -+ -+ if (!(next = bpf_task_acquire(leader))) { -+ scx_bpf_error("failed to acquire leader"); -+ return 0; -+ } -+ -+ bpf_repeat(MAX_TASKS) { -+ struct task_struct *p; -+ int pid; -+ -+ p = container_of(next->thread_group.next, struct task_struct, thread_group); -+ bpf_task_release(next); -+ -+ pid = BPF_CORE_READ(p, pid); -+ if (pid == leader_pid) { -+ next = NULL; -+ break; -+ } -+ -+ next = bpf_task_from_pid(pid); -+ if (!next) { -+ scx_bpf_error("thread iteration failed"); -+ break; -+ } -+ -+ if ((tctx = lookup_task_ctx(next))) -+ tctx->refresh_layer = true; -+ } -+ -+ if (next) -+ bpf_task_release(next); -+ return 0; -+} -+ -+SEC("tp_btf/task_rename") -+int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf) -+{ -+ struct task_ctx *tctx; -+ -+ if ((tctx = lookup_task_ctx_may_fail(p))) -+ tctx->refresh_layer = true; -+ return 0; -+} -+ -+static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask, -+ struct task_struct *p, struct task_ctx *tctx, -+ const struct cpumask *layer_cpumask) -+{ -+ u64 layer_seq = layers->cpus_seq; -+ -+ if (tctx->layer_cpus_seq == layer_seq) -+ return; -+ -+ /* -+ * XXX - We're assuming that the updated @layer_cpumask matching the new -+ * @layer_seq is visible which may not be true. 
For now, leave it as-is. -+ * Let's update once BPF grows enough memory ordering constructs. -+ */ -+ bpf_cpumask_and((struct bpf_cpumask *)layered_cpumask, layer_cpumask, p->cpus_ptr); -+ tctx->layer_cpus_seq = layer_seq; -+ trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq); -+} -+ -+static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu, -+ const struct cpumask *idle_cpumask, -+ const struct cpumask *idle_smtmask) -+{ -+ bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask); -+ s32 cpu; -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if (smt_enabled) { -+ if (prev_in_cand && -+ bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) -+ return prev_cpu; -+ -+ cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) -+ return cpu; -+ } -+ -+ if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) -+ return prev_cpu; -+ -+ return scx_bpf_pick_idle_cpu(cand_cpumask, 0); -+} -+ -+s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ const struct cpumask *idle_cpumask, *idle_smtmask; -+ struct cpumask *layer_cpumask, *layered_cpumask; -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ struct layer *layer; -+ s32 cpu; -+ -+ /* look up everything we need */ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || -+ !(layered_cpumask = (struct cpumask *)tctx->layered_cpumask)) -+ return prev_cpu; -+ -+ /* -+ * We usually update the layer in layered_runnable() to avoid confusing. -+ * As layered_select_cpu() takes place before runnable, new tasks would -+ * still have -1 layer. Just return @prev_cpu. -+ */ -+ if (tctx->layer < 0) -+ return prev_cpu; -+ -+ if (!(layer = lookup_layer(tctx->layer)) || -+ !(layer_cpumask = lookup_layer_cpumask(tctx->layer))) -+ return prev_cpu; -+ -+ if (!(idle_cpumask = scx_bpf_get_idle_cpumask())) -+ return prev_cpu; -+ -+ if (!(idle_smtmask = scx_bpf_get_idle_smtmask())) { -+ cpu = prev_cpu; -+ goto out_put_idle_cpumask; -+ } -+ -+ /* not much to do if bound to a single CPU */ -+ if (p->nr_cpus_allowed == 1) { -+ cpu = prev_cpu; -+ if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ if (!bpf_cpumask_test_cpu(cpu, layer_cpumask)) -+ lstat_inc(LSTAT_AFFN_VIOL, layer, cctx); -+ goto dispatch_local; -+ } else { -+ goto out_put_cpumasks; -+ } -+ } -+ -+ maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask); -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu, -+ idle_cpumask, idle_smtmask)) >= 0) -+ goto dispatch_local; -+ -+ /* -+ * If the layer is an open one, we can try the whole machine. 
-+ */ -+ if (layer->open && -+ ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu, -+ idle_cpumask, idle_smtmask)) >= 0)) { -+ lstat_inc(LSTAT_OPEN_IDLE, layer, cctx); -+ goto dispatch_local; -+ } -+ -+ cpu = prev_cpu; -+ goto out_put_cpumasks; -+ -+dispatch_local: -+ tctx->dispatch_local = true; -+out_put_cpumasks: -+ scx_bpf_put_idle_cpumask(idle_smtmask); -+out_put_idle_cpumask: -+ scx_bpf_put_idle_cpumask(idle_cpumask); -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ struct layer *layer; -+ u64 vtime = p->scx.dsq_vtime; -+ u32 idx; -+ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || -+ !(layer = lookup_layer(tctx->layer))) -+ return; -+ -+ if (tctx->dispatch_local) { -+ tctx->dispatch_local = false; -+ lstat_inc(LSTAT_LOCAL, layer, cctx); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ lstat_inc(LSTAT_GLOBAL, layer, cctx); -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, layer->vtime_now - slice_ns)) -+ vtime = layer->vtime_now - slice_ns; -+ -+ if (!tctx->all_cpus_allowed) { -+ lstat_inc(LSTAT_AFFN_VIOL, layer, cctx); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, vtime, enq_flags); -+ -+ if (!layer->preempt) -+ return; -+ -+ bpf_for(idx, 0, nr_possible_cpus) { -+ struct cpu_ctx *cand_cctx; -+ u32 cpu = (preempt_cursor + idx) % nr_possible_cpus; -+ -+ if (!all_cpumask || -+ !bpf_cpumask_test_cpu(cpu, (const struct cpumask *)all_cpumask)) -+ continue; -+ if (!(cand_cctx = lookup_cpu_ctx(cpu)) || cand_cctx->current_preempt) -+ continue; -+ -+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); -+ -+ /* -+ * Round-robining doesn't have to be strict. Let's not bother -+ * with atomic ops on $preempt_cursor. 
-+ */ -+ preempt_cursor = (cpu + 1) % nr_possible_cpus; -+ -+ lstat_inc(LSTAT_PREEMPT, layer, cctx); -+ break; -+ } -+} -+ -+void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ int idx; -+ -+ /* consume preempting layers first */ -+ bpf_for(idx, 0, nr_layers) -+ if (layers[idx].preempt && scx_bpf_consume(idx)) -+ return; -+ -+ /* consume !open layers second */ -+ bpf_for(idx, 0, nr_layers) { -+ struct layer *layer = &layers[idx]; -+ struct cpumask *layer_cpumask; -+ -+ if (layer->open) -+ continue; -+ -+ /* consume matching layers */ -+ if (!(layer_cpumask = lookup_layer_cpumask(idx))) -+ return; -+ -+ if (bpf_cpumask_test_cpu(cpu, layer_cpumask) || -+ (cpu == fallback_cpu && layer->nr_cpus == 0)) { -+ if (scx_bpf_consume(idx)) -+ return; -+ } -+ } -+ -+ /* consume !preempting open layers */ -+ bpf_for(idx, 0, nr_layers) { -+ if (!layers[idx].preempt && layers[idx].open && -+ scx_bpf_consume(idx)) -+ return; -+ } -+} -+ -+static bool match_one(struct layer_match *match, struct task_struct *p, const char *cgrp_path) -+{ -+ switch (match->kind) { -+ case MATCH_CGROUP_PREFIX: { -+ return match_prefix(match->cgroup_prefix, cgrp_path, MAX_PATH); -+ } -+ case MATCH_COMM_PREFIX: { -+ char comm[MAX_COMM]; -+ memcpy(comm, p->comm, MAX_COMM); -+ return match_prefix(match->comm_prefix, comm, MAX_COMM); -+ } -+ case MATCH_NICE_ABOVE: -+ return (s32)p->static_prio - 120 > match->nice_above_or_below; -+ case MATCH_NICE_BELOW: -+ return (s32)p->static_prio - 120 < match->nice_above_or_below; -+ default: -+ scx_bpf_error("invalid match kind %d", match->kind); -+ return false; -+ } -+} -+ -+static bool match_layer(struct layer *layer, struct task_struct *p, const char *cgrp_path) -+{ -+ u32 nr_match_ors = layer->nr_match_ors; -+ u64 or_idx, and_idx; -+ -+ if (nr_match_ors > MAX_LAYER_MATCH_ORS) { -+ scx_bpf_error("too many ORs"); -+ return false; -+ } -+ -+ bpf_for(or_idx, 0, nr_match_ors) { -+ struct layer_match_ands *ands; -+ bool matched = true; -+ -+ barrier_var(or_idx); -+ if (or_idx >= MAX_LAYER_MATCH_ORS) -+ return false; /* can't happen */ -+ ands = &layer->matches[or_idx]; -+ -+ if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) { -+ scx_bpf_error("too many ANDs"); -+ return false; -+ } -+ -+ bpf_for(and_idx, 0, ands->nr_match_ands) { -+ struct layer_match *match; -+ -+ barrier_var(and_idx); -+ if (and_idx >= NR_LAYER_MATCH_KINDS) -+ return false; /* can't happen */ -+ match = &ands->matches[and_idx]; -+ -+ if (!match_one(match, p, cgrp_path)) { -+ matched = false; -+ break; -+ } -+ } -+ -+ if (matched) -+ return true; -+ } -+ -+ return false; -+} -+ -+static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx) -+{ -+ const char *cgrp_path; -+ bool matched = false; -+ u64 idx; // XXX - int makes verifier unhappy -+ -+ if (!tctx->refresh_layer) -+ return; -+ tctx->refresh_layer = false; -+ -+ if (!(cgrp_path = format_cgrp_path(p->cgroups->dfl_cgrp))) -+ return; -+ -+ if (tctx->layer >= 0 && tctx->layer < nr_layers) -+ __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1); -+ -+ bpf_for(idx, 0, nr_layers) { -+ if (match_layer(&layers[idx], p, cgrp_path)) { -+ matched = true; -+ break; -+ } -+ } -+ -+ if (matched) { -+ struct layer *layer = &layers[idx]; -+ -+ tctx->layer = idx; -+ tctx->layer_cpus_seq = layer->cpus_seq - 1; -+ __sync_fetch_and_add(&layer->nr_tasks, 1); -+ /* -+ * XXX - To be correct, we'd need to calculate the vtime -+ * delta in the previous layer, scale it by the load -+ * fraction difference and then offset from the new -+ * layer's 
vtime_now. For now, just do the simple thing -+ * and assume the offset to be zero. -+ * -+ * Revisit if high frequency dynamic layer switching -+ * needs to be supported. -+ */ -+ p->scx.dsq_vtime = layer->vtime_now; -+ } else { -+ scx_bpf_error("[%s]%d didn't match any layer", p->comm, p->pid); -+ } -+ -+ if (tctx->layer < nr_layers - 1) -+ trace("LAYER=%d %s[%d] cgrp=\"%s\"", -+ tctx->layer, p->comm, p->pid, cgrp_path); -+} -+ -+void BPF_STRUCT_OPS(layered_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ struct task_ctx *tctx; -+ -+ if (!(tctx = lookup_task_ctx(p))) -+ return; -+ -+ maybe_refresh_layer(p, tctx); -+ -+ adj_load(tctx->layer, p->scx.weight, now); -+} -+ -+void BPF_STRUCT_OPS(layered_running, struct task_struct *p) -+{ -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ struct layer *layer; -+ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || -+ !(layer = lookup_layer(tctx->layer))) -+ return; -+ -+ if (vtime_before(layer->vtime_now, p->scx.dsq_vtime)) -+ layer->vtime_now = p->scx.dsq_vtime; -+ -+ cctx->current_preempt = layer->preempt; -+ tctx->started_running_at = bpf_ktime_get_ns(); -+} -+ -+void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable) -+{ -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ u64 used; -+ u32 layer; -+ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) -+ return; -+ -+ layer = tctx->layer; -+ if (layer >= nr_layers) { -+ scx_bpf_error("invalid layer %u", layer); -+ return; -+ } -+ -+ used = bpf_ktime_get_ns() - tctx->started_running_at; -+ cctx->layer_cycles[layer] += used; -+ cctx->current_preempt = false; -+ -+ /* scale the execution time by the inverse of the weight and charge */ -+ p->scx.dsq_vtime += used * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(layered_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ struct task_ctx *tctx; -+ -+ if ((tctx = lookup_task_ctx(p))) -+ adj_load(tctx->layer, -(s64)p->scx.weight, bpf_ktime_get_ns()); -+} -+ -+void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight) -+{ -+ struct task_ctx *tctx; -+ -+ if ((tctx = lookup_task_ctx(p))) -+ tctx->refresh_layer = true; -+} -+ -+void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, -+ const struct cpumask *cpumask) -+{ -+ struct task_ctx *tctx; -+ -+ if (!(tctx = lookup_task_ctx(p))) -+ return; -+ -+ if (!all_cpumask) { -+ scx_bpf_error("NULL all_cpumask"); -+ return; -+ } -+ -+ tctx->all_cpus_allowed = -+ bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); -+} -+ -+s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ struct task_ctx tctx_init = { -+ .pid = p->pid, -+ .layer = -1, -+ .refresh_layer = true, -+ }; -+ struct task_ctx *tctx; -+ struct bpf_cpumask *cpumask; -+ s32 pid = p->pid; -+ s32 ret; -+ -+ if (all_cpumask) -+ tctx_init.all_cpus_allowed = -+ bpf_cpumask_subset((const struct cpumask *)all_cpumask, p->cpus_ptr); -+ else -+ scx_bpf_error("missing all_cpumask"); -+ -+ /* -+ * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may -+ * fail spuriously due to BPF recursion protection triggering -+ * unnecessarily. -+ */ -+ if ((ret = bpf_map_update_elem(&task_ctxs, &pid, &tctx_init, 0 /*BPF_NOEXIST*/))) { -+ scx_bpf_error("task_ctx allocation failure, ret=%d", ret); -+ return ret; -+ } -+ -+ /* -+ * Read the entry from the map immediately so we can add the cpumask -+ * with bpf_kptr_xchg(). 
-+ */ -+ if (!(tctx = lookup_task_ctx(p))) -+ return -ENOENT; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) { -+ bpf_map_delete_elem(&task_ctxs, &pid); -+ return -ENOMEM; -+ } -+ -+ cpumask = bpf_kptr_xchg(&tctx->layered_cpumask, cpumask); -+ if (cpumask) { -+ /* Should never happen as we just inserted it above. */ -+ bpf_cpumask_release(cpumask); -+ bpf_map_delete_elem(&task_ctxs, &pid); -+ return -EINVAL; -+ } -+ -+ /* -+ * We are matching cgroup hierarchy path directly rather than the CPU -+ * controller path. As the former isn't available during the scheduler -+ * fork path, let's delay the layer selection until the first -+ * runnable(). -+ */ -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p) -+{ -+ s32 pid = p->pid; -+ -+ bpf_map_delete_elem(&task_ctxs, &pid); -+} -+ -+void BPF_STRUCT_OPS(layered_disable, struct task_struct *p) -+{ -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ s32 pid = p->pid; -+ int ret; -+ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) -+ return; -+ -+ if (tctx->layer >= 0 && tctx->layer < nr_layers) -+ __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1); -+ -+ /* -+ * XXX - There's no reason delete should fail here but BPF's recursion -+ * protection can unnecessarily fail the operation. The fact that -+ * deletions aren't reliable means that we sometimes leak task_ctx and -+ * can't use BPF_NOEXIST on allocation in .prep_enable(). -+ */ -+ ret = bpf_map_delete_elem(&task_ctxs, &pid); -+ if (ret) -+ gstat_inc(GSTAT_TASK_CTX_FREE_FAILED, cctx); -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) -+{ -+ struct bpf_cpumask *cpumask; -+ int i, j, k, nr_online_cpus, ret; -+ -+ scx_bpf_switch_all(); -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ -+ nr_online_cpus = 0; -+ bpf_for(i, 0, nr_possible_cpus) { -+ const volatile u8 *u8_ptr; -+ -+ if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { -+ if (*u8_ptr & (1 << (i % 8))) { -+ bpf_cpumask_set_cpu(i, cpumask); -+ nr_online_cpus++; -+ } -+ } else { -+ return -EINVAL; -+ } -+ } -+ -+ cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ dbg("CFG: Dumping configuration, nr_online_cpus=%d smt_enabled=%d", -+ nr_online_cpus, smt_enabled); -+ -+ bpf_for(i, 0, nr_layers) { -+ struct layer *layer = &layers[i]; -+ -+ dbg("CFG LAYER[%d] open=%d preempt=%d", -+ i, layer->open, layer->preempt); -+ -+ if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) { -+ scx_bpf_error("too many ORs"); -+ return -EINVAL; -+ } -+ -+ bpf_for(j, 0, layer->nr_match_ors) { -+ struct layer_match_ands *ands = MEMBER_VPTR(layers, [i].matches[j]); -+ if (!ands) { -+ scx_bpf_error("shouldn't happen"); -+ return -EINVAL; -+ } -+ -+ if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) { -+ scx_bpf_error("too many ANDs"); -+ return -EINVAL; -+ } -+ -+ dbg("CFG OR[%02d]", j); -+ -+ bpf_for(k, 0, ands->nr_match_ands) { -+ char header[32]; -+ u64 header_data[1] = { k }; -+ struct layer_match *match; -+ -+ bpf_snprintf(header, sizeof(header), "CFG AND[%02d]:", -+ header_data, sizeof(header_data)); -+ -+ match = MEMBER_VPTR(layers, [i].matches[j].matches[k]); -+ if (!match) { -+ scx_bpf_error("shouldn't happen"); -+ return -EINVAL; -+ } -+ -+ switch (match->kind) { -+ case MATCH_CGROUP_PREFIX: -+ dbg("%s CGROUP_PREFIX \"%s\"", header, match->cgroup_prefix); -+ break; -+ case MATCH_COMM_PREFIX: -+ dbg("%s COMM_PREFIX \"%s\"", header, match->comm_prefix); -+ break; -+ case MATCH_NICE_ABOVE: -+ dbg("%s NICE_ABOVE %d", 
header, match->nice_above_or_below); -+ break; -+ case MATCH_NICE_BELOW: -+ dbg("%s NICE_BELOW %d", header, match->nice_above_or_below); -+ break; -+ default: -+ scx_bpf_error("%s Invalid kind", header); -+ return -EINVAL; -+ } -+ } -+ if (ands->nr_match_ands == 0) -+ dbg("CFG DEFAULT"); -+ } -+ } -+ -+ bpf_for(i, 0, nr_layers) { -+ struct layer_cpumask_wrapper *cpumaskw; -+ -+ layers[i].idx = i; -+ -+ ret = scx_bpf_create_dsq(i, -1); -+ if (ret < 0) -+ return ret; -+ -+ if (!(cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &i))) -+ return -ENONET; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ -+ /* -+ * Start all layers with full cpumask so that everything runs -+ * everywhere. This will soon be updated by refresh_cpumasks() -+ * once the scheduler starts running. -+ */ -+ bpf_cpumask_setall(cpumask); -+ -+ cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ } -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(layered_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops layered = { -+ .select_cpu = (void *)layered_select_cpu, -+ .enqueue = (void *)layered_enqueue, -+ .dispatch = (void *)layered_dispatch, -+ .runnable = (void *)layered_runnable, -+ .running = (void *)layered_running, -+ .stopping = (void *)layered_stopping, -+ .quiescent = (void *)layered_quiescent, -+ .set_weight = (void *)layered_set_weight, -+ .set_cpumask = (void *)layered_set_cpumask, -+ .prep_enable = (void *)layered_prep_enable, -+ .cancel_enable = (void *)layered_cancel_enable, -+ .disable = (void *)layered_disable, -+ .init = (void *)layered_init, -+ .exit = (void *)layered_exit, -+ .name = "layered", -+}; -diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.h b/tools/sched_ext/scx_layered/src/bpf/layered.h -new file mode 100644 -index 000000000..bedfa0650 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/bpf/layered.h -@@ -0,0 +1,100 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. 
-+#ifndef __LAYERED_H -+#define __LAYERED_H -+ -+#include -+#ifndef __kptr -+#ifdef __KERNEL__ -+#error "__kptr_ref not defined in the kernel" -+#endif -+#define __kptr -+#endif -+ -+#ifndef __KERNEL__ -+typedef unsigned long long u64; -+typedef long long s64; -+#endif -+ -+#include "../../../ravg.bpf.h" -+ -+enum consts { -+ MAX_CPUS_SHIFT = 9, -+ MAX_CPUS = 1 << MAX_CPUS_SHIFT, -+ MAX_CPUS_U8 = MAX_CPUS / 8, -+ MAX_TASKS = 131072, -+ MAX_PATH = 4096, -+ MAX_COMM = 16, -+ MAX_LAYER_MATCH_ORS = 32, -+ MAX_LAYERS = 16, -+ USAGE_HALF_LIFE = 100000000, /* 100ms */ -+ -+ /* XXX remove */ -+ MAX_CGRP_PREFIXES = 32 -+}; -+ -+/* Statistics */ -+enum global_stat_idx { -+ GSTAT_TASK_CTX_FREE_FAILED, -+ NR_GSTATS, -+}; -+ -+enum layer_stat_idx { -+ LSTAT_LOCAL, -+ LSTAT_GLOBAL, -+ LSTAT_OPEN_IDLE, -+ LSTAT_AFFN_VIOL, -+ LSTAT_PREEMPT, -+ NR_LSTATS, -+}; -+ -+struct cpu_ctx { -+ bool current_preempt; -+ u64 layer_cycles[MAX_LAYERS]; -+ u64 gstats[NR_GSTATS]; -+ u64 lstats[MAX_LAYERS][NR_LSTATS]; -+}; -+ -+enum layer_match_kind { -+ MATCH_CGROUP_PREFIX, -+ MATCH_COMM_PREFIX, -+ MATCH_NICE_ABOVE, -+ MATCH_NICE_BELOW, -+ -+ NR_LAYER_MATCH_KINDS, -+}; -+ -+struct layer_match { -+ int kind; -+ char cgroup_prefix[MAX_PATH]; -+ char comm_prefix[MAX_COMM]; -+ int nice_above_or_below; -+}; -+ -+struct layer_match_ands { -+ struct layer_match matches[NR_LAYER_MATCH_KINDS]; -+ int nr_match_ands; -+}; -+ -+struct layer { -+ struct layer_match_ands matches[MAX_LAYER_MATCH_ORS]; -+ unsigned int nr_match_ors; -+ unsigned int idx; -+ bool open; -+ bool preempt; -+ -+ u64 vtime_now; -+ u64 nr_tasks; -+ -+ u64 load; -+ struct ravg_data load_rd; -+ -+ u64 cpus_seq; -+ unsigned int refresh_cpus; -+ unsigned char cpus[MAX_CPUS_U8]; -+ unsigned int nr_cpus; // managed from BPF side -+}; -+ -+#endif /* __LAYERED_H */ -diff --git a/tools/sched_ext/scx_layered/src/bpf/util.bpf.c b/tools/sched_ext/scx_layered/src/bpf/util.bpf.c -new file mode 100644 -index 000000000..703e0eece ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/bpf/util.bpf.c -@@ -0,0 +1,68 @@ -+/* to be included in the main bpf.c file */ -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ /* double size because verifier can't follow length calculation */ -+ __uint(value_size, 2 * MAX_PATH); -+ __uint(max_entries, 1); -+} cgrp_path_bufs SEC(".maps"); -+ -+static char *format_cgrp_path(struct cgroup *cgrp) -+{ -+ u32 zero = 0; -+ char *path = bpf_map_lookup_elem(&cgrp_path_bufs, &zero); -+ u32 len = 0, level, max_level; -+ -+ if (!path) { -+ scx_bpf_error("cgrp_path_buf lookup failed"); -+ return NULL; -+ } -+ -+ max_level = cgrp->level; -+ if (max_level > 127) -+ max_level = 127; -+ -+ bpf_for(level, 1, max_level + 1) { -+ int ret; -+ -+ if (level > 1 && len < MAX_PATH - 1) -+ path[len++] = '/'; -+ -+ if (len >= MAX_PATH - 1) { -+ scx_bpf_error("cgrp_path_buf overflow"); -+ return NULL; -+ } -+ -+ ret = bpf_probe_read_kernel_str(path + len, MAX_PATH - len - 1, -+ BPF_CORE_READ(cgrp, ancestors[level], kn, name)); -+ if (ret < 0) { -+ scx_bpf_error("bpf_probe_read_kernel_str failed"); -+ return NULL; -+ } -+ -+ len += ret - 1; -+ } -+ -+ if (len >= MAX_PATH - 2) { -+ scx_bpf_error("cgrp_path_buf overflow"); -+ return NULL; -+ } -+ path[len] = '/'; -+ path[len + 1] = '\0'; -+ -+ return path; -+} -+ -+static inline bool match_prefix(const char *prefix, const char *str, u32 max_len) -+{ -+ int c; -+ -+ bpf_for(c, 0, max_len) { -+ if (prefix[c] == '\0') -+ return true; -+ if (str[c] != prefix[c]) -+ return false; -+ } -+ 
return false; -+} -diff --git a/tools/sched_ext/scx_layered/src/layered_sys.rs b/tools/sched_ext/scx_layered/src/layered_sys.rs -new file mode 100644 -index 000000000..afc821d38 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/layered_sys.rs -@@ -0,0 +1,10 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+#![allow(non_upper_case_globals)] -+#![allow(non_camel_case_types)] -+#![allow(non_snake_case)] -+#![allow(dead_code)] -+ -+include!(concat!(env!("OUT_DIR"), "/layered_sys.rs")); -diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs -new file mode 100644 -index 000000000..7eb2edf53 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/main.rs -@@ -0,0 +1,1641 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+#[path = "bpf/.output/layered.skel.rs"] -+mod layered; -+pub use layered::*; -+pub mod layered_sys; -+ -+use std::collections::BTreeMap; -+use std::collections::BTreeSet; -+use std::ffi::CStr; -+use std::ffi::CString; -+use std::fs; -+use std::io::Read; -+use std::io::Write; -+use std::ops::Sub; -+use std::sync::atomic::AtomicBool; -+use std::sync::atomic::Ordering; -+use std::sync::Arc; -+use std::time::Duration; -+use std::time::Instant; -+ -+use ::fb_procfs as procfs; -+use anyhow::anyhow; -+use anyhow::bail; -+use anyhow::Context; -+use anyhow::Result; -+use bitvec::prelude::*; -+use clap::Parser; -+use libbpf_rs::skel::OpenSkel as _; -+use libbpf_rs::skel::Skel as _; -+use libbpf_rs::skel::SkelBuilder as _; -+use log::debug; -+use log::info; -+use log::trace; -+use serde::Deserialize; -+use serde::Serialize; -+ -+const RAVG_FRAC_BITS: u32 = layered_sys::ravg_consts_RAVG_FRAC_BITS; -+const MAX_CPUS: usize = layered_sys::consts_MAX_CPUS as usize; -+const MAX_PATH: usize = layered_sys::consts_MAX_PATH as usize; -+const MAX_COMM: usize = layered_sys::consts_MAX_COMM as usize; -+const MAX_LAYER_MATCH_ORS: usize = layered_sys::consts_MAX_LAYER_MATCH_ORS as usize; -+const MAX_LAYERS: usize = layered_sys::consts_MAX_LAYERS as usize; -+const USAGE_HALF_LIFE: u32 = layered_sys::consts_USAGE_HALF_LIFE; -+const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0; -+const NR_GSTATS: usize = layered_sys::global_stat_idx_NR_GSTATS as usize; -+const NR_LSTATS: usize = layered_sys::layer_stat_idx_NR_LSTATS as usize; -+const NR_LAYER_MATCH_KINDS: usize = layered_sys::layer_match_kind_NR_LAYER_MATCH_KINDS as usize; -+const CORE_CACHE_LEVEL: u32 = 2; -+ -+include!("../../ravg_read.rs.h"); -+ -+lazy_static::lazy_static! { -+ static ref NR_POSSIBLE_CPUS: usize = libbpf_rs::num_possible_cpus().unwrap(); -+ static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64); -+} -+ -+/// scx_layered: A highly configurable multi-layer sched_ext scheduler -+/// -+/// scx_layered allows classifying tasks into multiple layers and applying -+/// different scheduling policies to them. The configuration is specified in -+/// json and composed of two parts - matches and policies. -+/// -+/// Matches -+/// ======= -+/// -+/// Whenever a task is forked or its attributes are changed, the task goes -+/// through a series of matches to determine the layer it belongs to. A -+/// match set is composed of OR groups of AND blocks. 
An example: -+/// -+/// "matches": [ -+/// [ -+/// { -+/// "CgroupPrefix": "system.slice/" -+/// } -+/// ], -+/// [ -+/// { -+/// "CommPrefix": "fbagent" -+/// }, -+/// { -+/// "NiceAbove": 0 -+/// } -+/// ] -+/// ], -+/// -+/// The outer array contains the OR groups and the inner AND blocks, so the -+/// above matches: -+/// -+/// * Tasks which are in the cgroup sub-hierarchy under "system.slice". -+/// * Or tasks whose comm starts with "fbagent" and have a nice value > 0. -+/// -+/// Currently, the following matches are supported: -+/// -+/// * CgroupPrefix: Matches the prefix of the cgroup that the task belongs -+/// to. As this is a string match, whether the pattern has the trailing -+/// '/' makes a difference. For example, "TOP/CHILD/" only matches tasks -+/// which are under that particular cgroup while "TOP/CHILD" also matches -+/// tasks under "TOP/CHILD0/" or "TOP/CHILD1/". -+/// -+/// * CommPrefix: Matches the task's comm prefix. -+/// -+/// * NiceAbove: Matches if the task's nice value is greater than the -+/// pattern. -+/// -+/// * NiceBelow: Matches if the task's nice value is smaller than the -+/// pattern. -+/// -+/// While there are complexity limitations as the matches are performed in -+/// BPF, it is straightforward to add more types of matches. -+/// -+/// Policies -+/// ======== -+/// -+/// The following is an example policy configuration for a layer. -+/// -+/// "kind": { -+/// "Confined": { -+/// "cpus_range": [1, 8], -+/// "util_range": [0.8, 0.9] -+/// } -+/// } -+/// -+/// It's of "Confined" kind, which tries to concentrate the layer's tasks -+/// into a limited number of CPUs. In the above case, the number of CPUs -+/// assigned to the layer is scaled between 1 and 8 so that the per-cpu -+/// utilization is kept between 80% and 90%. If the CPUs are loaded higher -+/// than 90%, more CPUs are allocated to the layer. If the utilization drops -+/// below 80%, the layer loses CPUs. -+/// -+/// Currently, the following policy kinds are supported: -+/// -+/// * Confined: Tasks are restricted to the allocated CPUs. The number of -+/// CPUs allocated is modulated to keep the per-CPU utilization in -+/// "util_range". The range can optionally be restricted with the -+/// "cpus_range" property. -+/// -+/// * Grouped: Similar to Confined but tasks may spill outside if there are -+/// idle CPUs outside the allocated ones. If "preempt" is true, tasks in -+/// this layer will preempt tasks which belong to other non-preempting -+/// layers when no idle CPUs are available. -+/// -+/// * Open: Prefer the CPUs which are not occupied by Confined or Grouped -+/// layers. Tasks in this group will spill into occupied CPUs if there are -+/// no unoccupied idle CPUs. If "preempt" is true, tasks in this layer -+/// will preempt tasks which belong to other non-preempting layers when no -+/// idle CPUs are available. -+/// -+/// Similar to matches, adding new policies and extending existing ones -+/// should be relatively straightforward. -+/// -+/// Configuration example and running scx_layered -+/// ============================================= -+/// -+/// A scx_layered config is composed of layer configs and a layer config is -+/// composed of a name, a set of matches and a policy block. Running the -+/// following will write an example configuration into example.json. -+/// -+/// $ scx_layered -e example.json -+/// -+/// Note that the last layer in the configuration must have an empty match -+/// set as it must match all tasks which haven't been matched into previous -+/// layers. 
-+/// -+/// The configuration can be specified in multiple json files and command -+/// line arguments. Each must contain valid layer configurations and they're -+/// concatenated in the specified order. In most cases, something like the -+/// following should do. -+/// -+/// $ scx_layered file:example.json -+/// -+/// Statistics -+/// ========== -+/// -+/// scx_layered will print out a set of statistics every monitoring -+/// interval. -+/// -+/// tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 tctx_err=9 proc=6ms -+/// busy= 34.2 util= 1733.6 load= 21744.1 fallback_cpu= 1 -+/// batch : util/frac= 11.8/ 0.7 load/frac= 29.7: 0.1 tasks= 2597 -+/// tot= 3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 -+/// cpus= 2 [ 2, 2] 04000001 00000000 -+/// immediate: util/frac= 1218.8/ 70.3 load/frac= 21399.9: 98.4 tasks= 1107 -+/// tot= 68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00 -+/// cpus= 50 [ 50, 50] fbfffffe 000fffff -+/// normal : util/frac= 502.9/ 29.0 load/frac= 314.5: 1.4 tasks= 3512 -+/// tot= 45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56 -+/// cpus= 50 [ 50, 50] fbfffffe 000fffff -+/// -+/// Global statistics: -+/// -+/// - tot: Total scheduling events in the period. -+/// -+/// - local: % that got scheduled directly into an idle CPU. -+/// -+/// - open_idle: % of open layer tasks scheduled into occupied idle CPUs. -+/// -+/// - affn_viol: % which violated configured policies due to CPU affinity -+/// restrictions. -+/// -+/// - proc: CPU time this binary consumed during the period. -+/// -+/// - busy: CPU busy % (100% means all CPUs were fully occupied) -+/// -+/// - util: CPU utilization % (100% means one CPU was fully occupied) -+/// -+/// - load: Sum of weight * duty_cycle for all tasks -+/// -+/// Per-layer statistics: -+/// -+/// - util/frac: CPU utilization and fraction % (sum of fractions across -+/// layers is always 100%). -+/// -+/// - load/frac: Load sum and fraction %. -+/// -+/// - tasks: Number of tasks. -+/// -+/// - tot: Total scheduling events. -+/// -+/// - open_idle: % of tasks scheduled into idle CPUs occupied by other layers. -+/// -+/// - preempt: % of tasks that preempted other tasks. -+/// -+/// - affn_viol: % which violated configured policies due to CPU affinity -+/// restrictions. -+/// -+/// - cpus: CUR_NR_CPUS [MIN_NR_CPUS, MAX_NR_CPUS] CUR_CPU_MASK -+/// -+#[derive(Debug, Parser)] -+#[command(verbatim_doc_comment)] -+struct Opts { -+ /// Scheduling slice duration in microseconds. -+ #[clap(short = 's', long, default_value = "20000")] -+ slice_us: u64, -+ -+ /// Scheduling interval in seconds. -+ #[clap(short = 'i', long, default_value = "0.1")] -+ interval: f64, -+ -+ /// Monitoring interval in seconds. -+ #[clap(short = 'm', long, default_value = "2.0")] -+ monitor: f64, -+ -+ /// Disable load-fraction based max layer CPU limit. ***NOTE*** -+ /// load-fraction calculation is currently broken due to lack of -+ /// infeasible weight adjustments. Setting this option is recommended. -+ #[clap(short = 'n', long)] -+ no_load_frac_limit: bool, -+ -+ /// Enable verbose output including libbpf details. Specify multiple -+ /// times to increase verbosity. -+ #[clap(short = 'v', long, action = clap::ArgAction::Count)] -+ verbose: u8, -+ -+ /// Write example layer specifications into the file and exit. -+ #[clap(short = 'e', long)] -+ example: Option, -+ -+ /// Layer specification. See --help. 
-+ specs: Vec, -+} -+ -+#[derive(Clone, Debug, Serialize, Deserialize)] -+enum LayerMatch { -+ CgroupPrefix(String), -+ CommPrefix(String), -+ NiceAbove(i32), -+ NiceBelow(i32), -+} -+ -+#[derive(Clone, Debug, Serialize, Deserialize)] -+enum LayerKind { -+ Confined { -+ cpus_range: Option<(usize, usize)>, -+ util_range: (f64, f64), -+ }, -+ Grouped { -+ cpus_range: Option<(usize, usize)>, -+ util_range: (f64, f64), -+ preempt: bool, -+ }, -+ Open { -+ preempt: bool, -+ }, -+} -+ -+#[derive(Clone, Debug, Serialize, Deserialize)] -+struct LayerSpec { -+ name: String, -+ comment: Option, -+ matches: Vec>, -+ kind: LayerKind, -+} -+ -+impl LayerSpec { -+ fn parse(input: &str) -> Result> { -+ let config: LayerConfig = if input.starts_with("f:") || input.starts_with("file:") { -+ let mut f = fs::OpenOptions::new() -+ .read(true) -+ .open(input.split_once(':').unwrap().1)?; -+ let mut content = String::new(); -+ f.read_to_string(&mut content)?; -+ serde_json::from_str(&content)? -+ } else { -+ serde_json::from_str(input)? -+ }; -+ Ok(config.specs) -+ } -+} -+ -+#[derive(Clone, Debug, Serialize, Deserialize)] -+#[serde(transparent)] -+struct LayerConfig { -+ specs: Vec, -+} -+ -+fn now_monotonic() -> u64 { -+ let mut time = libc::timespec { -+ tv_sec: 0, -+ tv_nsec: 0, -+ }; -+ let ret = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut time) }; -+ assert!(ret == 0); -+ time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 -+} -+ -+fn read_total_cpu(reader: &procfs::ProcReader) -> Result { -+ reader -+ .read_stat() -+ .context("Failed to read procfs")? -+ .total_cpu -+ .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) -+} -+ -+fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { -+ match (curr, prev) { -+ ( -+ procfs::CpuStat { -+ user_usec: Some(curr_user), -+ nice_usec: Some(curr_nice), -+ system_usec: Some(curr_system), -+ idle_usec: Some(curr_idle), -+ iowait_usec: Some(curr_iowait), -+ irq_usec: Some(curr_irq), -+ softirq_usec: Some(curr_softirq), -+ stolen_usec: Some(curr_stolen), -+ .. -+ }, -+ procfs::CpuStat { -+ user_usec: Some(prev_user), -+ nice_usec: Some(prev_nice), -+ system_usec: Some(prev_system), -+ idle_usec: Some(prev_idle), -+ iowait_usec: Some(prev_iowait), -+ irq_usec: Some(prev_irq), -+ softirq_usec: Some(prev_softirq), -+ stolen_usec: Some(prev_stolen), -+ .. 
-+ }, -+ ) => { -+ let idle_usec = curr_idle - prev_idle; -+ let iowait_usec = curr_iowait - prev_iowait; -+ let user_usec = curr_user - prev_user; -+ let system_usec = curr_system - prev_system; -+ let nice_usec = curr_nice - prev_nice; -+ let irq_usec = curr_irq - prev_irq; -+ let softirq_usec = curr_softirq - prev_softirq; -+ let stolen_usec = curr_stolen - prev_stolen; -+ -+ let busy_usec = -+ user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; -+ let total_usec = idle_usec + busy_usec + iowait_usec; -+ if total_usec > 0 { -+ Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)) -+ } else { -+ Ok(1.0) -+ } -+ } -+ _ => { -+ bail!("Missing stats in cpustat"); -+ } -+ } -+} -+ -+fn copy_into_cstr(dst: &mut [i8], src: &str) { -+ let cstr = CString::new(src).unwrap(); -+ let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) }; -+ dst[0..bytes.len()].copy_from_slice(bytes); -+} -+ -+fn format_bitvec(bitvec: &BitVec) -> String { -+ let mut vals = Vec::::new(); -+ let mut val: u32 = 0; -+ for (idx, bit) in bitvec.iter().enumerate() { -+ if idx > 0 && idx % 32 == 0 { -+ vals.push(val); -+ val = 0; -+ } -+ if *bit { -+ val |= 1 << (idx % 32); -+ } -+ } -+ vals.push(val); -+ let mut output = vals -+ .iter() -+ .fold(String::new(), |string, v| format!("{}{:08x} ", string, v)); -+ output.pop(); -+ output -+} -+ -+fn read_cpu_ctxs(skel: &LayeredSkel) -> Result> { -+ let mut cpu_ctxs = vec![]; -+ let cpu_ctxs_vec = skel -+ .maps() -+ .cpu_ctxs() -+ .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY) -+ .context("Failed to lookup cpu_ctx")? -+ .unwrap(); -+ for cpu in 0..*NR_POSSIBLE_CPUS { -+ cpu_ctxs.push(*unsafe { -+ &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const layered_sys::cpu_ctx) -+ }); -+ } -+ Ok(cpu_ctxs) -+} -+ -+#[derive(Clone, Debug)] -+struct BpfStats { -+ gstats: Vec, -+ lstats: Vec>, -+ lstats_sums: Vec, -+} -+ -+impl BpfStats { -+ fn read(cpu_ctxs: &[layered_sys::cpu_ctx], nr_layers: usize) -> Self { -+ let mut gstats = vec![0u64; NR_GSTATS]; -+ let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers]; -+ -+ for cpu in 0..*NR_POSSIBLE_CPUS { -+ for stat in 0..NR_GSTATS { -+ gstats[stat] += cpu_ctxs[cpu].gstats[stat]; -+ } -+ for layer in 0..nr_layers { -+ for stat in 0..NR_LSTATS { -+ lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat]; -+ } -+ } -+ } -+ -+ let mut lstats_sums = vec![0u64; NR_LSTATS]; -+ for layer in 0..nr_layers { -+ for stat in 0..NR_LSTATS { -+ lstats_sums[stat] += lstats[layer][stat]; -+ } -+ } -+ -+ Self { -+ gstats, -+ lstats, -+ lstats_sums, -+ } -+ } -+} -+ -+impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats { -+ type Output = BpfStats; -+ -+ fn sub(self, rhs: &'b BpfStats) -> BpfStats { -+ let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect(); -+ BpfStats { -+ gstats: vec_sub(&self.gstats, &rhs.gstats), -+ lstats: self -+ .lstats -+ .iter() -+ .zip(rhs.lstats.iter()) -+ .map(|(l, r)| vec_sub(l, r)) -+ .collect(), -+ lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums), -+ } -+ } -+} -+ -+struct Stats { -+ nr_layers: usize, -+ at: Instant, -+ -+ nr_layer_tasks: Vec, -+ -+ total_load: f64, -+ layer_loads: Vec, -+ -+ total_util: f64, // Running AVG of sum of layer_utils -+ layer_utils: Vec, -+ prev_layer_cycles: Vec, -+ -+ cpu_busy: f64, // Read from /proc, maybe higher than total_util -+ prev_total_cpu: procfs::CpuStat, -+ -+ bpf_stats: BpfStats, -+ prev_bpf_stats: BpfStats, -+} -+ -+impl Stats { -+ fn read_layer_loads(skel: &mut 
LayeredSkel, nr_layers: usize) -> (f64, Vec) { -+ let now_mono = now_monotonic(); -+ let layer_loads: Vec = skel -+ .bss() -+ .layers -+ .iter() -+ .take(nr_layers) -+ .map(|layer| { -+ let rd = &layer.load_rd; -+ ravg_read( -+ rd.val, -+ rd.val_at, -+ rd.old, -+ rd.cur, -+ now_mono, -+ USAGE_HALF_LIFE, -+ RAVG_FRAC_BITS, -+ ) -+ }) -+ .collect(); -+ (layer_loads.iter().sum(), layer_loads) -+ } -+ -+ fn read_layer_cycles(cpu_ctxs: &[layered_sys::cpu_ctx], nr_layers: usize) -> Vec { -+ let mut layer_cycles = vec![0u64; nr_layers]; -+ -+ for cpu in 0..*NR_POSSIBLE_CPUS { -+ for layer in 0..nr_layers { -+ layer_cycles[layer] += cpu_ctxs[cpu].layer_cycles[layer]; -+ } -+ } -+ -+ layer_cycles -+ } -+ -+ fn new(skel: &mut LayeredSkel, proc_reader: &procfs::ProcReader) -> Result { -+ let nr_layers = skel.rodata().nr_layers as usize; -+ let bpf_stats = BpfStats::read(&read_cpu_ctxs(skel)?, nr_layers); -+ -+ Ok(Self { -+ at: Instant::now(), -+ nr_layers, -+ -+ nr_layer_tasks: vec![0; nr_layers], -+ -+ total_load: 0.0, -+ layer_loads: vec![0.0; nr_layers], -+ -+ total_util: 0.0, -+ layer_utils: vec![0.0; nr_layers], -+ prev_layer_cycles: vec![0; nr_layers], -+ -+ cpu_busy: 0.0, -+ prev_total_cpu: read_total_cpu(&proc_reader)?, -+ -+ bpf_stats: bpf_stats.clone(), -+ prev_bpf_stats: bpf_stats, -+ }) -+ } -+ -+ fn refresh( -+ &mut self, -+ skel: &mut LayeredSkel, -+ proc_reader: &procfs::ProcReader, -+ now: Instant, -+ ) -> Result<()> { -+ let elapsed = now.duration_since(self.at).as_secs_f64() as f64; -+ let cpu_ctxs = read_cpu_ctxs(skel)?; -+ -+ let nr_layer_tasks: Vec = skel -+ .bss() -+ .layers -+ .iter() -+ .take(self.nr_layers) -+ .map(|layer| layer.nr_tasks as usize) -+ .collect(); -+ -+ let (total_load, layer_loads) = Self::read_layer_loads(skel, self.nr_layers); -+ -+ let cur_layer_cycles = Self::read_layer_cycles(&cpu_ctxs, self.nr_layers); -+ let cur_layer_utils: Vec = cur_layer_cycles -+ .iter() -+ .zip(self.prev_layer_cycles.iter()) -+ .map(|(cur, prev)| (cur - prev) as f64 / 1_000_000_000.0 / elapsed) -+ .collect(); -+ let layer_utils: Vec = cur_layer_utils -+ .iter() -+ .zip(self.layer_utils.iter()) -+ .map(|(cur, prev)| { -+ let decay = USAGE_DECAY.powf(elapsed); -+ prev * decay + cur * (1.0 - decay) -+ }) -+ .collect(); -+ -+ let cur_total_cpu = read_total_cpu(proc_reader)?; -+ let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?; -+ -+ let cur_bpf_stats = BpfStats::read(&cpu_ctxs, self.nr_layers); -+ let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats; -+ -+ *self = Self { -+ at: now, -+ nr_layers: self.nr_layers, -+ -+ nr_layer_tasks, -+ -+ total_load, -+ layer_loads, -+ -+ total_util: layer_utils.iter().sum(), -+ layer_utils: layer_utils.try_into().unwrap(), -+ prev_layer_cycles: cur_layer_cycles, -+ -+ cpu_busy, -+ prev_total_cpu: cur_total_cpu, -+ -+ bpf_stats, -+ prev_bpf_stats: cur_bpf_stats, -+ }; -+ Ok(()) -+ } -+} -+ -+#[derive(Debug, Default)] -+struct UserExitInfo { -+ kind: i32, -+ reason: Option, -+ msg: Option, -+} -+ -+impl UserExitInfo { -+ fn read(bpf_uei: &layered_bss_types::user_exit_info) -> Result { -+ let kind = unsafe { std::ptr::read_volatile(&bpf_uei.kind as *const _) }; -+ -+ let (reason, msg) = if kind != 0 { -+ ( -+ Some( -+ unsafe { CStr::from_ptr(bpf_uei.reason.as_ptr() as *const _) } -+ .to_str() -+ .context("Failed to convert reason to string")? -+ .to_string(), -+ ) -+ .filter(|s| !s.is_empty()), -+ Some( -+ unsafe { CStr::from_ptr(bpf_uei.msg.as_ptr() as *const _) } -+ .to_str() -+ .context("Failed to convert msg to string")? 
-+ .to_string(), -+ ) -+ .filter(|s| !s.is_empty()), -+ ) -+ } else { -+ (None, None) -+ }; -+ -+ Ok(Self { kind, reason, msg }) -+ } -+ -+ fn exited(bpf_uei: &layered_bss_types::user_exit_info) -> Result { -+ Ok(Self::read(bpf_uei)?.kind != 0) -+ } -+ -+ fn report(&self) -> Result<()> { -+ let why = match (&self.reason, &self.msg) { -+ (Some(reason), None) => format!("{}", reason), -+ (Some(reason), Some(msg)) => format!("{} ({})", reason, msg), -+ _ => "".into(), -+ }; -+ -+ match self.kind { -+ 0 => Ok(()), -+ etype => { -+ if etype != 64 { -+ bail!("EXIT: kind={} {}", etype, why); -+ } else { -+ info!("EXIT: {}", why); -+ Ok(()) -+ } -+ } -+ } -+ } -+} -+ -+#[derive(Debug)] -+struct CpuPool { -+ nr_cores: usize, -+ nr_cpus: usize, -+ all_cpus: BitVec, -+ core_cpus: Vec, -+ cpu_core: Vec, -+ available_cores: BitVec, -+ first_cpu: usize, -+ fallback_cpu: usize, // next free or the first CPU if none is free -+} -+ -+impl CpuPool { -+ fn new() -> Result { -+ if *NR_POSSIBLE_CPUS > MAX_CPUS { -+ bail!( -+ "NR_POSSIBLE_CPUS {} > MAX_CPUS {}", -+ *NR_POSSIBLE_CPUS, -+ MAX_CPUS -+ ); -+ } -+ -+ let mut cpu_to_cache = vec![]; // (cpu_id, Option) -+ let mut cache_ids = BTreeSet::::new(); -+ let mut nr_offline = 0; -+ -+ // Build cpu -> cache ID mapping. -+ for cpu in 0..*NR_POSSIBLE_CPUS { -+ let path = format!( -+ "/sys/devices/system/cpu/cpu{}/cache/index{}/id", -+ cpu, CORE_CACHE_LEVEL -+ ); -+ let id = match std::fs::read_to_string(&path) { -+ Ok(val) => Some(val.trim().parse::().with_context(|| { -+ format!("Failed to parse {:?}'s content {:?}", &path, &val) -+ })?), -+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => { -+ nr_offline += 1; -+ None -+ } -+ Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), -+ }; -+ -+ cpu_to_cache.push(id); -+ if let Some(id) = id { -+ cache_ids.insert(id); -+ } -+ } -+ -+ let nr_cpus = *NR_POSSIBLE_CPUS - nr_offline; -+ -+ // Cache IDs may have holes. Assign consecutive core IDs to existing -+ // cache IDs. -+ let mut cache_to_core = BTreeMap::::new(); -+ let mut nr_cores = 0; -+ for cache_id in cache_ids.iter() { -+ cache_to_core.insert(*cache_id, nr_cores); -+ nr_cores += 1; -+ } -+ -+ // Build core -> cpumask and cpu -> core mappings. 
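Worth pausing on the cache-ID pass a few lines up: the IDs read from /sys/devices/system/cpu/cpuN/cache/indexM/id can be sparse and non-contiguous, so CpuPool::new() renumbers the distinct IDs it finds into consecutive core indices before sizing any per-core bitmaps. Below is a minimal, self-contained Rust sketch of just that renumbering step; the sample IDs are made up for illustration and the patch itself goes through a BTreeSet before the BTreeMap, which this sketch collapses into one pass.

use std::collections::BTreeMap;

fn main() {
    // Hypothetical per-CPU cache IDs as sysfs might report them: sparse,
    // with one offline CPU (None) that has no id file at all.
    let cpu_to_cache: Vec<Option<usize>> = vec![Some(0), Some(0), Some(8), Some(8), None, Some(16)];

    // Renumber distinct cache IDs into consecutive core indices 0..nr_cores.
    let mut cache_to_core: BTreeMap<usize, usize> = BTreeMap::new();
    for id in cpu_to_cache.iter().flatten() {
        let next = cache_to_core.len();
        cache_to_core.entry(*id).or_insert(next);
    }

    for (cpu, id) in cpu_to_cache.iter().enumerate() {
        match id {
            Some(id) => println!("cpu{} -> core{}", cpu, cache_to_core[id]),
            None => println!("cpu{} -> offline (no cache id)", cpu),
        }
    }
}

Because BTreeMap iterates in sorted key order, the renumbering is stable with respect to cache ID, matching the ordered walk over the cache-ID set in the patch.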
-+ let mut all_cpus = bitvec![0; *NR_POSSIBLE_CPUS]; -+ let mut core_cpus = vec![bitvec![0; *NR_POSSIBLE_CPUS]; nr_cores]; -+ let mut cpu_core = vec![]; -+ -+ for (cpu, cache) in cpu_to_cache.iter().enumerate().take(*NR_POSSIBLE_CPUS) { -+ if let Some(cache_id) = cache { -+ let core_id = cache_to_core[cache_id]; -+ all_cpus.set(cpu, true); -+ core_cpus[core_id].set(cpu, true); -+ cpu_core.push(core_id); -+ } -+ } -+ -+ info!( -+ "CPUs: online/possible={}/{} nr_cores={}", -+ nr_cpus, *NR_POSSIBLE_CPUS, nr_cores, -+ ); -+ -+ let first_cpu = core_cpus[0].first_one().unwrap(); -+ -+ let mut cpu_pool = Self { -+ nr_cores, -+ nr_cpus, -+ all_cpus, -+ core_cpus, -+ cpu_core, -+ available_cores: bitvec![1; nr_cores], -+ first_cpu, -+ fallback_cpu: first_cpu, -+ }; -+ cpu_pool.update_fallback_cpu(); -+ Ok(cpu_pool) -+ } -+ -+ fn update_fallback_cpu(&mut self) { -+ match self.available_cores.first_one() { -+ Some(next) => self.fallback_cpu = self.core_cpus[next].first_one().unwrap(), -+ None => self.fallback_cpu = self.first_cpu, -+ } -+ } -+ -+ fn alloc<'a>(&'a mut self) -> Option<&'a BitVec> { -+ let core = self.available_cores.first_one()?; -+ self.available_cores.set(core, false); -+ self.update_fallback_cpu(); -+ Some(&self.core_cpus[core]) -+ } -+ -+ fn cpus_to_cores(&self, cpus_to_match: &BitVec) -> Result { -+ let mut cpus = cpus_to_match.clone(); -+ let mut cores = bitvec![0; self.nr_cores]; -+ -+ while let Some(cpu) = cpus.first_one() { -+ let core = self.cpu_core[cpu]; -+ -+ if (self.core_cpus[core].clone() & !cpus.clone()).count_ones() != 0 { -+ bail!( -+ "CPUs {} partially intersect with core {} ({})", -+ cpus_to_match, -+ core, -+ self.core_cpus[core], -+ ); -+ } -+ -+ cpus &= !self.core_cpus[core].clone(); -+ cores.set(core, true); -+ } -+ -+ Ok(cores) -+ } -+ -+ fn free<'a>(&'a mut self, cpus_to_free: &BitVec) -> Result<()> { -+ let cores = self.cpus_to_cores(cpus_to_free)?; -+ if (self.available_cores.clone() & &cores).any() { -+ bail!("Some of CPUs {} are already free", cpus_to_free); -+ } -+ self.available_cores |= cores; -+ self.update_fallback_cpu(); -+ Ok(()) -+ } -+ -+ fn next_to_free<'a>(&'a self, cands: &BitVec) -> Result> { -+ let last = match cands.last_one() { -+ Some(ret) => ret, -+ None => return Ok(None), -+ }; -+ let core = self.cpu_core[last]; -+ if (self.core_cpus[core].clone() & !cands.clone()).count_ones() != 0 { -+ bail!( -+ "CPUs{} partially intersect with core {} ({})", -+ cands, -+ core, -+ self.core_cpus[core] -+ ); -+ } -+ -+ Ok(Some(&self.core_cpus[core])) -+ } -+ -+ fn available_cpus(&self) -> BitVec { -+ let mut cpus = bitvec![0; self.nr_cpus]; -+ for core in self.available_cores.iter_ones() { -+ cpus |= &self.core_cpus[core]; -+ } -+ cpus -+ } -+} -+ -+#[derive(Debug)] -+struct Layer { -+ name: String, -+ kind: LayerKind, -+ -+ nr_cpus: usize, -+ cpus: BitVec, -+} -+ -+impl Layer { -+ fn new(cpu_pool: &mut CpuPool, name: &str, kind: LayerKind) -> Result { -+ match &kind { -+ LayerKind::Confined { -+ cpus_range, -+ util_range, -+ } => { -+ let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); -+ if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 { -+ bail!("invalid cpus_range {:?}", cpus_range); -+ } -+ if util_range.0 < 0.0 -+ || util_range.0 > 1.0 -+ || util_range.1 < 0.0 -+ || util_range.1 > 1.0 -+ || util_range.0 >= util_range.1 -+ { -+ bail!("invalid util_range {:?}", util_range); -+ } -+ } -+ _ => {} -+ } -+ -+ let nr_cpus = cpu_pool.nr_cpus; -+ -+ let mut layer = Self { -+ name: name.into(), -+ kind, -+ -+ nr_cpus: 0, -+ cpus: bitvec![0; 
nr_cpus], -+ }; -+ -+ match &layer.kind { -+ LayerKind::Confined { -+ cpus_range, -+ util_range, -+ } -+ | LayerKind::Grouped { -+ cpus_range, -+ util_range, -+ .. -+ } => { -+ layer.resize_confined_or_grouped( -+ cpu_pool, -+ *cpus_range, -+ *util_range, -+ (0.0, 0.0), -+ (0.0, 0.0), -+ false, -+ )?; -+ } -+ _ => {} -+ } -+ -+ Ok(layer) -+ } -+ -+ fn grow_confined_or_grouped( -+ &mut self, -+ cpu_pool: &mut CpuPool, -+ (cpus_min, cpus_max): (usize, usize), -+ (_util_low, util_high): (f64, f64), -+ (layer_load, total_load): (f64, f64), -+ (layer_util, _total_util): (f64, f64), -+ no_load_frac_limit: bool, -+ ) -> Result { -+ if self.nr_cpus >= cpus_max { -+ return Ok(false); -+ } -+ -+ // Do we already have enough? -+ if self.nr_cpus >= cpus_min -+ && (layer_util == 0.0 -+ || (self.nr_cpus > 0 && layer_util / self.nr_cpus as f64 <= util_high)) -+ { -+ return Ok(false); -+ } -+ -+ // Can't have more CPUs than our load fraction. -+ if !no_load_frac_limit -+ && self.nr_cpus >= cpus_min -+ && (total_load >= 0.0 -+ && self.nr_cpus as f64 / cpu_pool.nr_cpus as f64 >= layer_load / total_load) -+ { -+ trace!( -+ "layer-{} needs more CPUs (util={:.3}) but is over the load fraction", -+ &self.name, -+ layer_util -+ ); -+ return Ok(false); -+ } -+ -+ let new_cpus = match cpu_pool.alloc().clone() { -+ Some(ret) => ret.clone(), -+ None => { -+ trace!("layer-{} can't grow, no CPUs", &self.name); -+ return Ok(false); -+ } -+ }; -+ -+ trace!( -+ "layer-{} adding {} CPUs to {} CPUs", -+ &self.name, -+ new_cpus.count_ones(), -+ self.nr_cpus -+ ); -+ -+ self.nr_cpus += new_cpus.count_ones(); -+ self.cpus |= &new_cpus; -+ Ok(true) -+ } -+ -+ fn cpus_to_free( -+ &self, -+ cpu_pool: &mut CpuPool, -+ (cpus_min, _cpus_max): (usize, usize), -+ (util_low, util_high): (f64, f64), -+ (layer_load, total_load): (f64, f64), -+ (layer_util, _total_util): (f64, f64), -+ no_load_frac_limit: bool, -+ ) -> Result> { -+ if self.nr_cpus <= cpus_min { -+ return Ok(None); -+ } -+ -+ let cpus_to_free = match cpu_pool.next_to_free(&self.cpus)? { -+ Some(ret) => ret.clone(), -+ None => return Ok(None), -+ }; -+ -+ let nr_to_free = cpus_to_free.count_ones(); -+ -+ // If we'd be over the load fraction even after freeing -+ // $cpus_to_free, we have to free. -+ if !no_load_frac_limit -+ && total_load >= 0.0 -+ && (self.nr_cpus - nr_to_free) as f64 / cpu_pool.nr_cpus as f64 -+ >= layer_load / total_load -+ { -+ return Ok(Some(cpus_to_free)); -+ } -+ -+ if layer_util / self.nr_cpus as f64 >= util_low { -+ return Ok(None); -+ } -+ -+ // Can't shrink if losing the CPUs pushes us over @util_high. -+ match self.nr_cpus - nr_to_free { -+ 0 => { -+ if layer_util > 0.0 { -+ return Ok(None); -+ } -+ } -+ nr_left => { -+ if layer_util / nr_left as f64 >= util_high { -+ return Ok(None); -+ } -+ } -+ } -+ -+ return Ok(Some(cpus_to_free)); -+ } -+ -+ fn shrink_confined_or_grouped( -+ &mut self, -+ cpu_pool: &mut CpuPool, -+ cpus_range: (usize, usize), -+ util_range: (f64, f64), -+ load: (f64, f64), -+ util: (f64, f64), -+ no_load_frac_limit: bool, -+ ) -> Result { -+ match self.cpus_to_free( -+ cpu_pool, -+ cpus_range, -+ util_range, -+ load, -+ util, -+ no_load_frac_limit, -+ )? 
{ -+ Some(cpus_to_free) => { -+ trace!("freeing CPUs {}", &cpus_to_free); -+ self.nr_cpus -= cpus_to_free.count_ones(); -+ self.cpus &= !cpus_to_free.clone(); -+ cpu_pool.free(&cpus_to_free)?; -+ Ok(true) -+ } -+ None => Ok(false), -+ } -+ } -+ -+ fn resize_confined_or_grouped( -+ &mut self, -+ cpu_pool: &mut CpuPool, -+ cpus_range: Option<(usize, usize)>, -+ util_range: (f64, f64), -+ load: (f64, f64), -+ util: (f64, f64), -+ no_load_frac_limit: bool, -+ ) -> Result { -+ let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); -+ let mut adjusted = 0; -+ -+ while self.grow_confined_or_grouped( -+ cpu_pool, -+ cpus_range, -+ util_range, -+ load, -+ util, -+ no_load_frac_limit, -+ )? { -+ adjusted += 1; -+ trace!("{} grew, adjusted={}", &self.name, adjusted); -+ } -+ -+ if adjusted == 0 { -+ while self.shrink_confined_or_grouped( -+ cpu_pool, -+ cpus_range, -+ util_range, -+ load, -+ util, -+ no_load_frac_limit, -+ )? { -+ adjusted -= 1; -+ trace!("{} shrunk, adjusted={}", &self.name, adjusted); -+ } -+ } -+ -+ if adjusted != 0 { -+ trace!("{} done resizing, adjusted={}", &self.name, adjusted); -+ } -+ Ok(adjusted) -+ } -+} -+ -+struct Scheduler<'a> { -+ skel: LayeredSkel<'a>, -+ struct_ops: Option, -+ layer_specs: Vec, -+ -+ sched_intv: Duration, -+ monitor_intv: Duration, -+ no_load_frac_limit: bool, -+ -+ cpu_pool: CpuPool, -+ layers: Vec, -+ -+ proc_reader: procfs::ProcReader, -+ sched_stats: Stats, -+ report_stats: Stats, -+ -+ nr_layer_cpus_min_max: Vec<(usize, usize)>, -+ processing_dur: Duration, -+ prev_processing_dur: Duration, -+} -+ -+impl<'a> Scheduler<'a> { -+ fn init_layers(skel: &mut OpenLayeredSkel, specs: &Vec) -> Result<()> { -+ skel.rodata().nr_layers = specs.len() as u32; -+ -+ for (spec_i, spec) in specs.iter().enumerate() { -+ let layer = &mut skel.bss().layers[spec_i]; -+ -+ for (or_i, or) in spec.matches.iter().enumerate() { -+ for (and_i, and) in or.iter().enumerate() { -+ let mt = &mut layer.matches[or_i].matches[and_i]; -+ match and { -+ LayerMatch::CgroupPrefix(prefix) => { -+ mt.kind = layered_sys::layer_match_kind_MATCH_CGROUP_PREFIX as i32; -+ copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str()); -+ } -+ LayerMatch::CommPrefix(prefix) => { -+ mt.kind = layered_sys::layer_match_kind_MATCH_COMM_PREFIX as i32; -+ copy_into_cstr(&mut mt.comm_prefix, prefix.as_str()); -+ } -+ LayerMatch::NiceAbove(nice) => { -+ mt.kind = layered_sys::layer_match_kind_MATCH_NICE_ABOVE as i32; -+ mt.nice_above_or_below = *nice; -+ } -+ LayerMatch::NiceBelow(nice) => { -+ mt.kind = layered_sys::layer_match_kind_MATCH_NICE_BELOW as i32; -+ mt.nice_above_or_below = *nice; -+ } -+ } -+ } -+ layer.matches[or_i].nr_match_ands = or.len() as i32; -+ } -+ -+ layer.nr_match_ors = spec.matches.len() as u32; -+ -+ match &spec.kind { -+ LayerKind::Open { preempt } | LayerKind::Grouped { preempt, .. } => { -+ layer.open = true; -+ layer.preempt = *preempt; -+ } -+ _ => {} -+ } -+ } -+ -+ Ok(()) -+ } -+ -+ fn init(opts: &Opts, layer_specs: Vec) -> Result { -+ let nr_layers = layer_specs.len(); -+ let mut cpu_pool = CpuPool::new()?; -+ -+ // Open the BPF prog first for verification. -+ let mut skel_builder = LayeredSkelBuilder::default(); -+ skel_builder.obj_builder.debug(opts.verbose > 1); -+ let mut skel = skel_builder.open().context("Failed to open BPF program")?; -+ -+ // Initialize skel according to @opts. 
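One detail of init_layers() above that is easy to gloss over: each cgroup/comm prefix is written into a fixed-size signed-byte array of the BPF-visible layer struct via copy_into_cstr(), trailing NUL included, which is what lets the BPF side treat the field as an ordinary C string. The Rust sketch below shows that copy in isolation; the buffer length and helper name here are made up, and the patch itself uses a transmute plus copy_from_slice rather than a per-byte loop.

use std::ffi::CString;

// Illustrative stand-in for a generated fixed-size match field such as
// layer_match.cgroup_prefix; the real length comes from the BPF headers.
const PREFIX_LEN: usize = 32;

fn copy_prefix(dst: &mut [i8; PREFIX_LEN], src: &str) {
    let cstr = CString::new(src).expect("prefix must not contain interior NUL bytes");
    let bytes = cstr.as_bytes_with_nul();
    assert!(bytes.len() <= dst.len(), "prefix too long for the fixed buffer");
    // Copy the string plus its terminating NUL; the rest of the buffer
    // stays zeroed, so the BPF side sees a well-formed C string.
    for (d, s) in dst.iter_mut().zip(bytes.iter()) {
        *d = *s as i8;
    }
}

fn main() {
    let mut buf = [0i8; PREFIX_LEN];
    copy_prefix(&mut buf, "system.slice/");
    println!("first byte = {}, terminator = {}", buf[0], buf["system.slice/".len()]);
}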
-+ skel.rodata().debug = opts.verbose as u32; -+ skel.rodata().slice_ns = opts.slice_us * 1000; -+ skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; -+ skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; -+ for cpu in cpu_pool.all_cpus.iter_ones() { -+ skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8); -+ } -+ Self::init_layers(&mut skel, &layer_specs)?; -+ -+ // Attach. -+ let mut skel = skel.load().context("Failed to load BPF program")?; -+ skel.attach().context("Failed to attach BPF program")?; -+ let struct_ops = Some( -+ skel.maps_mut() -+ .layered() -+ .attach_struct_ops() -+ .context("Failed to attach layered struct ops")?, -+ ); -+ info!("Layered Scheduler Attached"); -+ -+ let mut layers = vec![]; -+ for spec in layer_specs.iter() { -+ layers.push(Layer::new(&mut cpu_pool, &spec.name, spec.kind.clone())?); -+ } -+ -+ // Other stuff. -+ let proc_reader = procfs::ProcReader::new(); -+ -+ Ok(Self { -+ struct_ops, // should be held to keep it attached -+ layer_specs, -+ -+ sched_intv: Duration::from_secs_f64(opts.interval), -+ monitor_intv: Duration::from_secs_f64(opts.monitor), -+ no_load_frac_limit: opts.no_load_frac_limit, -+ -+ cpu_pool, -+ layers, -+ -+ sched_stats: Stats::new(&mut skel, &proc_reader)?, -+ report_stats: Stats::new(&mut skel, &proc_reader)?, -+ -+ nr_layer_cpus_min_max: vec![(0, 0); nr_layers], -+ processing_dur: Duration::from_millis(0), -+ prev_processing_dur: Duration::from_millis(0), -+ -+ proc_reader, -+ skel, -+ }) -+ } -+ -+ fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut layered_bss_types::layer) { -+ for bit in 0..layer.cpus.len() { -+ if layer.cpus[bit] { -+ bpf_layer.cpus[bit / 8] |= 1 << (bit % 8); -+ } else { -+ bpf_layer.cpus[bit / 8] &= !(1 << (bit % 8)); -+ } -+ } -+ bpf_layer.refresh_cpus = 1; -+ } -+ -+ fn step(&mut self) -> Result<()> { -+ let started_at = Instant::now(); -+ self.sched_stats -+ .refresh(&mut self.skel, &self.proc_reader, started_at)?; -+ let mut updated = false; -+ -+ for idx in 0..self.layers.len() { -+ match self.layers[idx].kind { -+ LayerKind::Confined { -+ cpus_range, -+ util_range, -+ } -+ | LayerKind::Grouped { -+ cpus_range, -+ util_range, -+ .. -+ } => { -+ let load = ( -+ self.sched_stats.layer_loads[idx], -+ self.sched_stats.total_load, -+ ); -+ let util = ( -+ self.sched_stats.layer_utils[idx], -+ self.sched_stats.total_util, -+ ); -+ if self.layers[idx].resize_confined_or_grouped( -+ &mut self.cpu_pool, -+ cpus_range, -+ util_range, -+ load, -+ util, -+ self.no_load_frac_limit, -+ )? != 0 -+ { -+ Self::update_bpf_layer_cpumask( -+ &self.layers[idx], -+ &mut self.skel.bss().layers[idx], -+ ); -+ updated = true; -+ } -+ } -+ _ => {} -+ } -+ } -+ -+ if updated { -+ let available_cpus = self.cpu_pool.available_cpus(); -+ let nr_available_cpus = available_cpus.count_ones(); -+ for idx in 0..self.layers.len() { -+ let layer = &mut self.layers[idx]; -+ let bpf_layer = &mut self.skel.bss().layers[idx]; -+ match &layer.kind { -+ LayerKind::Open { .. 
} => { -+ layer.cpus.copy_from_bitslice(&available_cpus); -+ layer.nr_cpus = nr_available_cpus; -+ Self::update_bpf_layer_cpumask(layer, bpf_layer); -+ } -+ _ => {} -+ } -+ } -+ -+ self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32; -+ -+ for (lidx, layer) in self.layers.iter().enumerate() { -+ self.nr_layer_cpus_min_max[lidx] = ( -+ self.nr_layer_cpus_min_max[lidx].0.min(layer.nr_cpus), -+ self.nr_layer_cpus_min_max[lidx].1.max(layer.nr_cpus), -+ ); -+ } -+ } -+ -+ self.processing_dur += Instant::now().duration_since(started_at); -+ Ok(()) -+ } -+ -+ fn report(&mut self) -> Result<()> { -+ let started_at = Instant::now(); -+ self.report_stats -+ .refresh(&mut self.skel, &self.proc_reader, started_at)?; -+ let stats = &self.report_stats; -+ -+ let processing_dur = self.processing_dur - self.prev_processing_dur; -+ self.prev_processing_dur = self.processing_dur; -+ -+ let lsum = |idx| stats.bpf_stats.lstats_sums[idx as usize]; -+ let total = lsum(layered_sys::layer_stat_idx_LSTAT_LOCAL) -+ + lsum(layered_sys::layer_stat_idx_LSTAT_GLOBAL); -+ let lsum_pct = |idx| { -+ if total != 0 { -+ lsum(idx) as f64 / total as f64 * 100.0 -+ } else { -+ 0.0 -+ } -+ }; -+ -+ info!( -+ "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", -+ total, -+ lsum_pct(layered_sys::layer_stat_idx_LSTAT_LOCAL), -+ lsum_pct(layered_sys::layer_stat_idx_LSTAT_OPEN_IDLE), -+ lsum_pct(layered_sys::layer_stat_idx_LSTAT_AFFN_VIOL), -+ stats.prev_bpf_stats.gstats -+ [layered_sys::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize], -+ processing_dur.as_millis(), -+ ); -+ -+ info!( -+ "busy={:5.1} util={:7.1} load={:9.1} fallback_cpu={:3}", -+ stats.cpu_busy * 100.0, -+ stats.total_util * 100.0, -+ stats.total_load, -+ self.cpu_pool.fallback_cpu, -+ ); -+ -+ let header_width = self -+ .layer_specs -+ .iter() -+ .map(|spec| spec.name.len()) -+ .max() -+ .unwrap() -+ .max(4); -+ -+ let calc_frac = |a, b| { -+ if b != 0.0 { a / b * 100.0 } else { 0.0 } -+ }; -+ -+ for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() { -+ let lstat = |sidx| stats.bpf_stats.lstats[lidx][sidx as usize]; -+ let ltotal = lstat(layered_sys::layer_stat_idx_LSTAT_LOCAL) -+ + lstat(layered_sys::layer_stat_idx_LSTAT_GLOBAL); -+ let lstat_pct = |sidx| { -+ if ltotal != 0 { -+ lstat(sidx) as f64 / ltotal as f64 * 100.0 -+ } else { -+ 0.0 -+ } -+ }; -+ -+ info!( -+ " {:) -> Result<()> { -+ let now = Instant::now(); -+ let mut next_sched_at = now + self.sched_intv; -+ let mut next_monitor_at = now + self.monitor_intv; -+ -+ while !shutdown.load(Ordering::Relaxed) && !UserExitInfo::exited(&self.skel.bss().uei)? 
{ -+ let now = Instant::now(); -+ -+ if now >= next_sched_at { -+ self.step()?; -+ while next_sched_at < now { -+ next_sched_at += self.sched_intv; -+ } -+ } -+ -+ if now >= next_monitor_at { -+ self.report()?; -+ while next_monitor_at < now { -+ next_monitor_at += self.monitor_intv; -+ } -+ } -+ -+ std::thread::sleep( -+ next_sched_at -+ .min(next_monitor_at) -+ .duration_since(Instant::now()), -+ ); -+ } -+ -+ self.struct_ops.take(); -+ UserExitInfo::read(&self.skel.bss().uei)?.report() -+ } -+} -+ -+impl<'a> Drop for Scheduler<'a> { -+ fn drop(&mut self) { -+ if let Some(struct_ops) = self.struct_ops.take() { -+ drop(struct_ops); -+ } -+ } -+} -+ -+fn write_example_file(path: &str) -> Result<()> { -+ let example = LayerConfig { -+ specs: vec![ -+ LayerSpec { -+ name: "batch".into(), -+ comment: Some("tasks under system.slice or tasks with nice value > 0".into()), -+ matches: vec![ -+ vec![LayerMatch::CgroupPrefix("system.slice/".into())], -+ vec![LayerMatch::NiceAbove(0)], -+ ], -+ kind: LayerKind::Confined { -+ cpus_range: Some((0, 16)), -+ util_range: (0.8, 0.9), -+ }, -+ }, -+ LayerSpec { -+ name: "immediate".into(), -+ comment: Some("tasks under workload.slice with nice value < 0".into()), -+ matches: vec![vec![ -+ LayerMatch::CgroupPrefix("workload.slice/".into()), -+ LayerMatch::NiceBelow(0), -+ ]], -+ kind: LayerKind::Open { preempt: true }, -+ }, -+ LayerSpec { -+ name: "normal".into(), -+ comment: Some("the rest".into()), -+ matches: vec![vec![]], -+ kind: LayerKind::Grouped { -+ cpus_range: None, -+ util_range: (0.5, 0.6), -+ preempt: false, -+ }, -+ }, -+ ], -+ }; -+ -+ let mut f = fs::OpenOptions::new() -+ .create_new(true) -+ .write(true) -+ .open(path)?; -+ Ok(f.write_all(serde_json::to_string_pretty(&example)?.as_bytes())?) -+} -+ -+fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> { -+ let nr_specs = specs.len(); -+ if nr_specs == 0 { -+ bail!("No layer spec"); -+ } -+ if nr_specs > MAX_LAYERS { -+ bail!("Too many layer specs"); -+ } -+ -+ for (idx, spec) in specs.iter().enumerate() { -+ if idx < nr_specs - 1 { -+ if spec.matches.len() == 0 { -+ bail!("Non-terminal spec {:?} has NULL matches", spec.name); -+ } -+ } else { -+ if spec.matches.len() != 1 || spec.matches[0].len() != 0 { -+ bail!("Terminal spec {:?} must have an empty match", spec.name); -+ } -+ } -+ -+ if spec.matches.len() > MAX_LAYER_MATCH_ORS { -+ bail!( -+ "Spec {:?} has too many ({}) OR match blocks", -+ spec.name, -+ spec.matches.len() -+ ); -+ } -+ -+ for (ands_idx, ands) in spec.matches.iter().enumerate() { -+ if ands.len() > NR_LAYER_MATCH_KINDS { -+ bail!( -+ "Spec {:?}'s {}th OR block has too many ({}) match conditions", -+ spec.name, -+ ands_idx, -+ ands.len() -+ ); -+ } -+ for one in ands.iter() { -+ match one { -+ LayerMatch::CgroupPrefix(prefix) => { -+ if prefix.len() > MAX_PATH { -+ bail!("Spec {:?} has too long a cgroup prefix", spec.name); -+ } -+ } -+ LayerMatch::CommPrefix(prefix) => { -+ if prefix.len() > MAX_COMM { -+ bail!("Spec {:?} has too long a comm prefix", spec.name); -+ } -+ } -+ _ => {} -+ } -+ } -+ } -+ -+ match spec.kind { -+ LayerKind::Confined { -+ cpus_range, -+ util_range, -+ } -+ | LayerKind::Grouped { -+ cpus_range, -+ util_range, -+ .. 
-+ } => { -+ if let Some((cpus_min, cpus_max)) = cpus_range { -+ if cpus_min > cpus_max { -+ bail!( -+ "Spec {:?} has invalid cpus_range({}, {})", -+ spec.name, -+ cpus_min, -+ cpus_max -+ ); -+ } -+ } -+ if util_range.0 >= util_range.1 { -+ bail!( -+ "Spec {:?} has invalid util_range ({}, {})", -+ spec.name, -+ util_range.0, -+ util_range.1 -+ ); -+ } -+ } -+ _ => {} -+ } -+ } -+ -+ Ok(()) -+} -+ -+fn main() -> Result<()> { -+ let opts = Opts::parse(); -+ -+ let llv = match opts.verbose { -+ 0 => simplelog::LevelFilter::Info, -+ 1 => simplelog::LevelFilter::Debug, -+ _ => simplelog::LevelFilter::Trace, -+ }; -+ let mut lcfg = simplelog::ConfigBuilder::new(); -+ lcfg.set_time_level(simplelog::LevelFilter::Error) -+ .set_location_level(simplelog::LevelFilter::Off) -+ .set_target_level(simplelog::LevelFilter::Off) -+ .set_thread_level(simplelog::LevelFilter::Off); -+ simplelog::TermLogger::init( -+ llv, -+ lcfg.build(), -+ simplelog::TerminalMode::Stderr, -+ simplelog::ColorChoice::Auto, -+ )?; -+ -+ debug!("opts={:?}", &opts); -+ -+ if let Some(path) = &opts.example { -+ write_example_file(path)?; -+ return Ok(()); -+ } -+ -+ let mut layer_config = LayerConfig { specs: vec![] }; -+ for (idx, input) in opts.specs.iter().enumerate() { -+ layer_config.specs.append( -+ &mut LayerSpec::parse(input) -+ .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?, -+ ); -+ } -+ -+ debug!("specs={}", serde_json::to_string_pretty(&layer_config)?); -+ verify_layer_specs(&layer_config.specs)?; -+ -+ let mut sched = Scheduler::init(&opts, layer_config.specs)?; -+ -+ let shutdown = Arc::new(AtomicBool::new(false)); -+ let shutdown_clone = shutdown.clone(); -+ ctrlc::set_handler(move || { -+ shutdown_clone.store(true, Ordering::Relaxed); -+ }) -+ .context("Error setting Ctrl-C handler")?; -+ -+ sched.run(shutdown) -+} -diff --git a/tools/sched_ext/scx_nest.bpf.c b/tools/sched_ext/scx_nest.bpf.c -new file mode 100644 -index 000000000..3ab6d52d0 ---- /dev/null -+++ b/tools/sched_ext/scx_nest.bpf.c -@@ -0,0 +1,681 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * As described in [0], a Nest scheduler which encourages task placement on -+ * cores that are likely to be running at higher frequency, based upon recent usage. -+ * -+ * [0]: https://hal.inria.fr/hal-03612592/file/paper.pdf -+ * -+ * It operates as a global weighted vtime scheduler (similarly to CFS), while -+ * using the Nest algorithm to choose idle cores at wakup time. -+ * -+ * It also demonstrates the following niceties. -+ * -+ * - More robust task placement policies. -+ * - Termination notification for userspace. -+ * -+ * While rather simple, this scheduler should work reasonably well on CPUs with -+ * a uniform L3 cache topology. While preemption is not implemented, the fact -+ * that the scheduling queue is shared across all CPUs means that whatever is -+ * at the front of the queue is likely to be executed fairly quickly given -+ * enough number of CPUs. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include "scx_common.bpf.h" -+#include "vmlinux.h" -+#include "scx_nest.h" -+ -+#define TASK_DEAD 0x00000080 -+ -+char _license[] SEC("license") = "GPL"; -+ -+enum { -+ FALLBACK_DSQ_ID = 0, -+ MSEC_PER_SEC = 1000LLU, -+ USEC_PER_MSEC = 1000LLU, -+ NSEC_PER_USEC = 1000LLU, -+ NSEC_PER_MSEC = USEC_PER_MSEC * NSEC_PER_USEC, -+ USEC_PER_SEC = USEC_PER_MSEC * MSEC_PER_SEC, -+ NSEC_PER_SEC = NSEC_PER_USEC * USEC_PER_SEC, -+}; -+ -+#define CLOCK_BOOTTIME 7 -+#define NUMA_NO_NODE -1 -+ -+const volatile u64 p_remove_ns = 2 * NSEC_PER_MSEC; -+const volatile u64 r_max = 5; -+const volatile u64 r_impatient = 2; -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile bool find_fully_idle = false; -+const volatile u64 sampling_cadence_ns = 1 * NSEC_PER_SEC; -+const volatile u64 r_depth = 5; -+ -+// Used for stats tracking. May be stale at any given time. -+u64 stats_primary_mask, stats_reserved_mask, stats_other_mask, stats_idle_mask; -+ -+// Used for internal tracking. -+static s32 nr_reserved; -+ -+static u64 vtime_now; -+struct user_exit_info uei; -+ -+extern unsigned long CONFIG_HZ __kconfig; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ /* -+ * A temporary cpumask for calculating a task's primary and reserve -+ * mask. -+ */ -+ struct bpf_cpumask __kptr *tmp_mask; -+ -+ /* -+ * The number of times that a task observes that its previous core is -+ * not idle. If this occurs r_impatient times in a row, a core is -+ * attempted to be retrieved from either the reserve nest, or the -+ * fallback nest. -+ */ -+ u32 prev_misses; -+ -+ /* -+ * A core that the task is "attached" to, meaning the last core that it -+ * executed on at least twice in a row, and the core that it first -+ * tries to migrate to on wakeup. The task only migrates to the -+ * attached core if it is idle and in the primary nest. -+ */ -+ s32 attached_core; -+ -+ /* -+ * The last core that the task executed on. This is used to determine -+ * if the task should attach to the core that it will execute on next. -+ */ -+ s32 prev_cpu; -+ -+ /* Dispatch directly to local_dsq */ -+ bool force_local; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+struct pcpu_ctx { -+ /* The timer used to compact the core from the primary nest. */ -+ struct bpf_timer timer; -+ -+ /* Whether the current core has been scheduled for compaction. */ -+ bool scheduled_compaction; -+ -+ /* Number of times a primary core has been scheduled for compaction. */ -+ u32 num_schedulings; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1024); -+ __type(key, s32); -+ __type(value, struct pcpu_ctx); -+} pcpu_ctxs SEC(".maps"); -+ -+struct stats_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct stats_timer); -+} stats_timer SEC(".maps"); -+ -+const volatile u32 nr_cpus = 1; /* !0 for veristat, set during init. 
*/ -+ -+private(NESTS) struct bpf_cpumask __kptr *primary_cpumask; -+private(NESTS) struct bpf_cpumask __kptr *reserve_cpumask; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, NEST_STAT(NR)); -+} stats SEC(".maps"); -+ -+ -+static __attribute__((always_inline)) void stat_inc(u32 idx) -+{ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static const struct cpumask *cast_mask(struct bpf_cpumask *mask) -+{ -+ return (const struct cpumask *)mask; -+} -+ -+static __attribute__((always_inline)) void -+try_make_core_reserved(s32 cpu, struct bpf_cpumask * reserved, bool promotion) -+{ -+ s32 tmp_nr_reserved; -+ -+ /* -+ * This check is racy, but that's OK. If we incorrectly fail to promote -+ * a core to reserve, it's because another context added or removed a -+ * core from reserved in this small window. It will balance out over -+ * subsequent wakeups. -+ */ -+ tmp_nr_reserved = nr_reserved; -+ if (tmp_nr_reserved < r_max) { -+ /* -+ * It's possible that we could exceed r_max for a time here, -+ * but that should balance out as more cores are either demoted -+ * or fail to be promoted into the reserve nest. -+ */ -+ __sync_fetch_and_add(&nr_reserved, 1); -+ bpf_cpumask_set_cpu(cpu, reserved); -+ if (promotion) -+ stat_inc(NEST_STAT(PROMOTED_TO_RESERVED)); -+ else -+ stat_inc(NEST_STAT(DEMOTED_TO_RESERVED)); -+ } else { -+ bpf_cpumask_clear_cpu(cpu, reserved); -+ stat_inc(NEST_STAT(RESERVED_AT_CAPACITY)); -+ } -+} -+ -+static void update_attached(struct task_ctx *tctx, s32 prev_cpu, s32 new_cpu) -+{ -+ if (tctx->prev_cpu == new_cpu) -+ tctx->attached_core = new_cpu; -+ tctx->prev_cpu = prev_cpu; -+} -+ -+s32 BPF_STRUCT_OPS(nest_select_cpu, struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags) -+{ -+ struct bpf_cpumask *p_mask, *primary, *reserve; -+ s32 cpu; -+ struct task_ctx *tctx; -+ struct pcpu_ctx *pcpu_ctx; -+ bool direct_to_primary = false; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) -+ return -ENOENT; -+ -+ bpf_rcu_read_lock(); -+ p_mask = tctx->tmp_mask; -+ primary = primary_cpumask; -+ reserve = reserve_cpumask; -+ if (!p_mask || !primary || !reserve) { -+ bpf_rcu_read_unlock(); -+ return -ENOENT; -+ } -+ -+ // Unset below if we can't find a core to migrate to. -+ tctx->force_local = true; -+ tctx->prev_cpu = prev_cpu; -+ -+ bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary)); -+ -+ /* First try to wake the task on its attached core. */ -+ if (bpf_cpumask_test_cpu(tctx->attached_core, cast_mask(p_mask)) && -+ scx_bpf_test_and_clear_cpu_idle(tctx->attached_core)) { -+ cpu = tctx->attached_core; -+ tctx->prev_misses = 0; -+ stat_inc(NEST_STAT(WAKEUP_ATTACHED)); -+ goto migrate_primary; -+ } -+ -+ /* -+ * Try to stay on the previous core if it's in the primary set, and -+ * there's no hypertwin. If the previous core is the core the task is -+ * attached to, don't bother as we already just tried that above. 
-+ */ -+ if (prev_cpu != tctx->attached_core && -+ bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) && -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ tctx->prev_misses = 0; -+ stat_inc(NEST_STAT(WAKEUP_PREV_PRIMARY)); -+ goto migrate_primary; -+ } -+ -+ if (r_impatient > 0 && ++tctx->prev_misses >= r_impatient) { -+ direct_to_primary = true; -+ tctx->prev_misses = 0; -+ stat_inc(NEST_STAT(TASK_IMPATIENT)); -+ goto search_reserved; -+ } -+ -+ if (find_fully_idle) { -+ /* Then try any fully idle core in primary. */ -+ cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_PRIMARY)); -+ goto migrate_primary; -+ } -+ } -+ -+ /* Then try _any_ idle core in primary, even if its hypertwin is active. */ -+ cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); -+ if (cpu >= 0) { -+ stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_PRIMARY)); -+ goto migrate_primary; -+ } -+ -+search_reserved: -+ /* Then try any fully idle core in reserve. */ -+ bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(reserve)); -+ if (find_fully_idle) { -+ cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_RESERVE)); -+ goto promote_to_primary; -+ } -+ } -+ -+ /* Then try _any_ idle core in reserve, even if its hypertwin is active. */ -+ cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); -+ if (cpu >= 0) { -+ stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_RESERVE)); -+ goto promote_to_primary; -+ } -+ -+ /* Then try _any_ idle core in the task's cpumask. */ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) { -+ /* -+ * We found a core that (we didn't _think_) is in any nest. -+ * This means that we need to either promote the core to the -+ * reserve nest, or if we're going direct to primary due to -+ * r_impatient being exceeded, promote directly to primary. -+ * -+ * We have to do one final check here to see if the core is in -+ * the primary or reserved cpumask because we could potentially -+ * race with the core changing states between AND'ing the -+ * primary and reserve masks with p->cpus_ptr above, and -+ * atomically reserving it from the idle mask with -+ * scx_bpf_pick_idle_cpu(). This is also technically true of -+ * the checks above, but in all of those cases we just put the -+ * core directly into the primary mask so it's not really that -+ * big of a problem. Here, we want to make sure that we don't -+ * accidentally put a core into the reserve nest that was e.g. -+ * already in the primary nest. This is unlikely, but we check -+ * for it on what should be a relatively cold path regardless. 
-+ */ -+ stat_inc(NEST_STAT(WAKEUP_IDLE_OTHER)); -+ if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) -+ goto migrate_primary; -+ else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) -+ goto promote_to_primary; -+ else if (direct_to_primary) -+ goto promote_to_primary; -+ else -+ try_make_core_reserved(cpu, reserve, true); -+ bpf_rcu_read_unlock(); -+ return cpu; -+ } -+ -+ bpf_rcu_read_unlock(); -+ tctx->force_local = false; -+ return prev_cpu; -+ -+promote_to_primary: -+ stat_inc(NEST_STAT(PROMOTED_TO_PRIMARY)); -+migrate_primary: -+ pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); -+ if (pcpu_ctx) { -+ if (pcpu_ctx->scheduled_compaction) { -+ if (bpf_timer_cancel(&pcpu_ctx->timer) < 0) -+ scx_bpf_error("Failed to cancel pcpu timer"); -+ pcpu_ctx->scheduled_compaction = false; -+ stat_inc(NEST_STAT(CANCELLED_COMPACTION)); -+ } -+ } else { -+ scx_bpf_error("Failed to lookup pcpu ctx"); -+ } -+ bpf_cpumask_set_cpu(cpu, primary); -+ /* -+ * Check to see whether the CPU is in the reserved nest. This can -+ * happen if the core is compacted concurrently with us trying to place -+ * the currently-waking task onto it. Similarly, this is the expected -+ * state of the core if we found the core in the reserve nest and are -+ * promoting it. -+ * -+ * We don't have to worry about racing with any other waking task here -+ * because we've atomically reserved the core with (some variant of) -+ * scx_bpf_pick_idle_cpu(). -+ */ -+ if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) { -+ __sync_sub_and_fetch(&nr_reserved, 1); -+ bpf_cpumask_clear_cpu(cpu, reserve); -+ } -+ bpf_rcu_read_unlock(); -+ update_attached(tctx, prev_cpu, cpu); -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(nest_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct task_ctx *tctx; -+ u64 vtime = p->scx.dsq_vtime; -+ s32 cpu = bpf_get_smp_processor_id(); -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("Unable to find task ctx"); -+ return; -+ } -+ -+ if (tctx->force_local || (enq_flags & SCX_ENQ_LOCAL)) { -+ tctx->force_local = false; -+ if (enq_flags & SCX_ENQ_LOCAL) -+ update_attached(tctx, tctx->prev_cpu, cpu); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, vtime_now - slice_ns)) -+ vtime = vtime_now - slice_ns; -+ -+ scx_bpf_dispatch_vtime(p, FALLBACK_DSQ_ID, slice_ns, vtime, -+ enq_flags); -+} -+ -+void BPF_STRUCT_OPS(nest_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct pcpu_ctx *pcpu_ctx; -+ struct bpf_cpumask *primary, *reserve; -+ s32 key = cpu; -+ bool in_primary; -+ -+ primary = primary_cpumask; -+ reserve = reserve_cpumask; -+ if (!primary || !reserve) { -+ scx_bpf_error("No primary or reserve cpumask"); -+ return; -+ } -+ -+ if (!scx_bpf_consume(FALLBACK_DSQ_ID)) { -+ in_primary = bpf_cpumask_test_cpu(cpu, cast_mask(primary)); -+ -+ if (prev && (prev->scx.flags & SCX_TASK_QUEUED) && in_primary) { -+ scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, slice_ns, 0); -+ return; -+ } -+ -+ stat_inc(NEST_STAT(NOT_CONSUMED)); -+ if (in_primary) { -+ pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); -+ if (!pcpu_ctx) { -+ scx_bpf_error("Failed to lookup pcpu ctx"); -+ return; -+ } -+ -+ /* -+ * Immediately demote a primary core if: -+ * - It's been scheduled for compaction at least -+ * r_depth times without actually being compacted. 
-+ * - The previous task on it is dying -+ * -+ * Note that we elect to not compact the "first" CPU in -+ * the mask so as to encourage at least one core to -+ * remain in the nest. It would be better to check for -+ * whether there is only one core remaining in the -+ * nest, but BPF doesn't yet have a kfunc for querying -+ * cpumask weight. -+ */ -+ if ((prev && prev->__state == TASK_DEAD) || -+ (cpu != bpf_cpumask_first(cast_mask(primary)) && pcpu_ctx->num_schedulings >= r_depth)) { -+ stat_inc(NEST_STAT(COMPACTED)); -+ bpf_cpumask_clear_cpu(cpu, primary); -+ try_make_core_reserved(cpu, reserve, false); -+ pcpu_ctx->num_schedulings = 0; -+ } else { -+ pcpu_ctx->scheduled_compaction = true; -+ /* -+ * The core isn't being used anymore. Set a -+ * timer to remove the core from the nest in -+ * p_remove if it's still unused by that point. -+ */ -+ bpf_timer_start(&pcpu_ctx->timer, p_remove_ns, -+ 0 /*BPF_F_TIMER_CPU_PIN*/); -+ pcpu_ctx->num_schedulings++; -+ stat_inc(NEST_STAT(SCHEDULED_COMPACTION)); -+ } -+ } -+ return; -+ } -+ stat_inc(NEST_STAT(CONSUMED)); -+} -+ -+void BPF_STRUCT_OPS(nest_running, struct task_struct *p) -+{ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. -+ */ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(nest_stopping, struct task_struct *p, bool runnable) -+{ -+ /* scale the execution time by the inverse of the weight and charge */ -+ p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+s32 BPF_STRUCT_OPS(nest_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ struct task_ctx *tctx; -+ struct bpf_cpumask *cpumask; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. -+ */ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!tctx) -+ return -ENOMEM; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ -+ cpumask = bpf_kptr_xchg(&tctx->tmp_mask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ tctx->attached_core = -1; -+ tctx->prev_cpu = -1; -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(nest_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+static int compact_primary_core(void *map, int *key, struct bpf_timer *timer) -+{ -+ struct bpf_cpumask *primary, *reserve; -+ s32 cpu = bpf_get_smp_processor_id(); -+ struct pcpu_ctx *pcpu_ctx; -+ -+ stat_inc(NEST_STAT(COMPACTED)); -+ /* -+ * If we made it to this callback, it means that the timer callback was -+ * never cancelled, and so the core needs to be demoted from the -+ * primary nest. 
-+ */ -+ pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); -+ if (!pcpu_ctx) { -+ scx_bpf_error("Couldn't lookup pcpu ctx"); -+ return 0; -+ } -+ bpf_rcu_read_lock(); -+ primary = primary_cpumask; -+ reserve = reserve_cpumask; -+ if (!primary || !reserve) { -+ scx_bpf_error("Couldn't find primary or reserve"); -+ bpf_rcu_read_unlock(); -+ return 0; -+ } -+ -+ bpf_cpumask_clear_cpu(cpu, primary); -+ try_make_core_reserved(cpu, reserve, false); -+ bpf_rcu_read_unlock(); -+ pcpu_ctx->num_schedulings = 0; -+ pcpu_ctx->scheduled_compaction = false; -+ return 0; -+} -+ -+static int stats_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ s32 cpu; -+ struct bpf_cpumask *primary, *reserve; -+ const struct cpumask *idle; -+ stats_primary_mask = 0; -+ stats_reserved_mask = 0; -+ stats_other_mask = 0; -+ stats_idle_mask = 0; -+ long err; -+ -+ bpf_rcu_read_lock(); -+ primary = primary_cpumask; -+ reserve = reserve_cpumask; -+ if (!primary || !reserve) { -+ bpf_rcu_read_unlock(); -+ scx_bpf_error("Failed to lookup primary or reserve"); -+ return 0; -+ } -+ -+ idle = scx_bpf_get_idle_cpumask(); -+ bpf_for(cpu, 0, nr_cpus) { -+ if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) -+ stats_primary_mask |= (1ULL << cpu); -+ else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) -+ stats_reserved_mask |= (1ULL << cpu); -+ else -+ stats_other_mask |= (1ULL << cpu); -+ -+ if (bpf_cpumask_test_cpu(cpu, idle)) -+ stats_idle_mask |= (1ULL << cpu); -+ } -+ bpf_rcu_read_unlock(); -+ scx_bpf_put_idle_cpumask(idle); -+ -+ err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); -+ if (err) -+ scx_bpf_error("Failed to arm stats timer"); -+ -+ return 0; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(nest_init) -+{ -+ struct bpf_cpumask *cpumask; -+ s32 cpu; -+ int err; -+ struct bpf_timer *timer; -+ u32 key = 0; -+ -+ scx_bpf_switch_all(); -+ -+ err = scx_bpf_create_dsq(FALLBACK_DSQ_ID, NUMA_NO_NODE); -+ if (err) { -+ scx_bpf_error("Failed to create fallback DSQ"); -+ return err; -+ } -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ bpf_cpumask_clear(cpumask); -+ cpumask = bpf_kptr_xchg(&primary_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ -+ bpf_cpumask_clear(cpumask); -+ cpumask = bpf_kptr_xchg(&reserve_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ bpf_for(cpu, 0, nr_cpus) { -+ s32 key = cpu; -+ struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); -+ -+ if (!ctx) { -+ scx_bpf_error("Failed to lookup pcpu_ctx"); -+ return -ENOENT; -+ } -+ ctx->scheduled_compaction = false; -+ if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME)) { -+ scx_bpf_error("Failed to initialize pcpu timer"); -+ return -EINVAL; -+ } -+ ctx->num_schedulings = 0; -+ bpf_timer_set_callback(&ctx->timer, compact_primary_core); -+ } -+ -+ timer = bpf_map_lookup_elem(&stats_timer, &key); -+ if (!timer) { -+ scx_bpf_error("Failed to lookup central timer"); -+ return -ESRCH; -+ } -+ bpf_timer_init(timer, &stats_timer, CLOCK_BOOTTIME); -+ bpf_timer_set_callback(timer, stats_timerfn); -+ err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); -+ if (err) -+ scx_bpf_error("Failed to arm stats timer"); -+ -+ return err; -+} -+ -+void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops nest_ops = { -+ .select_cpu = (void *)nest_select_cpu, -+ .enqueue = (void *)nest_enqueue, -+ .dispatch = (void 
*)nest_dispatch, -+ .running = (void *)nest_running, -+ .stopping = (void *)nest_stopping, -+ .prep_enable = (void *)nest_prep_enable, -+ .enable = (void *)nest_enable, -+ .init = (void *)nest_init, -+ .exit = (void *)nest_exit, -+ .flags = 0, -+ .name = "nest", -+}; -diff --git a/tools/sched_ext/scx_nest.c b/tools/sched_ext/scx_nest.c -new file mode 100644 -index 000000000..90f5a8bd2 ---- /dev/null -+++ b/tools/sched_ext/scx_nest.c -@@ -0,0 +1,227 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "user_exit_info.h" -+#include "scx_nest.skel.h" -+#include "scx_common.h" -+#include "scx_nest.h" -+ -+#define SAMPLING_CADENCE_S 2 -+ -+const char help_fmt[] = -+"A Nest sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-p] [-d DELAY] [-m ] [-i ITERS]\n" -+"\n" -+" -d DELAY_US Delay (us), before removing an idle core from the primary nest (default 2000us / 2ms)\n" -+" -m R_MAX Maximum number of cores in the reserve nest (default 5)\n" -+" -i ITERS Number of successive placement failures tolerated before trying to aggressively expand primary nest (default 2), or 0 to disable\n" -+" -s SLICE_US Override slice duration in us (default 20000us / 20ms)\n" -+" -D R_SCHED Override the number of times that a core may be scheduled for compaction before having compaction happen immediately (default 5), or -1 to disable\n" -+" -I First try to find a fully idle core, and then any idle core, when searching nests. Default behavior is to ignore hypertwins and check for any idle core.\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int nest) -+{ -+ exit_req = 1; -+} -+ -+struct nest_stat { -+ const char *label; -+ enum nest_stat_group group; -+ enum nest_stat_idx idx; -+}; -+ -+#define NEST_ST(__stat, __grp, __desc) { \ -+ .label = #__stat, \ -+ .group = __grp, \ -+ .idx = NEST_STAT(__stat) \ -+}, -+static struct nest_stat nest_stats[NEST_STAT(NR)] = { -+#include "scx_nest_stats_table.h" -+}; -+#undef NEST_ST -+ -+static void read_stats(struct scx_nest *skel, u64 *stats) -+{ -+ int nr_cpus = libbpf_num_possible_cpus(); -+ u64 cnts[NEST_STAT(NR)][nr_cpus]; -+ u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * NEST_STAT(NR)); -+ -+ for (idx = 0; idx < NEST_STAT(NR); idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+static void print_underline(const char *str) -+{ -+ char buf[64]; -+ size_t len; -+ -+ len = strlen(str); -+ memset(buf, '-', len); -+ buf[len] = '\0'; -+ printf("\n\n%s\n%s\n", str, buf); -+} -+ -+static void print_stat_grp(enum nest_stat_group grp) -+{ -+ const char *group; -+ -+ switch (grp) { -+ case STAT_GRP_WAKEUP: -+ group = "Wakeup stats"; -+ break; -+ case STAT_GRP_NEST: -+ group = "Nest stats"; -+ break; -+ case STAT_GRP_CONSUME: -+ group = "Consume stats"; -+ break; -+ default: -+ group = "Unknown stats"; -+ break; -+ } -+ -+ print_underline(group); -+} -+ -+static void print_active_nests(const struct scx_nest *skel) -+{ -+ u64 primary = skel->bss->stats_primary_mask; -+ u64 reserved = skel->bss->stats_reserved_mask; -+ u64 other = skel->bss->stats_other_mask; -+ u64 idle = skel->bss->stats_idle_mask; 
-+ u32 nr_cpus = skel->rodata->nr_cpus, cpu; -+ int idx; -+ char cpus[nr_cpus + 1]; -+ -+ memset(cpus, 0, nr_cpus + 1); -+ print_underline("Masks"); -+ for (idx = 0; idx < 4; idx++) { -+ const char *mask_str; -+ u64 mask, total = 0; -+ -+ memset(cpus, '-', nr_cpus); -+ if (idx == 0) { -+ mask_str = "PRIMARY"; -+ mask = primary; -+ } else if (idx == 1) { -+ mask_str = "RESERVED"; -+ mask = reserved; -+ } else if (idx == 2) { -+ mask_str = "OTHER"; -+ mask = other; -+ } else { -+ mask_str = "IDLE"; -+ mask = idle; -+ } -+ for (cpu = 0; cpu < nr_cpus; cpu++) { -+ if (mask & (1ULL << cpu)) { -+ cpus[cpu] = '*'; -+ total++; -+ } -+ } -+ printf("%-9s(%2lu): | %s |\n", mask_str, total, cpus); -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_nest *skel; -+ struct bpf_link *link; -+ __u32 opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_nest__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ skel->rodata->sampling_cadence_ns = SAMPLING_CADENCE_S * 1000 * 1000 * 1000; -+ -+ while ((opt = getopt(argc, argv, "hId:D:m:i:s:")) != -1) { -+ switch (opt) { -+ case 'd': -+ skel->rodata->p_remove_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'D': -+ skel->rodata->r_depth = strtoull(optarg, NULL, 0); -+ break; -+ case 'm': -+ skel->rodata->r_max = strtoull(optarg, NULL, 0); -+ break; -+ case 'i': -+ skel->rodata->r_impatient = strtoull(optarg, NULL, 0); -+ break; -+ case 'I': -+ skel->rodata->find_fully_idle = true; -+ break; -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_BUG_ON(scx_nest__load(skel), "Failed to load skel"); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.nest_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ u64 stats[NEST_STAT(NR)]; -+ enum nest_stat_idx i; -+ enum nest_stat_group last_grp = -1; -+ -+ read_stats(skel, stats); -+ for (i = 0; i < NEST_STAT(NR); i++) { -+ struct nest_stat *nest_stat; -+ -+ nest_stat = &nest_stats[i]; -+ if (nest_stat->group != last_grp) { -+ print_stat_grp(nest_stat->group); -+ last_grp = nest_stat->group; -+ } -+ printf("%s=%lu\n", nest_stat->label, stats[nest_stat->idx]); -+ } -+ printf("\n"); -+ print_active_nests(skel); -+ printf("\n"); -+ printf("\n"); -+ printf("\n"); -+ fflush(stdout); -+ sleep(SAMPLING_CADENCE_S); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_nest__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_nest.h b/tools/sched_ext/scx_nest.h -new file mode 100644 -index 000000000..060444f81 ---- /dev/null -+++ b/tools/sched_ext/scx_nest.h -@@ -0,0 +1,18 @@ -+#ifndef __SCX_NEST_H -+#define __SCX_NEST_H -+ -+enum nest_stat_group { -+ STAT_GRP_WAKEUP, -+ STAT_GRP_NEST, -+ STAT_GRP_CONSUME, -+}; -+ -+#define NEST_STAT(__stat) BPFSTAT_##__stat -+#define NEST_ST(__stat, __grp, __desc) NEST_STAT(__stat), -+enum nest_stat_idx { -+#include "scx_nest_stats_table.h" -+ NEST_ST(NR, 0, 0) -+}; -+#undef NEST_ST -+ -+#endif /* __SCX_NEST_H */ -diff --git a/tools/sched_ext/scx_nest_stats_table.h b/tools/sched_ext/scx_nest_stats_table.h -new file mode 100644 -index 000000000..b6ef2e4d3 ---- /dev/null -+++ b/tools/sched_ext/scx_nest_stats_table.h -@@ -0,0 +1,19 @@ -+NEST_ST(WAKEUP_ATTACHED, STAT_GRP_WAKEUP, "Attached CPU was 
idle, and in primary nest") -+NEST_ST(WAKEUP_PREV_PRIMARY, STAT_GRP_WAKEUP, "Previous CPU was idle, and in primary nest") -+NEST_ST(WAKEUP_FULLY_IDLE_PRIMARY, STAT_GRP_WAKEUP, "Woken up to fully idle primary nest core") -+NEST_ST(WAKEUP_ANY_IDLE_PRIMARY, STAT_GRP_WAKEUP, "Woken up to idle logical primary nest core") -+NEST_ST(WAKEUP_FULLY_IDLE_RESERVE, STAT_GRP_WAKEUP, "Woken up to fully idle reserve nest core") -+NEST_ST(WAKEUP_ANY_IDLE_RESERVE, STAT_GRP_WAKEUP, "Woken up to idle logical reserve nest core") -+NEST_ST(WAKEUP_IDLE_OTHER, STAT_GRP_WAKEUP, "Woken to any idle logical core in p->cpus_ptr") -+ -+NEST_ST(TASK_IMPATIENT, STAT_GRP_NEST, "A task was found to be impatient") -+NEST_ST(PROMOTED_TO_PRIMARY, STAT_GRP_NEST, "A core was promoted into the primary nest") -+NEST_ST(PROMOTED_TO_RESERVED, STAT_GRP_NEST, "A core was promoted into the reserve nest") -+NEST_ST(DEMOTED_TO_RESERVED, STAT_GRP_NEST, "A core was demoted into the reserve nest") -+NEST_ST(RESERVED_AT_CAPACITY, STAT_GRP_NEST, "Reserved nest was at capacity") -+NEST_ST(SCHEDULED_COMPACTION, STAT_GRP_NEST, "Scheduled a primary core to be compacted") -+NEST_ST(CANCELLED_COMPACTION, STAT_GRP_NEST, "Cancelled a primary core from being compacted at task wakeup time") -+NEST_ST(COMPACTED, STAT_GRP_NEST, "A core was compacted") -+ -+NEST_ST(CONSUMED, STAT_GRP_CONSUME, "A task was consumed from the global DSQ") -+NEST_ST(NOT_CONSUMED, STAT_GRP_CONSUME, "There was no task in the global DSQ") -diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c -new file mode 100644 -index 000000000..9c9cf97f4 ---- /dev/null -+++ b/tools/sched_ext/scx_pair.bpf.c -@@ -0,0 +1,626 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext core-scheduler which always makes every sibling CPU pair -+ * execute from the same CPU cgroup. -+ * -+ * This scheduler is a minimal implementation and would need some form of -+ * priority handling both inside each cgroup and across the cgroups to be -+ * practically useful. -+ * -+ * Each CPU in the system is paired with exactly one other CPU, according to a -+ * "stride" value that can be specified when the BPF scheduler program is first -+ * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee -+ * that they will only ever schedule tasks that belong to the same CPU cgroup. -+ * -+ * Scheduler Initialization -+ * ------------------------ -+ * -+ * The scheduler BPF program is first initialized from user space, before it is -+ * enabled. During this initialization process, each CPU on the system is -+ * assigned several values that are constant throughout its runtime: -+ * -+ * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling -+ * decisions. Paired CPUs always schedule tasks from the same -+ * CPU cgroup, and synchronize with each other to guarantee -+ * that this constraint is not violated. -+ * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access -+ * a struct pair_ctx object that is shared between the pair. -+ * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the -+ * pair. Each struct pair_ctx has an active_mask field, -+ * which is a bitmap used to indicate whether each core -+ * in the pair currently has an actively running task. -+ * This index specifies which entry in the bitmap corresponds -+ * to each CPU in the pair. 
-+ * -+ * During this initialization, the CPUs are paired according to a "stride" that -+ * may be specified when invoking the user space program that initializes and -+ * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. -+ * -+ * Tasks and cgroups -+ * ----------------- -+ * -+ * Every cgroup in the system is registered with the scheduler using the -+ * pair_cgroup_init() callback, and every task in the system is associated with -+ * exactly one cgroup. At a high level, the idea with the pair scheduler is to -+ * always schedule tasks from the same cgroup within a given CPU pair. When a -+ * task is enqueued (i.e. passed to the pair_enqueue() callback function), its -+ * cgroup ID is read from its task struct, and then a corresponding queue map -+ * is used to FIFO-enqueue the task for that cgroup. -+ * -+ * If you look through the implementation of the scheduler, you'll notice that -+ * there is quite a bit of complexity involved with looking up the per-cgroup -+ * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash -+ * BPF hash map that is used to map a cgroup ID to a globally unique ID that's -+ * allocated in the BPF program. This is done because we use separate maps to -+ * store the FIFO queue of tasks, and the length of that map, per cgroup. This -+ * complexity is only present because of current deficiencies in BPF that will -+ * soon be addressed. The main point to keep in mind is that newly enqueued -+ * tasks are added to their cgroup's FIFO queue. -+ * -+ * Dispatching tasks -+ * ----------------- -+ * -+ * This section will describe how enqueued tasks are dispatched and scheduled. -+ * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is -+ * as follows: -+ * -+ * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is -+ * the structure that's used to synchronize amongst the two pair CPUs in their -+ * scheduling decisions. After any of the following events have occurred: -+ * -+ * - The cgroup's slice run has expired, or -+ * - The cgroup becomes empty, or -+ * - Either CPU in the pair is preempted by a higher priority scheduling class -+ * -+ * The cgroup transitions to the draining state and stops executing new tasks -+ * from the cgroup. -+ * -+ * 2. If the pair is still executing a task, mark the pair_ctx as draining, and -+ * wait for the pair CPU to be preempted. -+ * -+ * 3. Otherwise, if the pair CPU is not running a task, we can move onto -+ * scheduling new tasks. Pop the next cgroup id from the top_q queue. -+ * -+ * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. -+ * -+ * Note again that this scheduling behavior is simple, but the implementation -+ * is complex mostly because this it hits several BPF shortcomings and has to -+ * work around in often awkward ways. Most of the shortcomings are expected to -+ * be resolved in the near future which should allow greatly simplifying this -+ * scheduler. -+ * -+ * Dealing with preemption -+ * ----------------------- -+ * -+ * SCX is the lowest priority sched_class, and could be preempted by them at -+ * any time. To address this, the scheduler implements pair_cpu_release() and -+ * pair_cpu_acquire() callbacks which are invoked by the core scheduler when -+ * the scheduler loses and gains control of the CPU respectively. 
-+ * -+ * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and -+ * then invoke: -+ * -+ * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); -+ * -+ * This preempts the pair CPU, and waits until it has re-entered the scheduler -+ * before returning. This is necessary to ensure that the higher priority -+ * sched_class that preempted our scheduler does not schedule a task -+ * concurrently with our pair CPU. -+ * -+ * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption -+ * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable -+ * pair scheduling. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include "scx_common.bpf.h" -+#include "scx_pair.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool switch_partial; -+ -+/* !0 for veristat, set during init */ -+const volatile u32 nr_cpu_ids = 1; -+ -+/* a pair of CPUs stay on a cgroup for this duration */ -+const volatile u32 pair_batch_dur_ns = SCX_SLICE_DFL; -+ -+/* cpu ID -> pair cpu ID */ -+const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); -+ -+/* cpu ID -> pair_id */ -+const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); -+ -+/* CPU ID -> CPU # in the pair (0 or 1) */ -+const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); -+ -+struct pair_ctx { -+ struct bpf_spin_lock lock; -+ -+ /* the cgroup the pair is currently executing */ -+ u64 cgid; -+ -+ /* the pair started executing the current cgroup at */ -+ u64 started_at; -+ -+ /* whether the current cgroup is draining */ -+ bool draining; -+ -+ /* the CPUs that are currently active on the cgroup */ -+ u32 active_mask; -+ -+ /* -+ * the CPUs that are currently preempted and running tasks in a -+ * different scheduler. -+ */ -+ u32 preempted_mask; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct pair_ctx); -+} pair_ctx SEC(".maps"); -+ -+/* queue of cgrp_q's possibly with tasks on them */ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ /* -+ * Because it's difficult to build strong synchronization encompassing -+ * multiple non-trivial operations in BPF, this queue is managed in an -+ * opportunistic way so that we guarantee that a cgroup w/ active tasks -+ * is always on it but possibly multiple times. Once we have more robust -+ * synchronization constructs and e.g. linked list, we should be able to -+ * do this in a prettier way but for now just size it big enough. -+ */ -+ __uint(max_entries, 4 * MAX_CGRPS); -+ __type(value, u64); -+} top_q SEC(".maps"); -+ -+/* per-cgroup q which FIFOs the tasks from the cgroup */ -+struct cgrp_q { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, MAX_QUEUED); -+ __type(value, u32); -+}; -+ -+/* -+ * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local -+ * storage; however, a cgroup local storage can only be accessed from the BPF -+ * progs attached to the cgroup. For now, work around by allocating array of -+ * cgrp_q's and then allocating per-cgroup indices. -+ * -+ * Another caveat: It's difficult to populate a large array of maps statically -+ * or from BPF. Initialize it from userland. -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); -+ __uint(max_entries, MAX_CGRPS); -+ __type(key, s32); -+ __array(values, struct cgrp_q); -+} cgrp_q_arr SEC(".maps"); -+ -+static u64 cgrp_q_len[MAX_CGRPS]; -+ -+/* -+ * This and cgrp_q_idx_hash combine into a poor man's IDR. 
This likely would be -+ * useful to have as a map type. -+ */ -+static u32 cgrp_q_idx_cursor; -+static u64 cgrp_q_idx_busy[MAX_CGRPS]; -+ -+/* -+ * All added up, the following is what we do: -+ * -+ * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking -+ * for a free ID. If not found, fail cgroup creation with -EBUSY. -+ * -+ * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following -+ * cgrp_q_idx_hash. -+ * -+ * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from -+ * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. -+ * -+ * This is sadly complicated for something pretty simple. Hopefully, we should -+ * be able to simplify in the future. -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __uint(max_entries, MAX_CGRPS); -+ __uint(key_size, sizeof(u64)); /* cgrp ID */ -+ __uint(value_size, sizeof(s32)); /* cgrp_q idx */ -+} cgrp_q_idx_hash SEC(".maps"); -+ -+/* statistics */ -+u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; -+u64 nr_exps, nr_exp_waits, nr_exp_empty; -+u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; -+ -+struct user_exit_info uei; -+ -+static bool time_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct cgroup *cgrp; -+ struct cgrp_q *cgq; -+ s32 pid = p->pid; -+ u64 cgid; -+ u32 *q_idx; -+ u64 *cgq_len; -+ -+ __sync_fetch_and_add(&nr_total, 1); -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgid = cgrp->kn->id; -+ bpf_cgroup_release(cgrp); -+ -+ /* find the cgroup's q and push @p into it */ -+ q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); -+ if (!q_idx) { -+ scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); -+ return; -+ } -+ -+ cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); -+ if (!cgq) { -+ scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", -+ cgid, *q_idx); -+ return; -+ } -+ -+ if (bpf_map_push_elem(cgq, &pid, 0)) { -+ scx_bpf_error("cgroup[%llu] queue overflow", cgid); -+ return; -+ } -+ -+ /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ -+ cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); -+ if (!cgq_len) { -+ scx_bpf_error("MEMBER_VTPR malfunction"); -+ return; -+ } -+ -+ if (!__sync_fetch_and_add(cgq_len, 1) && -+ bpf_map_push_elem(&top_q, &cgid, 0)) { -+ scx_bpf_error("top_q overflow"); -+ return; -+ } -+} -+ -+static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) -+{ -+ u32 *vptr; -+ -+ vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); -+ if (!vptr) -+ return -EINVAL; -+ -+ *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); -+ if (!(*pairc)) -+ return -EINVAL; -+ -+ vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); -+ if (!vptr) -+ return -EINVAL; -+ -+ *mask = 1U << *vptr; -+ -+ return 0; -+} -+ -+static int try_dispatch(s32 cpu) -+{ -+ struct pair_ctx *pairc; -+ struct bpf_map *cgq_map; -+ struct task_struct *p; -+ u64 now = bpf_ktime_get_ns(); -+ bool kick_pair = false; -+ bool expired, pair_preempted; -+ u32 *vptr, in_pair_mask; -+ s32 pid, q_idx; -+ u64 cgid; -+ int ret; -+ -+ ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); -+ if (ret) { -+ scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", -+ cpu); -+ return -ENOENT; -+ } -+ -+ bpf_spin_lock(&pairc->lock); -+ pairc->active_mask &= ~in_pair_mask; -+ -+ expired = time_before(pairc->started_at + pair_batch_dur_ns, now); -+ if (expired || pairc->draining) { -+ u64 new_cgid = 0; -+ -+ 
__sync_fetch_and_add(&nr_exps, 1); -+ -+ /* -+ * We're done with the current cgid. An obvious optimization -+ * would be not draining if the next cgroup is the current one. -+ * For now, be dumb and always expire. -+ */ -+ pairc->draining = true; -+ -+ pair_preempted = pairc->preempted_mask; -+ if (pairc->active_mask || pair_preempted) { -+ /* -+ * The other CPU is still active, or is no longer under -+ * our control due to e.g. being preempted by a higher -+ * priority sched_class. We want to wait until this -+ * cgroup expires, or until control of our pair CPU has -+ * been returned to us. -+ * -+ * If the pair controls its CPU, and the time already -+ * expired, kick. When the other CPU arrives at -+ * dispatch and clears its active mask, it'll push the -+ * pair to the next cgroup and kick this CPU. -+ */ -+ __sync_fetch_and_add(&nr_exp_waits, 1); -+ bpf_spin_unlock(&pairc->lock); -+ if (expired && !pair_preempted) -+ kick_pair = true; -+ goto out_maybe_kick; -+ } -+ -+ bpf_spin_unlock(&pairc->lock); -+ -+ /* -+ * Pick the next cgroup. It'd be easier / cleaner to not drop -+ * pairc->lock and use stronger synchronization here especially -+ * given that we'll be switching cgroups significantly less -+ * frequently than tasks. Unfortunately, bpf_spin_lock can't -+ * really protect anything non-trivial. Let's do opportunistic -+ * operations instead. -+ */ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ u32 *q_idx; -+ u64 *cgq_len; -+ -+ if (bpf_map_pop_elem(&top_q, &new_cgid)) { -+ /* no active cgroup, go idle */ -+ __sync_fetch_and_add(&nr_exp_empty, 1); -+ return 0; -+ } -+ -+ q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); -+ if (!q_idx) -+ continue; -+ -+ /* -+ * This is the only place where empty cgroups are taken -+ * off the top_q. -+ */ -+ cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); -+ if (!cgq_len || !*cgq_len) -+ continue; -+ -+ /* -+ * If it has any tasks, requeue as we may race and not -+ * execute it. -+ */ -+ bpf_map_push_elem(&top_q, &new_cgid, 0); -+ break; -+ } -+ -+ bpf_spin_lock(&pairc->lock); -+ -+ /* -+ * The other CPU may already have started on a new cgroup while -+ * we dropped the lock. Make sure that we're still draining and -+ * start on the new cgroup. 
-+ */ -+ if (pairc->draining && !pairc->active_mask) { -+ __sync_fetch_and_add(&nr_cgrp_next, 1); -+ pairc->cgid = new_cgid; -+ pairc->started_at = now; -+ pairc->draining = false; -+ kick_pair = true; -+ } else { -+ __sync_fetch_and_add(&nr_cgrp_coll, 1); -+ } -+ } -+ -+ cgid = pairc->cgid; -+ pairc->active_mask |= in_pair_mask; -+ bpf_spin_unlock(&pairc->lock); -+ -+ /* again, it'd be better to do all these with the lock held, oh well */ -+ vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); -+ if (!vptr) { -+ scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); -+ return -ENOENT; -+ } -+ q_idx = *vptr; -+ -+ /* claim one task from cgrp_q w/ q_idx */ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ u64 *cgq_len, len; -+ -+ cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); -+ if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { -+ /* the cgroup must be empty, expire and repeat */ -+ __sync_fetch_and_add(&nr_cgrp_empty, 1); -+ bpf_spin_lock(&pairc->lock); -+ pairc->draining = true; -+ pairc->active_mask &= ~in_pair_mask; -+ bpf_spin_unlock(&pairc->lock); -+ return -EAGAIN; -+ } -+ -+ if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) -+ continue; -+ -+ break; -+ } -+ -+ cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); -+ if (!cgq_map) { -+ scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", -+ cgid, q_idx); -+ return -ENOENT; -+ } -+ -+ if (bpf_map_pop_elem(cgq_map, &pid)) { -+ scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", -+ cgid, q_idx); -+ return -ENOENT; -+ } -+ -+ p = bpf_task_from_pid(pid); -+ if (p) { -+ __sync_fetch_and_add(&nr_dispatched, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+ } else { -+ /* we don't handle dequeues, retry on lost tasks */ -+ __sync_fetch_and_add(&nr_missing, 1); -+ return -EAGAIN; -+ } -+ -+out_maybe_kick: -+ if (kick_pair) { -+ s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); -+ if (pair) { -+ __sync_fetch_and_add(&nr_kicks, 1); -+ scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); -+ } -+ } -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (try_dispatch(cpu) != -EAGAIN) -+ break; -+ } -+} -+ -+void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) -+{ -+ int ret; -+ u32 in_pair_mask; -+ struct pair_ctx *pairc; -+ bool kick_pair; -+ -+ ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); -+ if (ret) -+ return; -+ -+ bpf_spin_lock(&pairc->lock); -+ pairc->preempted_mask &= ~in_pair_mask; -+ /* Kick the pair CPU, unless it was also preempted. */ -+ kick_pair = !pairc->preempted_mask; -+ bpf_spin_unlock(&pairc->lock); -+ -+ if (kick_pair) { -+ s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); -+ -+ if (pair) { -+ __sync_fetch_and_add(&nr_kicks, 1); -+ scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); -+ } -+ } -+} -+ -+void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) -+{ -+ int ret; -+ u32 in_pair_mask; -+ struct pair_ctx *pairc; -+ bool kick_pair; -+ -+ ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); -+ if (ret) -+ return; -+ -+ bpf_spin_lock(&pairc->lock); -+ pairc->preempted_mask |= in_pair_mask; -+ pairc->active_mask &= ~in_pair_mask; -+ /* Kick the pair CPU if it's still running. 
*/ -+ kick_pair = pairc->active_mask; -+ pairc->draining = true; -+ bpf_spin_unlock(&pairc->lock); -+ -+ if (kick_pair) { -+ s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); -+ -+ if (pair) { -+ __sync_fetch_and_add(&nr_kicks, 1); -+ scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); -+ } -+ } -+ __sync_fetch_and_add(&nr_preemptions, 1); -+} -+ -+s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ s32 i, q_idx; -+ -+ bpf_for(i, 0, MAX_CGRPS) { -+ q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; -+ if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) -+ break; -+ } -+ if (i == MAX_CGRPS) -+ return -EBUSY; -+ -+ if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { -+ u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); -+ if (busy) -+ *busy = 0; -+ return -EBUSY; -+ } -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ s32 *q_idx; -+ -+ q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); -+ if (q_idx) { -+ u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); -+ if (busy) -+ *busy = 0; -+ bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); -+ } -+} -+ -+s32 BPF_STRUCT_OPS(pair_init) -+{ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops pair_ops = { -+ .enqueue = (void *)pair_enqueue, -+ .dispatch = (void *)pair_dispatch, -+ .cpu_acquire = (void *)pair_cpu_acquire, -+ .cpu_release = (void *)pair_cpu_release, -+ .cgroup_init = (void *)pair_cgroup_init, -+ .cgroup_exit = (void *)pair_cgroup_exit, -+ .init = (void *)pair_init, -+ .exit = (void *)pair_exit, -+ .name = "pair", -+}; -diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c -new file mode 100644 -index 000000000..48344af03 ---- /dev/null -+++ b/tools/sched_ext/scx_pair.c -@@ -0,0 +1,168 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_pair.h" -+#include "scx_pair.skel.h" -+ -+const char help_fmt[] = -+"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" -+"execute from the same CPU cgroup.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-S STRIDE] [-p]\n" -+"\n" -+" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_pair *skel; -+ struct bpf_link *link; -+ __u64 seq = 0; -+ __s32 stride, i, opt, outer_fd; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_pair__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); -+ -+ /* pair up the earlier half to the latter by default, override with -s */ -+ stride = skel->rodata->nr_cpu_ids / 2; -+ -+ while ((opt = getopt(argc, argv, "S:ph")) != -1) { -+ switch (opt) { -+ case 'S': -+ stride = strtoul(optarg, NULL, 0); -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2); -+ -+ /* Resize arrays so their element count is equal to cpu count. */ -+ RESIZE_ARRAY(rodata, pair_cpu, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(rodata, pair_id, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(rodata, in_pair_idx, skel->rodata->nr_cpu_ids); -+ -+ for (i = 0; i < skel->rodata->nr_cpu_ids; i++) -+ skel->rodata_pair_cpu->pair_cpu[i] = -1; -+ -+ printf("Pairs: "); -+ for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { -+ int j = (i + stride) % skel->rodata->nr_cpu_ids; -+ -+ if (skel->rodata_pair_cpu->pair_cpu[i] >= 0) -+ continue; -+ -+ SCX_BUG_ON(i == j, -+ "Invalid stride %d - CPU%d wants to be its own pair", -+ stride, i); -+ -+ SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0, -+ "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", -+ stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]); -+ -+ skel->rodata_pair_cpu->pair_cpu[i] = j; -+ skel->rodata_pair_cpu->pair_cpu[j] = i; -+ skel->rodata_pair_id->pair_id[i] = i; -+ skel->rodata_pair_id->pair_id[j] = i; -+ skel->rodata_in_pair_idx->in_pair_idx[i] = 0; -+ skel->rodata_in_pair_idx->in_pair_idx[j] = 1; -+ -+ printf("[%d, %d] ", i, j); -+ } -+ printf("\n"); -+ -+ SCX_BUG_ON(scx_pair__load(skel), "Failed to load skel"); -+ -+ /* -+ * Populate the cgrp_q_arr map which is an array containing per-cgroup -+ * queues. It'd probably be better to do this from BPF but there are too -+ * many to initialize statically and there's no way to dynamically -+ * populate from BPF. 
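-+ * (The loop below creates each inner BPF_MAP_TYPE_QUEUE with bpf_map_create()
-+ * and installs it into the array-of-maps via bpf_map_update_elem(); the inner
-+ * fd can be closed once it has been installed.)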
-+ */ -+ outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); -+ SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd); -+ -+ printf("Initializing"); -+ for (i = 0; i < MAX_CGRPS; i++) { -+ __s32 inner_fd; -+ -+ if (exit_req) -+ break; -+ -+ inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, -+ sizeof(__u32), MAX_QUEUED, NULL); -+ SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d", -+ inner_fd); -+ SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY), -+ "Failed to set inner map"); -+ close(inner_fd); -+ -+ if (!(i % 10)) -+ printf("."); -+ fflush(stdout); -+ } -+ printf("\n"); -+ -+ /* -+ * Fully initialized, attach and run. -+ */ -+ link = bpf_map__attach_struct_ops(skel->maps.pair_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ printf("[SEQ %llu]\n", seq++); -+ printf(" total:%10lu dispatch:%10lu missing:%10lu\n", -+ skel->bss->nr_total, -+ skel->bss->nr_dispatched, -+ skel->bss->nr_missing); -+ printf(" kicks:%10lu preemptions:%7lu\n", -+ skel->bss->nr_kicks, -+ skel->bss->nr_preemptions); -+ printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n", -+ skel->bss->nr_exps, -+ skel->bss->nr_exp_waits, -+ skel->bss->nr_exp_empty); -+ printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n", -+ skel->bss->nr_cgrp_next, -+ skel->bss->nr_cgrp_coll, -+ skel->bss->nr_cgrp_empty); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_pair__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_pair.h b/tools/sched_ext/scx_pair.h -new file mode 100644 -index 000000000..d9666a447 ---- /dev/null -+++ b/tools/sched_ext/scx_pair.h -@@ -0,0 +1,9 @@ -+#ifndef __SCX_EXAMPLE_PAIR_H -+#define __SCX_EXAMPLE_PAIR_H -+ -+enum { -+ MAX_QUEUED = 4096, -+ MAX_CGRPS = 4096, -+}; -+ -+#endif /* __SCX_EXAMPLE_PAIR_H */ -diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c -new file mode 100644 -index 000000000..b6365df0f ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.bpf.c -@@ -0,0 +1,401 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple five-level FIFO queue scheduler. -+ * -+ * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets -+ * assigned to one depending on its compound weight. Each CPU round robins -+ * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from -+ * queue0, 2 from queue1, 4 from queue2 and so on. -+ * -+ * This scheduler demonstrates: -+ * -+ * - BPF-side queueing using PIDs. -+ * - Sleepable per-task storage allocation using ops.prep_enable(). -+ * - Using ops.cpu_release() to handle a higher priority scheduling class taking -+ * the CPU away. -+ * - Core-sched support. -+ * -+ * This scheduler is primarily for demonstration and testing of sched_ext -+ * features and unlikely to be useful for actual workloads. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include "scx_common.bpf.h" -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile bool switch_partial; -+const volatile u32 stall_user_nth; -+const volatile u32 stall_kernel_nth; -+const volatile u32 dsp_inf_loop_after; -+const volatile s32 disallow_tgid; -+ -+u32 test_error_cnt; -+ -+struct user_exit_info uei; -+ -+struct qmap { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, u32); -+} queue0 SEC(".maps"), -+ queue1 SEC(".maps"), -+ queue2 SEC(".maps"), -+ queue3 SEC(".maps"), -+ queue4 SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); -+ __uint(max_entries, 5); -+ __type(key, int); -+ __array(values, struct qmap); -+} queue_arr SEC(".maps") = { -+ .values = { -+ [0] = &queue0, -+ [1] = &queue1, -+ [2] = &queue2, -+ [3] = &queue3, -+ [4] = &queue4, -+ }, -+}; -+ -+/* -+ * Per-queue sequence numbers to implement core-sched ordering. -+ * -+ * Tail seq is assigned to each queued task and incremented. Head seq tracks the -+ * sequence number of the latest dispatched task. The distance between the a -+ * task's seq and the associated queue's head seq is called the queue distance -+ * and used when comparing two tasks for ordering. See qmap_core_sched_before(). -+ */ -+static u64 core_sched_head_seqs[5]; -+static u64 core_sched_tail_seqs[5]; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* Dispatch directly to local_dsq */ -+ u64 core_sched_seq; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+/* Per-cpu dispatch index and remaining count */ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(max_entries, 2); -+ __type(key, u32); -+ __type(value, u64); -+} dispatch_idx_cnt SEC(".maps"); -+ -+/* Statistics */ -+unsigned long nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; -+unsigned long nr_core_sched_execed; -+ -+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ struct task_ctx *tctx; -+ s32 cpu; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return -ESRCH; -+ } -+ -+ if (p->nr_cpus_allowed == 1 || -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ tctx->force_local = true; -+ return prev_cpu; -+ } -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ return cpu; -+ -+ return prev_cpu; -+} -+ -+static int weight_to_idx(u32 weight) -+{ -+ /* Coarsely map the compound weight to a FIFO. 
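-+ * For example, a task with the default weight of 100 falls through to the
-+ * "< 200" branch below and lands on queue2.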
*/ -+ if (weight <= 25) -+ return 0; -+ else if (weight <= 50) -+ return 1; -+ else if (weight < 200) -+ return 2; -+ else if (weight < 400) -+ return 3; -+ else -+ return 4; -+} -+ -+void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ static u32 user_cnt, kernel_cnt; -+ struct task_ctx *tctx; -+ u32 pid = p->pid; -+ int idx = weight_to_idx(p->scx.weight); -+ void *ring; -+ -+ if (p->flags & PF_KTHREAD) { -+ if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) -+ return; -+ } else { -+ if (stall_user_nth && !(++user_cnt % stall_user_nth)) -+ return; -+ } -+ -+ if (test_error_cnt && !--test_error_cnt) -+ scx_bpf_error("test triggering error"); -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * All enqueued tasks must have their core_sched_seq updated for correct -+ * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in -+ * qmap_ops.flags. -+ */ -+ tctx->core_sched_seq = core_sched_tail_seqs[idx]++; -+ -+ /* -+ * If qmap_select_cpu() is telling us to or this is the last runnable -+ * task on the CPU, enqueue locally. -+ */ -+ if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { -+ tctx->force_local = false; -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * If the task was re-enqueued due to the CPU being preempted by a -+ * higher priority scheduling class, just re-enqueue the task directly -+ * on the global DSQ. As we want another CPU to pick it up, find and -+ * kick an idle CPU. -+ */ -+ if (enq_flags & SCX_ENQ_REENQ) { -+ s32 cpu; -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags); -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ scx_bpf_kick_cpu(cpu, 0); -+ return; -+ } -+ -+ ring = bpf_map_lookup_elem(&queue_arr, &idx); -+ if (!ring) { -+ scx_bpf_error("failed to find ring %d", idx); -+ return; -+ } -+ -+ /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ -+ if (bpf_map_push_elem(ring, &pid, 0)) { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_enqueued, 1); -+} -+ -+/* -+ * The BPF queue map doesn't support removal and sched_ext can handle spurious -+ * dispatches. qmap_dequeue() is only used to collect statistics. -+ */ -+void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) -+{ -+ __sync_fetch_and_add(&nr_dequeued, 1); -+ if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) -+ __sync_fetch_and_add(&nr_core_sched_execed, 1); -+} -+ -+static void update_core_sched_head_seq(struct task_struct *p) -+{ -+ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ int idx = weight_to_idx(p->scx.weight); -+ -+ if (tctx) -+ core_sched_head_seqs[idx] = tctx->core_sched_seq; -+ else -+ scx_bpf_error("task_ctx lookup failed"); -+} -+ -+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ u32 zero = 0, one = 1; -+ u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero); -+ u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one); -+ void *fifo; -+ s32 pid; -+ int i; -+ -+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { -+ struct task_struct *p; -+ -+ /* -+ * PID 2 should be kthreadd which should mostly be idle and off -+ * the scheduler. Let's keep dispatching it to force the kernel -+ * to call this function over and over again. 
-+ */ -+ p = bpf_task_from_pid(2); -+ if (p) { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); -+ bpf_task_release(p); -+ return; -+ } -+ } -+ -+ if (!idx || !cnt) { -+ scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt); -+ return; -+ } -+ -+ for (i = 0; i < 5; i++) { -+ /* Advance the dispatch cursor and pick the fifo. */ -+ if (!*cnt) { -+ *idx = (*idx + 1) % 5; -+ *cnt = 1 << *idx; -+ } -+ (*cnt)--; -+ -+ fifo = bpf_map_lookup_elem(&queue_arr, idx); -+ if (!fifo) { -+ scx_bpf_error("failed to find ring %llu", *idx); -+ return; -+ } -+ -+ /* Dispatch or advance. */ -+ if (!bpf_map_pop_elem(fifo, &pid)) { -+ struct task_struct *p; -+ -+ p = bpf_task_from_pid(pid); -+ if (p) { -+ update_core_sched_head_seq(p); -+ __sync_fetch_and_add(&nr_dispatched, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); -+ bpf_task_release(p); -+ return; -+ } -+ } -+ -+ *cnt = 0; -+ } -+} -+ -+/* -+ * The distance from the head of the queue scaled by the weight of the queue. -+ * The lower the number, the older the task and the higher the priority. -+ */ -+static s64 task_qdist(struct task_struct *p) -+{ -+ int idx = weight_to_idx(p->scx.weight); -+ struct task_ctx *tctx; -+ s64 qdist; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return 0; -+ } -+ -+ qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; -+ -+ /* -+ * As queue index increments, the priority doubles. The queue w/ index 3 -+ * is dispatched twice more frequently than 2. Reflect the difference by -+ * scaling qdists accordingly. Note that the shift amount needs to be -+ * flipped depending on the sign to avoid flipping priority direction. -+ */ -+ if (qdist >= 0) -+ return qdist << (4 - idx); -+ else -+ return qdist << idx; -+} -+ -+/* -+ * This is called to determine the task ordering when core-sched is picking -+ * tasks to execute on SMT siblings and should encode about the same ordering as -+ * the regular scheduling path. Use the priority-scaled distances from the head -+ * of the queues to compare the two tasks which should be consistent with the -+ * dispatch path behavior. -+ */ -+bool BPF_STRUCT_OPS(qmap_core_sched_before, -+ struct task_struct *a, struct task_struct *b) -+{ -+ return task_qdist(a) > task_qdist(b); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) -+{ -+ u32 cnt; -+ -+ /* -+ * Called when @cpu is taken by a higher priority scheduling class. This -+ * makes @cpu no longer available for executing sched_ext tasks. As we -+ * don't want the tasks in @cpu's local dsq to sit there until @cpu -+ * becomes available again, re-enqueue them into the global dsq. See -+ * %SCX_ENQ_REENQ handling in qmap_enqueue(). -+ */ -+ cnt = scx_bpf_reenqueue_local(); -+ if (cnt) -+ __sync_fetch_and_add(&nr_reenqueued, cnt); -+} -+ -+s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ if (p->tgid == disallow_tgid) -+ p->scx.disallow = true; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. 
-+ */ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+s32 BPF_STRUCT_OPS(qmap_init) -+{ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops qmap_ops = { -+ .select_cpu = (void *)qmap_select_cpu, -+ .enqueue = (void *)qmap_enqueue, -+ .dequeue = (void *)qmap_dequeue, -+ .dispatch = (void *)qmap_dispatch, -+ .core_sched_before = (void *)qmap_core_sched_before, -+ .cpu_release = (void *)qmap_cpu_release, -+ .prep_enable = (void *)qmap_prep_enable, -+ .init = (void *)qmap_init, -+ .exit = (void *)qmap_exit, -+ .flags = SCX_OPS_ENQ_LAST, -+ .timeout_ms = 5000U, -+ .name = "qmap", -+}; -diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c -new file mode 100644 -index 000000000..edc3d0a4e ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.c -@@ -0,0 +1,105 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_qmap.skel.h" -+ -+const char help_fmt[] = -+"A simple five-level FIFO queue sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID] [-p]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" -+" -t COUNT Stall every COUNT'th user thread\n" -+" -T COUNT Stall every COUNT'th kernel thread\n" -+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" -+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_qmap *skel; -+ struct bpf_link *link; -+ int opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_qmap__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ while ((opt = getopt(argc, argv, "s:e:t:T:l:d:ph")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'e': -+ skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); -+ break; -+ case 't': -+ skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'T': -+ skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'l': -+ skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); -+ break; -+ case 'd': -+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); -+ if (skel->rodata->disallow_tgid < 0) -+ skel->rodata->disallow_tgid = getpid(); -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_BUG_ON(scx_qmap__load(skel), "Failed to load skel"); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.qmap_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ long nr_enqueued = skel->bss->nr_enqueued; -+ 
long nr_dispatched = skel->bss->nr_dispatched; -+ -+ printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu, deq=%lu, core=%lu\n", -+ nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, -+ skel->bss->nr_reenqueued, skel->bss->nr_dequeued, -+ skel->bss->nr_core_sched_execed); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_qmap__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_rusty/.gitignore b/tools/sched_ext/scx_rusty/.gitignore -new file mode 100644 -index 000000000..186dba259 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/.gitignore -@@ -0,0 +1,3 @@ -+src/bpf/.output -+Cargo.lock -+target -diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml -new file mode 100644 -index 000000000..b0edd3b93 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/Cargo.toml -@@ -0,0 +1,28 @@ -+[package] -+name = "scx_rusty" -+version = "0.5.0" -+authors = ["Dan Schatzberg ", "Meta"] -+edition = "2021" -+description = "Userspace scheduling with BPF" -+license = "GPL-2.0-only" -+ -+[dependencies] -+anyhow = "1.0.65" -+bitvec = { version = "1.0", features = ["serde"] } -+clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } -+ctrlc = { version = "3.1", features = ["termination"] } -+fb_procfs = "0.7.0" -+hex = "0.4.3" -+libbpf-rs = "0.21.0" -+libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } -+libc = "0.2.137" -+log = "0.4.17" -+ordered-float = "3.4.0" -+simplelog = "0.12.0" -+ -+[build-dependencies] -+bindgen = { version = "0.61.0" } -+libbpf-cargo = "0.21.0" -+ -+[features] -+enable_backtrace = [] -diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs -new file mode 100644 -index 000000000..c54b8f33c ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/build.rs -@@ -0,0 +1,72 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+extern crate bindgen; -+ -+use std::env; -+use std::fs::create_dir_all; -+use std::path::Path; -+use std::path::PathBuf; -+ -+use libbpf_cargo::SkeletonBuilder; -+ -+const HEADER_PATH: &str = "src/bpf/rusty.h"; -+ -+fn bindgen_rusty() { -+ // Tell cargo to invalidate the built crate whenever the wrapper changes -+ println!("cargo:rerun-if-changed={}", HEADER_PATH); -+ -+ // The bindgen::Builder is the main entry point -+ // to bindgen, and lets you build up options for -+ // the resulting bindings. -+ let bindings = bindgen::Builder::default() -+ // The input header we would like to generate -+ // bindings for. -+ .header(HEADER_PATH) -+ // Tell cargo to invalidate the built crate whenever any of the -+ // included header files changed. -+ .parse_callbacks(Box::new(bindgen::CargoCallbacks)) -+ // Finish the builder and generate the bindings. -+ .generate() -+ // Unwrap the Result and panic on failure. -+ .expect("Unable to generate bindings"); -+ -+ // Write the bindings to the $OUT_DIR/bindings.rs file. 
-+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); -+ bindings -+ .write_to_file(out_path.join("rusty_sys.rs")) -+ .expect("Couldn't write bindings!"); -+} -+ -+fn gen_bpf_sched(name: &str) { -+ let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); -+ let clang = env::var("SCX_RUST_CLANG").unwrap(); -+ eprintln!("{}", clang); -+ let outpath = format!("./src/bpf/.output/{}.skel.rs", name); -+ let skel = Path::new(&outpath); -+ let src = format!("./src/bpf/{}.bpf.c", name); -+ let obj = format!("./src/bpf/.output/{}.bpf.o", name); -+ SkeletonBuilder::new() -+ .source(src.clone()) -+ .obj(obj) -+ .clang(clang) -+ .clang_args(bpf_cflags) -+ .build_and_generate(skel) -+ .unwrap(); -+ println!("cargo:rerun-if-changed={}", src); -+} -+ -+fn main() { -+ bindgen_rusty(); -+ // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. -+ // Reasons are because the generated skeleton contains compiler attributes -+ // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` -+ // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside -+ // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). -+ // -+ // However, there is hope! When the above feature stabilizes we can clean this -+ // all up. -+ create_dir_all("./src/bpf/.output").unwrap(); -+ gen_bpf_sched("rusty"); -+} -diff --git a/tools/sched_ext/scx_rusty/rustfmt.toml b/tools/sched_ext/scx_rusty/rustfmt.toml -new file mode 100644 -index 000000000..b7258ed0a ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/rustfmt.toml -@@ -0,0 +1,8 @@ -+# Get help on options with `rustfmt --help=config` -+# Please keep these in alphabetical order. -+edition = "2021" -+group_imports = "StdExternalCrate" -+imports_granularity = "Item" -+merge_derives = false -+use_field_init_shorthand = true -+version = "Two" -diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c -new file mode 100644 -index 000000000..7a8b27cea ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c -@@ -0,0 +1,1153 @@ -+/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -+/* -+ * This software may be used and distributed according to the terms of the -+ * GNU General Public License version 2. -+ * -+ * scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF -+ * part does simple round robin in each domain and the userspace part -+ * calculates the load factor of each domain and tells the BPF part how to load -+ * balance the domains. -+ * -+ * Every task has an entry in the task_data map which lists which domain the -+ * task belongs to. When a task first enters the system (rusty_prep_enable), -+ * they are round-robined to a domain. -+ * -+ * rusty_select_cpu is the primary scheduling logic, invoked when a task -+ * becomes runnable. The lb_data map is populated by userspace to inform the BPF -+ * scheduler that a task should be migrated to a new domain. Otherwise, the task -+ * is scheduled in priority order as follows: -+ * * The current core if the task was woken up synchronously and there are idle -+ * cpus in the system -+ * * The previous core, if idle -+ * * The pinned-to core if the task is pinned to a specific core -+ * * Any idle cpu in the domain -+ * -+ * If none of the above conditions are met, then the task is enqueued to a -+ * dispatch queue corresponding to the domain (rusty_enqueue). 
-+ * -+ * rusty_dispatch will attempt to consume a task from its domain's -+ * corresponding dispatch queue (this occurs after scheduling any tasks directly -+ * assigned to it due to the logic in rusty_select_cpu). If no task is found, -+ * then greedy load stealing will attempt to find a task on another dispatch -+ * queue to run. -+ * -+ * Load balancing is almost entirely handled by userspace. BPF populates the -+ * task weight, dom mask and current dom in the task_data map and executes the -+ * load balance based on userspace populating the lb_data map. -+ */ -+#include "../../../scx_common.bpf.h" -+#include "../../../ravg_impl.bpf.h" -+#include "rusty.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+/* -+ * const volatiles are set during initialization and treated as consts by the -+ * jit compiler. -+ */ -+ -+/* -+ * Domains and cpus -+ */ -+const volatile u32 nr_doms = 32; /* !0 for veristat, set during init */ -+const volatile u32 nr_cpus = 64; /* !0 for veristat, set during init */ -+const volatile u32 cpu_dom_id_map[MAX_CPUS]; -+const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64]; -+const volatile u32 load_half_life = 1000000000 /* 1s */; -+ -+const volatile bool kthreads_local; -+const volatile bool fifo_sched; -+const volatile bool switch_partial; -+const volatile u32 greedy_threshold; -+const volatile u32 debug; -+ -+/* base slice duration */ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+ -+/* -+ * Exit info -+ */ -+int exit_kind = SCX_EXIT_NONE; -+char exit_msg[SCX_EXIT_MSG_LEN]; -+ -+/* -+ * Per-CPU context -+ */ -+struct pcpu_ctx { -+ u32 dom_rr_cur; /* used when scanning other doms */ -+ -+ /* libbpf-rs does not respect the alignment, so pad out the struct explicitly */ -+ u8 _padding[CACHELINE_SIZE - sizeof(u32)]; -+} __attribute__((aligned(CACHELINE_SIZE))); -+ -+struct pcpu_ctx pcpu_ctx[MAX_CPUS]; -+ -+/* -+ * Domain context -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct dom_ctx); -+ __uint(max_entries, MAX_DOMS); -+ __uint(map_flags, 0); -+} dom_data SEC(".maps"); -+ -+struct lock_wrapper { -+ struct bpf_spin_lock lock; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct lock_wrapper); -+ __uint(max_entries, MAX_DOMS); -+ __uint(map_flags, 0); -+} dom_load_locks SEC(".maps"); -+ -+struct dom_active_pids { -+ u64 gen; -+ u64 read_idx; -+ u64 write_idx; -+ s32 pids[MAX_DOM_ACTIVE_PIDS]; -+}; -+ -+struct dom_active_pids dom_active_pids[MAX_DOMS]; -+ -+const u64 ravg_1 = 1 << RAVG_FRAC_BITS; -+ -+static void dom_load_adj(u32 dom_id, s64 adj, u64 now) -+{ -+ struct dom_ctx *domc; -+ struct lock_wrapper *lockw; -+ -+ domc = bpf_map_lookup_elem(&dom_data, &dom_id); -+ lockw = bpf_map_lookup_elem(&dom_load_locks, &dom_id); -+ -+ if (!domc || !lockw) { -+ scx_bpf_error("dom_ctx / lock lookup failed"); -+ return; -+ } -+ -+ bpf_spin_lock(&lockw->lock); -+ domc->load += adj; -+ ravg_accumulate(&domc->load_rd, domc->load, now, load_half_life); -+ bpf_spin_unlock(&lockw->lock); -+ -+ if (adj < 0 && (s64)domc->load < 0) -+ scx_bpf_error("cpu%d dom%u load underflow (load=%lld adj=%lld)", -+ bpf_get_smp_processor_id(), dom_id, domc->load, adj); -+ -+ if (debug >=2 && -+ (!domc->dbg_load_printed_at || now - domc->dbg_load_printed_at >= 1000000000)) { -+ bpf_printk("LOAD ADJ dom=%u adj=%lld load=%llu", -+ dom_id, -+ adj, -+ ravg_read(&domc->load_rd, now, load_half_life) >> RAVG_FRAC_BITS); -+ 
domc->dbg_load_printed_at = now; -+ } -+} -+ -+static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, -+ u32 from_dom_id, u32 to_dom_id, u64 now) -+{ -+ struct dom_ctx *from_domc, *to_domc; -+ struct lock_wrapper *from_lockw, *to_lockw; -+ struct ravg_data task_load_rd; -+ u64 from_load[2], to_load[2], task_load; -+ -+ from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id); -+ from_lockw = bpf_map_lookup_elem(&dom_load_locks, &from_dom_id); -+ to_domc = bpf_map_lookup_elem(&dom_data, &to_dom_id); -+ to_lockw = bpf_map_lookup_elem(&dom_load_locks, &to_dom_id); -+ if (!from_domc || !from_lockw || !to_domc || !to_lockw) { -+ scx_bpf_error("dom_ctx / lock lookup failed"); -+ return; -+ } -+ -+ /* -+ * @p is moving from @from_dom_id to @to_dom_id. Its load contribution -+ * should be moved together. We only track duty cycle for tasks. Scale -+ * it by weight to get load_rd. -+ */ -+ ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); -+ task_load_rd = taskc->dcyc_rd; -+ ravg_scale(&task_load_rd, p->scx.weight, 0); -+ -+ if (debug >= 2) -+ task_load = ravg_read(&task_load_rd, now, load_half_life); -+ -+ /* transfer out of @from_dom_id */ -+ bpf_spin_lock(&from_lockw->lock); -+ if (taskc->runnable) -+ from_domc->load -= p->scx.weight; -+ -+ if (debug >= 2) -+ from_load[0] = ravg_read(&from_domc->load_rd, now, load_half_life); -+ -+ ravg_transfer(&from_domc->load_rd, from_domc->load, -+ &task_load_rd, taskc->runnable, load_half_life, false); -+ -+ if (debug >= 2) -+ from_load[1] = ravg_read(&from_domc->load_rd, now, load_half_life); -+ -+ bpf_spin_unlock(&from_lockw->lock); -+ -+ /* transfer into @to_dom_id */ -+ bpf_spin_lock(&to_lockw->lock); -+ if (taskc->runnable) -+ to_domc->load += p->scx.weight; -+ -+ if (debug >= 2) -+ to_load[0] = ravg_read(&to_domc->load_rd, now, load_half_life); -+ -+ ravg_transfer(&to_domc->load_rd, to_domc->load, -+ &task_load_rd, taskc->runnable, load_half_life, true); -+ -+ if (debug >= 2) -+ to_load[1] = ravg_read(&to_domc->load_rd, now, load_half_life); -+ -+ bpf_spin_unlock(&to_lockw->lock); -+ -+ if (debug >= 2) -+ bpf_printk("XFER dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu", -+ from_dom_id, to_dom_id, -+ task_load >> RAVG_FRAC_BITS, -+ from_load[0] >> RAVG_FRAC_BITS, -+ from_load[1] >> RAVG_FRAC_BITS, -+ to_load[0] >> RAVG_FRAC_BITS, -+ to_load[1] >> RAVG_FRAC_BITS); -+} -+ -+/* -+ * Statistics -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, RUSTY_NR_STATS); -+} stats SEC(".maps"); -+ -+static inline void stat_add(enum stat_idx idx, u64 addend) -+{ -+ u32 idx_v = idx; -+ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); -+ if (cnt_p) -+ (*cnt_p) += addend; -+} -+ -+/* Map pid -> task_ctx */ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __type(key, pid_t); -+ __type(value, struct task_ctx); -+ __uint(max_entries, 1000000); -+ __uint(map_flags, 0); -+} task_data SEC(".maps"); -+ -+struct task_ctx *lookup_task_ctx(struct task_struct *p) -+{ -+ struct task_ctx *taskc; -+ s32 pid = p->pid; -+ -+ if ((taskc = bpf_map_lookup_elem(&task_data, &pid))) { -+ return taskc; -+ } else { -+ scx_bpf_error("task_ctx lookup failed for pid %d", p->pid); -+ return NULL; -+ } -+} -+ -+/* -+ * This is populated from userspace to indicate which pids should be reassigned -+ * to new doms. 
-+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __type(key, pid_t); -+ __type(value, u32); -+ __uint(max_entries, 1000); -+ __uint(map_flags, 0); -+} lb_data SEC(".maps"); -+ -+/* -+ * Userspace tuner will frequently update the following struct with tuning -+ * parameters and bump its gen. refresh_tune_params() converts them into forms -+ * that can be used directly in the scheduling paths. -+ */ -+struct tune_input{ -+ u64 gen; -+ u64 direct_greedy_cpumask[MAX_CPUS / 64]; -+ u64 kick_greedy_cpumask[MAX_CPUS / 64]; -+} tune_input; -+ -+u64 tune_params_gen; -+private(A) struct bpf_cpumask __kptr *all_cpumask; -+private(A) struct bpf_cpumask __kptr *direct_greedy_cpumask; -+private(A) struct bpf_cpumask __kptr *kick_greedy_cpumask; -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static u32 cpu_to_dom_id(s32 cpu) -+{ -+ const volatile u32 *dom_idp; -+ -+ if (nr_doms <= 1) -+ return 0; -+ -+ dom_idp = MEMBER_VPTR(cpu_dom_id_map, [cpu]); -+ if (!dom_idp) -+ return MAX_DOMS; -+ -+ return *dom_idp; -+} -+ -+static void refresh_tune_params(void) -+{ -+ s32 cpu; -+ -+ if (tune_params_gen == tune_input.gen) -+ return; -+ -+ tune_params_gen = tune_input.gen; -+ -+ bpf_for(cpu, 0, nr_cpus) { -+ u32 dom_id = cpu_to_dom_id(cpu); -+ struct dom_ctx *domc; -+ -+ if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) { -+ scx_bpf_error("Failed to lookup dom[%u]", dom_id); -+ return; -+ } -+ -+ if (tune_input.direct_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) { -+ if (direct_greedy_cpumask) -+ bpf_cpumask_set_cpu(cpu, direct_greedy_cpumask); -+ if (domc->direct_greedy_cpumask) -+ bpf_cpumask_set_cpu(cpu, domc->direct_greedy_cpumask); -+ } else { -+ if (direct_greedy_cpumask) -+ bpf_cpumask_clear_cpu(cpu, direct_greedy_cpumask); -+ if (domc->direct_greedy_cpumask) -+ bpf_cpumask_clear_cpu(cpu, domc->direct_greedy_cpumask); -+ } -+ -+ if (tune_input.kick_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) { -+ if (kick_greedy_cpumask) -+ bpf_cpumask_set_cpu(cpu, kick_greedy_cpumask); -+ } else { -+ if (kick_greedy_cpumask) -+ bpf_cpumask_clear_cpu(cpu, kick_greedy_cpumask); -+ } -+ } -+} -+ -+static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, -+ u32 new_dom_id, bool init_dsq_vtime) -+{ -+ struct dom_ctx *old_domc, *new_domc; -+ struct bpf_cpumask *d_cpumask, *t_cpumask; -+ u32 old_dom_id = taskc->dom_id; -+ s64 vtime_delta; -+ -+ old_domc = bpf_map_lookup_elem(&dom_data, &old_dom_id); -+ if (!old_domc) { -+ scx_bpf_error("Failed to lookup old dom%u", old_dom_id); -+ return false; -+ } -+ -+ if (init_dsq_vtime) -+ vtime_delta = 0; -+ else -+ vtime_delta = p->scx.dsq_vtime - old_domc->vtime_now; -+ -+ new_domc = bpf_map_lookup_elem(&dom_data, &new_dom_id); -+ if (!new_domc) { -+ scx_bpf_error("Failed to lookup new dom%u", new_dom_id); -+ return false; -+ } -+ -+ d_cpumask = new_domc->cpumask; -+ if (!d_cpumask) { -+ scx_bpf_error("Failed to get dom%u cpumask kptr", -+ new_dom_id); -+ return false; -+ } -+ -+ t_cpumask = taskc->cpumask; -+ if (!t_cpumask) { -+ scx_bpf_error("Failed to look up task cpumask"); -+ return false; -+ } -+ -+ /* -+ * set_cpumask might have happened between userspace requesting LB and -+ * here and @p might not be able to run in @dom_id anymore. Verify. 
-+ */ -+ if (bpf_cpumask_intersects((const struct cpumask *)d_cpumask, -+ p->cpus_ptr)) { -+ u64 now = bpf_ktime_get_ns(); -+ -+ dom_load_xfer_task(p, taskc, taskc->dom_id, new_dom_id, now); -+ -+ p->scx.dsq_vtime = new_domc->vtime_now + vtime_delta; -+ taskc->dom_id = new_dom_id; -+ bpf_cpumask_and(t_cpumask, (const struct cpumask *)d_cpumask, -+ p->cpus_ptr); -+ } -+ -+ return taskc->dom_id == new_dom_id; -+} -+ -+s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags) -+{ -+ const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask(); -+ struct task_ctx *taskc; -+ struct bpf_cpumask *p_cpumask; -+ bool prev_domestic, has_idle_cores; -+ s32 cpu; -+ -+ refresh_tune_params(); -+ -+ if (!(taskc = lookup_task_ctx(p)) || !(p_cpumask = taskc->cpumask)) -+ goto enoent; -+ -+ if (kthreads_local && -+ (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { -+ cpu = prev_cpu; -+ stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); -+ goto direct; -+ } -+ -+ /* -+ * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the -+ * local dsq of the waker. -+ */ -+ if (p->nr_cpus_allowed > 1 && (wake_flags & SCX_WAKE_SYNC)) { -+ struct task_struct *current = (void *)bpf_get_current_task(); -+ -+ if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && -+ taskc->dom_id < MAX_DOMS) { -+ struct dom_ctx *domc; -+ struct bpf_cpumask *d_cpumask; -+ const struct cpumask *idle_cpumask; -+ bool has_idle; -+ -+ domc = bpf_map_lookup_elem(&dom_data, &taskc->dom_id); -+ if (!domc) { -+ scx_bpf_error("Failed to find dom%u", taskc->dom_id); -+ goto enoent; -+ } -+ d_cpumask = domc->cpumask; -+ if (!d_cpumask) { -+ scx_bpf_error("Failed to acquire dom%u cpumask kptr", -+ taskc->dom_id); -+ goto enoent; -+ } -+ -+ idle_cpumask = scx_bpf_get_idle_cpumask(); -+ -+ has_idle = bpf_cpumask_intersects((const struct cpumask *)d_cpumask, -+ idle_cpumask); -+ -+ scx_bpf_put_idle_cpumask(idle_cpumask); -+ -+ if (has_idle) { -+ cpu = bpf_get_smp_processor_id(); -+ if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ stat_add(RUSTY_STAT_WAKE_SYNC, 1); -+ goto direct; -+ } -+ } -+ } -+ } -+ -+ /* If only one CPU is allowed, dispatch */ -+ if (p->nr_cpus_allowed == 1) { -+ stat_add(RUSTY_STAT_PINNED, 1); -+ cpu = prev_cpu; -+ goto direct; -+ } -+ -+ has_idle_cores = !bpf_cpumask_empty(idle_smtmask); -+ -+ /* did @p get pulled out to a foreign domain by e.g. greedy execution? */ -+ prev_domestic = bpf_cpumask_test_cpu(prev_cpu, -+ (const struct cpumask *)p_cpumask); -+ -+ /* -+ * See if we want to keep @prev_cpu. We want to keep @prev_cpu if the -+ * whole physical core is idle. If the sibling[s] are busy, it's likely -+ * more advantageous to look for wholly idle cores first. -+ */ -+ if (prev_domestic) { -+ if (bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ stat_add(RUSTY_STAT_PREV_IDLE, 1); -+ cpu = prev_cpu; -+ goto direct; -+ } -+ } else { -+ /* -+ * @prev_cpu is foreign. Linger iff the domain isn't too busy as -+ * indicated by direct_greedy_cpumask. There may also be an idle -+ * CPU in the domestic domain -+ */ -+ if (direct_greedy_cpumask && -+ bpf_cpumask_test_cpu(prev_cpu, (const struct cpumask *) -+ direct_greedy_cpumask) && -+ bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ stat_add(RUSTY_STAT_GREEDY_IDLE, 1); -+ cpu = prev_cpu; -+ goto direct; -+ } -+ } -+ -+ /* -+ * @prev_cpu didn't work out. Let's see whether there's an idle CPU @p -+ * can be directly dispatched to. 
We'll first try to find the best idle -+ * domestic CPU and then move onto foreign. -+ */ -+ -+ /* If there is a domestic idle core, dispatch directly */ -+ if (has_idle_cores) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); -+ goto direct; -+ } -+ } -+ -+ /* -+ * If @prev_cpu was domestic and is idle itself even though the core -+ * isn't, picking @prev_cpu may improve L1/2 locality. -+ */ -+ if (prev_domestic && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); -+ cpu = prev_cpu; -+ goto direct; -+ } -+ -+ /* If there is any domestic idle CPU, dispatch directly */ -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); -+ goto direct; -+ } -+ -+ /* -+ * Domestic domain is fully booked. If there are CPUs which are idle and -+ * under-utilized, ignore domain boundaries and push the task there. Try -+ * to find an idle core first. -+ */ -+ if (taskc->all_cpus && direct_greedy_cpumask && -+ !bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) { -+ u32 dom_id = cpu_to_dom_id(prev_cpu); -+ struct dom_ctx *domc; -+ -+ if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) { -+ scx_bpf_error("Failed to lookup dom[%u]", dom_id); -+ goto enoent; -+ } -+ -+ /* Try to find an idle core in the previous and then any domain */ -+ if (has_idle_cores) { -+ if (domc->direct_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ domc->direct_greedy_cpumask, -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_GREEDY, 1); -+ goto direct; -+ } -+ } -+ -+ if (direct_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ direct_greedy_cpumask, -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1); -+ goto direct; -+ } -+ } -+ } -+ -+ /* -+ * No idle core. Is there any idle CPU? -+ */ -+ if (domc->direct_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ domc->direct_greedy_cpumask, 0); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_GREEDY, 1); -+ goto direct; -+ } -+ } -+ -+ if (direct_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ direct_greedy_cpumask, 0); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1); -+ goto direct; -+ } -+ } -+ } -+ -+ /* -+ * We're going to queue on the domestic domain's DSQ. @prev_cpu may be -+ * in a different domain. Returning an out-of-domain CPU can lead to -+ * stalls as all in-domain CPUs may be idle by the time @p gets -+ * enqueued. -+ */ -+ if (prev_domestic) -+ cpu = prev_cpu; -+ else -+ cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); -+ -+ scx_bpf_put_idle_cpumask(idle_smtmask); -+ return cpu; -+ -+direct: -+ taskc->dispatch_local = true; -+ scx_bpf_put_idle_cpumask(idle_smtmask); -+ return cpu; -+ -+enoent: -+ scx_bpf_put_idle_cpumask(idle_smtmask); -+ return -ENOENT; -+} -+ -+void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct task_ctx *taskc; -+ struct bpf_cpumask *p_cpumask; -+ pid_t pid = p->pid; -+ u32 *new_dom; -+ s32 cpu; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ if (!(p_cpumask = taskc->cpumask)) { -+ scx_bpf_error("NULL cpmask"); -+ return; -+ } -+ -+ /* -+ * Migrate @p to a new domain if requested by userland through lb_data. 
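-+ * lb_data maps pid -> target dom_id and is written by the userspace load
-+ * balancer; task_set_domain() re-checks that the target domain still
-+ * intersects @p's cpumask before the move is committed.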
-+ */ -+ new_dom = bpf_map_lookup_elem(&lb_data, &pid); -+ if (new_dom && *new_dom != taskc->dom_id && -+ task_set_domain(taskc, p, *new_dom, false)) { -+ stat_add(RUSTY_STAT_LOAD_BALANCE, 1); -+ taskc->dispatch_local = false; -+ cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); -+ if (cpu >= 0) -+ scx_bpf_kick_cpu(cpu, 0); -+ goto dom_queue; -+ } -+ -+ if (taskc->dispatch_local) { -+ taskc->dispatch_local = false; -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * @p is about to be queued on its domain's dsq. However, @p may be on a -+ * foreign CPU due to a greedy execution and not have gone through -+ * ->select_cpu() if it's being enqueued e.g. after slice exhaustion. If -+ * so, @p would be queued on its domain's dsq but none of the CPUs in -+ * the domain would be woken up which can induce temporary execution -+ * stalls. Kick a domestic CPU if @p is on a foreign domain. -+ */ -+ if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), (const struct cpumask *)p_cpumask)) { -+ cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); -+ scx_bpf_kick_cpu(cpu, 0); -+ stat_add(RUSTY_STAT_REPATRIATE, 1); -+ } -+ -+dom_queue: -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, taskc->dom_id, slice_ns, enq_flags); -+ } else { -+ u64 vtime = p->scx.dsq_vtime; -+ u32 dom_id = taskc->dom_id; -+ struct dom_ctx *domc; -+ -+ domc = bpf_map_lookup_elem(&dom_data, &dom_id); -+ if (!domc) { -+ scx_bpf_error("Failed to lookup dom[%u]", dom_id); -+ return; -+ } -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, domc->vtime_now - slice_ns)) -+ vtime = domc->vtime_now - slice_ns; -+ -+ scx_bpf_dispatch_vtime(p, taskc->dom_id, slice_ns, vtime, enq_flags); -+ } -+ -+ /* -+ * If there are CPUs which are idle and not saturated, wake them up to -+ * see whether they'd be able to steal the just queued task. This path -+ * is taken only if DIRECT_GREEDY didn't trigger in select_cpu(). -+ * -+ * While both mechanisms serve very similar purposes, DIRECT_GREEDY -+ * emplaces the task in a foreign CPU directly while KICK_GREEDY just -+ * wakes up a foreign CPU which will then first try to execute from its -+ * domestic domain first before snooping foreign ones. -+ * -+ * While KICK_GREEDY is a more expensive way of accelerating greedy -+ * execution, DIRECT_GREEDY shows negative performance impacts when the -+ * CPUs are highly loaded while KICK_GREEDY doesn't. Even under fairly -+ * high utilization, KICK_GREEDY can slightly improve work-conservation. 
-+ */ -+ if (taskc->all_cpus && kick_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ kick_greedy_cpumask, 0); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_KICK_GREEDY, 1); -+ scx_bpf_kick_cpu(cpu, 0); -+ } -+ } -+} -+ -+static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id) -+{ -+ s32 cpu; -+ -+ if (dom_id >= MAX_DOMS) -+ return false; -+ -+ bpf_for(cpu, 0, nr_cpus) { -+ if (bpf_cpumask_test_cpu(cpu, cpumask) && -+ (dom_cpumasks[dom_id][cpu / 64] & (1LLU << (cpu % 64)))) -+ return true; -+ } -+ return false; -+} -+ -+static u32 dom_rr_next(s32 cpu) -+{ -+ struct pcpu_ctx *pcpuc; -+ u32 dom_id; -+ -+ pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]); -+ if (!pcpuc) -+ return 0; -+ -+ dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms; -+ -+ if (dom_id == cpu_to_dom_id(cpu)) -+ dom_id = (dom_id + 1) % nr_doms; -+ -+ pcpuc->dom_rr_cur = dom_id; -+ return dom_id; -+} -+ -+void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ u32 dom = cpu_to_dom_id(cpu); -+ -+ if (scx_bpf_consume(dom)) { -+ stat_add(RUSTY_STAT_DSQ_DISPATCH, 1); -+ return; -+ } -+ -+ if (!greedy_threshold) -+ return; -+ -+ bpf_repeat(nr_doms - 1) { -+ u32 dom_id = dom_rr_next(cpu); -+ -+ if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold && -+ scx_bpf_consume(dom_id)) { -+ stat_add(RUSTY_STAT_GREEDY, 1); -+ break; -+ } -+ } -+} -+ -+void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ struct task_ctx *taskc; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ taskc->runnable = true; -+ taskc->is_kworker = p->flags & PF_WQ_WORKER; -+ -+ ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); -+ dom_load_adj(taskc->dom_id, p->scx.weight, now); -+} -+ -+void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) -+{ -+ struct task_ctx *taskc; -+ struct dom_ctx *domc; -+ u32 dom_id, dap_gen; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ taskc->running_at = bpf_ktime_get_ns(); -+ dom_id = taskc->dom_id; -+ if (dom_id >= MAX_DOMS) { -+ scx_bpf_error("Invalid dom ID"); -+ return; -+ } -+ -+ /* -+ * Record that @p has been active in @domc. Load balancer will only -+ * consider recently active tasks. Access synchronization rules aren't -+ * strict. We just need to be right most of the time. -+ */ -+ dap_gen = dom_active_pids[dom_id].gen; -+ if (taskc->dom_active_pids_gen != dap_gen) { -+ u64 idx = __sync_fetch_and_add(&dom_active_pids[dom_id].write_idx, 1) % -+ MAX_DOM_ACTIVE_PIDS; -+ s32 *pidp; -+ -+ pidp = MEMBER_VPTR(dom_active_pids, [dom_id].pids[idx]); -+ if (!pidp) { -+ scx_bpf_error("dom_active_pids[%u][%llu] indexing failed", -+ dom_id, idx); -+ return; -+ } -+ -+ *pidp = p->pid; -+ taskc->dom_active_pids_gen = dap_gen; -+ } -+ -+ if (fifo_sched) -+ return; -+ -+ domc = bpf_map_lookup_elem(&dom_data, &dom_id); -+ if (!domc) { -+ scx_bpf_error("Failed to lookup dom[%u]", dom_id); -+ return; -+ } -+ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. 
-+ */ -+ if (vtime_before(domc->vtime_now, p->scx.dsq_vtime)) -+ domc->vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable) -+{ -+ struct task_ctx *taskc; -+ -+ if (fifo_sched) -+ return; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ /* scale the execution time by the inverse of the weight and charge */ -+ p->scx.dsq_vtime += -+ (bpf_ktime_get_ns() - taskc->running_at) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ struct task_ctx *taskc; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ taskc->runnable = false; -+ -+ ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); -+ dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); -+} -+ -+void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight) -+{ -+ struct task_ctx *taskc; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ taskc->weight = weight; -+} -+ -+static u32 task_pick_domain(struct task_ctx *taskc, struct task_struct *p, -+ const struct cpumask *cpumask) -+{ -+ s32 cpu = bpf_get_smp_processor_id(); -+ u32 first_dom = MAX_DOMS, dom; -+ -+ if (cpu < 0 || cpu >= MAX_CPUS) -+ return MAX_DOMS; -+ -+ taskc->dom_mask = 0; -+ -+ dom = pcpu_ctx[cpu].dom_rr_cur++; -+ bpf_repeat(nr_doms) { -+ dom = (dom + 1) % nr_doms; -+ if (cpumask_intersects_domain(cpumask, dom)) { -+ taskc->dom_mask |= 1LLU << dom; -+ /* -+ * AsThe starting point is round-robin'd and the first -+ * match should be spread across all the domains. -+ */ -+ if (first_dom == MAX_DOMS) -+ first_dom = dom; -+ } -+ } -+ -+ return first_dom; -+} -+ -+static void task_pick_and_set_domain(struct task_ctx *taskc, -+ struct task_struct *p, -+ const struct cpumask *cpumask, -+ bool init_dsq_vtime) -+{ -+ u32 dom_id = 0; -+ -+ if (nr_doms > 1) -+ dom_id = task_pick_domain(taskc, p, cpumask); -+ -+ if (!task_set_domain(taskc, p, dom_id, init_dsq_vtime)) -+ scx_bpf_error("Failed to set dom%d for %s[%d]", -+ dom_id, p->comm, p->pid); -+} -+ -+void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, -+ const struct cpumask *cpumask) -+{ -+ struct task_ctx *taskc; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ task_pick_and_set_domain(taskc, p, cpumask, false); -+ if (all_cpumask) -+ taskc->all_cpus = -+ bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); -+} -+ -+s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ struct bpf_cpumask *cpumask; -+ struct task_ctx taskc = { .dom_active_pids_gen = -1 }; -+ struct task_ctx *map_value; -+ long ret; -+ pid_t pid; -+ -+ pid = p->pid; -+ ret = bpf_map_update_elem(&task_data, &pid, &taskc, BPF_NOEXIST); -+ if (ret) { -+ stat_add(RUSTY_STAT_TASK_GET_ERR, 1); -+ return ret; -+ } -+ -+ /* -+ * Read the entry from the map immediately so we can add the cpumask -+ * with bpf_kptr_xchg(). -+ */ -+ map_value = bpf_map_lookup_elem(&task_data, &pid); -+ if (!map_value) -+ /* Should never happen -- it was just inserted above. */ -+ return -EINVAL; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) { -+ bpf_map_delete_elem(&task_data, &pid); -+ return -ENOMEM; -+ } -+ -+ cpumask = bpf_kptr_xchg(&map_value->cpumask, cpumask); -+ if (cpumask) { -+ /* Should never happen as we just inserted it above. 
*/ -+ bpf_cpumask_release(cpumask); -+ bpf_map_delete_elem(&task_data, &pid); -+ return -EINVAL; -+ } -+ -+ task_pick_and_set_domain(map_value, p, p->cpus_ptr, true); -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p) -+{ -+ pid_t pid = p->pid; -+ long ret = bpf_map_delete_elem(&task_data, &pid); -+ if (ret) { -+ stat_add(RUSTY_STAT_TASK_GET_ERR, 1); -+ return; -+ } -+} -+ -+static s32 create_dom(u32 dom_id) -+{ -+ struct dom_ctx domc_init = {}, *domc; -+ struct bpf_cpumask *cpumask; -+ u32 cpu; -+ s32 ret; -+ -+ ret = scx_bpf_create_dsq(dom_id, -1); -+ if (ret < 0) { -+ scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret); -+ return ret; -+ } -+ -+ ret = bpf_map_update_elem(&dom_data, &dom_id, &domc_init, 0); -+ if (ret) { -+ scx_bpf_error("Failed to add dom_ctx entry %u (%d)", dom_id, ret); -+ return ret; -+ } -+ -+ domc = bpf_map_lookup_elem(&dom_data, &dom_id); -+ if (!domc) { -+ /* Should never happen, we just inserted it above. */ -+ scx_bpf_error("No dom%u", dom_id); -+ return -ENOENT; -+ } -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) { -+ scx_bpf_error("Failed to create BPF cpumask for domain %u", dom_id); -+ return -ENOMEM; -+ } -+ -+ for (cpu = 0; cpu < MAX_CPUS; cpu++) { -+ const volatile u64 *dmask; -+ -+ dmask = MEMBER_VPTR(dom_cpumasks, [dom_id][cpu / 64]); -+ if (!dmask) { -+ scx_bpf_error("array index error"); -+ bpf_cpumask_release(cpumask); -+ return -ENOENT; -+ } -+ -+ if (*dmask & (1LLU << (cpu % 64))) { -+ bpf_cpumask_set_cpu(cpu, cpumask); -+ -+ bpf_rcu_read_lock(); -+ if (all_cpumask) -+ bpf_cpumask_set_cpu(cpu, all_cpumask); -+ bpf_rcu_read_unlock(); -+ } -+ } -+ -+ cpumask = bpf_kptr_xchg(&domc->cpumask, cpumask); -+ if (cpumask) { -+ scx_bpf_error("Domain %u cpumask already present", dom_id); -+ bpf_cpumask_release(cpumask); -+ return -EEXIST; -+ } -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) { -+ scx_bpf_error("Failed to create BPF cpumask for domain %u", -+ dom_id); -+ return -ENOMEM; -+ } -+ -+ cpumask = bpf_kptr_xchg(&domc->direct_greedy_cpumask, cpumask); -+ if (cpumask) { -+ scx_bpf_error("Domain %u direct_greedy_cpumask already present", -+ dom_id); -+ bpf_cpumask_release(cpumask); -+ return -EEXIST; -+ } -+ -+ return 0; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) -+{ -+ struct bpf_cpumask *cpumask; -+ s32 i, ret; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ cpumask = bpf_kptr_xchg(&direct_greedy_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ cpumask = bpf_kptr_xchg(&kick_greedy_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ -+ bpf_for(i, 0, nr_doms) { -+ ret = create_dom(i); -+ if (ret) -+ return ret; -+ } -+ -+ bpf_for(i, 0, nr_cpus) -+ pcpu_ctx[i].dom_rr_cur = i; -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(rusty_exit, struct scx_exit_info *ei) -+{ -+ bpf_probe_read_kernel_str(exit_msg, sizeof(exit_msg), ei->msg); -+ exit_kind = ei->kind; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops rusty = { -+ .select_cpu = (void *)rusty_select_cpu, -+ .enqueue = (void *)rusty_enqueue, -+ .dispatch = (void *)rusty_dispatch, -+ .runnable = (void *)rusty_runnable, -+ .running = (void *)rusty_running, -+ .stopping = (void 
*)rusty_stopping, -+ .quiescent = (void *)rusty_quiescent, -+ .set_weight = (void *)rusty_set_weight, -+ .set_cpumask = (void *)rusty_set_cpumask, -+ .prep_enable = (void *)rusty_prep_enable, -+ .disable = (void *)rusty_disable, -+ .init = (void *)rusty_init, -+ .exit = (void *)rusty_exit, -+ .name = "rusty", -+}; -diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h -new file mode 100644 -index 000000000..8a7487cf4 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h -@@ -0,0 +1,97 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+#ifndef __RUSTY_H -+#define __RUSTY_H -+ -+#include -+#ifndef __kptr -+#ifdef __KERNEL__ -+#error "__kptr_ref not defined in the kernel" -+#endif -+#define __kptr -+#endif -+ -+#ifndef __KERNEL__ -+typedef unsigned char u8; -+typedef unsigned int u32; -+typedef unsigned long long u64; -+#endif -+ -+#include "../../../ravg.bpf.h" -+ -+enum consts { -+ MAX_CPUS = 512, -+ MAX_DOMS = 64, /* limited to avoid complex bitmask ops */ -+ CACHELINE_SIZE = 64, -+ -+ /* -+ * When userspace load balancer is trying to determine the tasks to push -+ * out from an overloaded domain, it looks at the the following number -+ * of recently active tasks of the domain. While this may lead to -+ * spurious migration victim selection failures in pathological cases, -+ * this isn't a practical problem as the LB rounds are best-effort -+ * anyway and will be retried until loads are balanced. -+ */ -+ MAX_DOM_ACTIVE_PIDS = 1024, -+}; -+ -+/* Statistics */ -+enum stat_idx { -+ /* The following fields add up to all dispatched tasks */ -+ RUSTY_STAT_WAKE_SYNC, -+ RUSTY_STAT_PREV_IDLE, -+ RUSTY_STAT_GREEDY_IDLE, -+ RUSTY_STAT_PINNED, -+ RUSTY_STAT_DIRECT_DISPATCH, -+ RUSTY_STAT_DIRECT_GREEDY, -+ RUSTY_STAT_DIRECT_GREEDY_FAR, -+ RUSTY_STAT_DSQ_DISPATCH, -+ RUSTY_STAT_GREEDY, -+ -+ /* Extra stats that don't contribute to total */ -+ RUSTY_STAT_REPATRIATE, -+ RUSTY_STAT_KICK_GREEDY, -+ RUSTY_STAT_LOAD_BALANCE, -+ -+ /* Errors */ -+ RUSTY_STAT_TASK_GET_ERR, -+ -+ RUSTY_NR_STATS, -+}; -+ -+struct task_ctx { -+ /* The domains this task can run on */ -+ u64 dom_mask; -+ -+ struct bpf_cpumask __kptr *cpumask; -+ u32 dom_id; -+ u32 weight; -+ bool runnable; -+ u64 dom_active_pids_gen; -+ u64 running_at; -+ -+ /* The task is a workqueue worker thread */ -+ bool is_kworker; -+ -+ /* Allowed on all CPUs and eligible for DIRECT_GREEDY optimization */ -+ bool all_cpus; -+ -+ /* select_cpu() telling enqueue() to queue directly on the DSQ */ -+ bool dispatch_local; -+ -+ struct ravg_data dcyc_rd; -+}; -+ -+struct dom_ctx { -+ u64 vtime_now; -+ struct bpf_cpumask __kptr *cpumask; -+ struct bpf_cpumask __kptr *direct_greedy_cpumask; -+ -+ u64 load; -+ struct ravg_data load_rd; -+ u64 dbg_load_printed_at; -+}; -+ -+#endif /* __RUSTY_H */ -diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs -new file mode 100644 -index 000000000..3b0bcd742 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/src/main.rs -@@ -0,0 +1,1265 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. 
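[Editor's aside, not part of the patch] The dom_active_pids ring declared in rusty.h above (MAX_DOM_ACTIVE_PIDS = 1024) is filled by the BPF side in rusty_running() and drained by populate_tasks_by_load() further down in this file. The following standalone Rust sketch shows the overwrite-ring contract those two sides rely on -- the writer only advances write_idx and overwrites slots modulo the capacity, while the reader clamps itself to the newest MAX_DOM_ACTIVE_PIDS entries before draining. The names and the 1024 limit mirror the patch, but the code itself is illustrative only and is not part of the scheduler.

    // Standalone sketch (not part of the patch) of the "recently active PIDs"
    // ring described in rusty.h. Writer: unconditional overwrite, monotonic
    // write_idx. Reader: skip ahead if lapped, then drain everything up to
    // write_idx.
    const MAX_DOM_ACTIVE_PIDS: u64 = 1024;

    struct ActivePids {
        read_idx: u64,
        write_idx: u64,
        pids: [i32; MAX_DOM_ACTIVE_PIDS as usize],
    }

    impl ActivePids {
        fn record(&mut self, pid: i32) {
            // Mirrors rusty_running(): always overwrite the next slot.
            let idx = (self.write_idx % MAX_DOM_ACTIVE_PIDS) as usize;
            self.pids[idx] = pid;
            self.write_idx += 1;
        }

        fn drain(&mut self) -> Vec<i32> {
            // Mirrors populate_tasks_by_load(): if the writer lapped us,
            // only look at the most recent MAX_DOM_ACTIVE_PIDS entries.
            let mut ridx = self.read_idx;
            if self.write_idx - ridx > MAX_DOM_ACTIVE_PIDS {
                ridx = self.write_idx - MAX_DOM_ACTIVE_PIDS;
            }
            let out = (ridx..self.write_idx)
                .map(|i| self.pids[(i % MAX_DOM_ACTIVE_PIDS) as usize])
                .collect();
            self.read_idx = self.write_idx;
            out
        }
    }

    fn main() {
        let mut ap = ActivePids {
            read_idx: 0,
            write_idx: 0,
            pids: [0; MAX_DOM_ACTIVE_PIDS as usize],
        };
        for pid in 0..1500 {
            ap.record(pid);
        }
        let recent = ap.drain();
        assert_eq!(recent.len(), 1024);
        assert_eq!(recent[0], 476); // oldest surviving entry: 1500 - 1024
        println!("drained {} pids, oldest={}", recent.len(), recent[0]);
    }

Dropping older entries is acceptable in this scheme because, as the header comment above notes, load-balancing rounds are best-effort and simply retried until loads converge.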
-+#[path = "bpf/.output/rusty.skel.rs"] -+mod rusty; -+pub use rusty::*; -+pub mod rusty_sys; -+ -+use std::cell::Cell; -+use std::collections::BTreeMap; -+use std::collections::BTreeSet; -+use std::ffi::CStr; -+use std::ops::Bound::Included; -+use std::ops::Bound::Unbounded; -+use std::sync::atomic::AtomicBool; -+use std::sync::atomic::Ordering; -+use std::sync::Arc; -+use std::time::Duration; -+use std::time::Instant; -+ -+use ::fb_procfs as procfs; -+use anyhow::anyhow; -+use anyhow::bail; -+use anyhow::Context; -+use anyhow::Result; -+use bitvec::prelude::*; -+use clap::Parser; -+use libbpf_rs::skel::OpenSkel as _; -+use libbpf_rs::skel::Skel as _; -+use libbpf_rs::skel::SkelBuilder as _; -+use log::debug; -+use log::info; -+use log::trace; -+use log::warn; -+use ordered_float::OrderedFloat; -+ -+const RAVG_FRAC_BITS: u32 = rusty_sys::ravg_consts_RAVG_FRAC_BITS; -+const MAX_DOMS: usize = rusty_sys::consts_MAX_DOMS as usize; -+const MAX_CPUS: usize = rusty_sys::consts_MAX_CPUS as usize; -+ -+include!("../../ravg_read.rs.h"); -+ -+/// scx_rusty: A multi-domain BPF / userspace hybrid scheduler -+/// -+/// The BPF part does simple vtime or round robin scheduling in each domain -+/// while tracking average load of each domain and duty cycle of each task. -+/// -+/// The userspace part performs two roles. First, it makes higher frequency -+/// (100ms) tuning decisions. It identifies CPUs which are not too heavily -+/// loaded and mark them so that they can pull tasks from other overloaded -+/// domains on the fly. -+/// -+/// Second, it drives lower frequency (2s) load balancing. It determines -+/// whether load balancing is necessary by comparing domain load averages. -+/// If there are large enough load differences, it examines upto 1024 -+/// recently active tasks on the domain to determine which should be -+/// migrated. -+/// -+/// The overhead of userspace operations is low. Load balancing is not -+/// performed frequently but work-conservation is still maintained through -+/// tuning and greedy execution. Load balancing itself is not that expensive -+/// either. It only accesses per-domain load metrics to determine the -+/// domains that need load balancing and limited number of per-task metrics -+/// for each pushing domain. -+/// -+/// An earlier variant of this scheduler was used to balance across six -+/// domains, each representing a chiplet in a six-chiplet AMD processor, and -+/// could match the performance of production setup using CFS. -+/// -+/// WARNING: Very high weight (low nice value) tasks can throw off load -+/// balancing due to infeasible weight problem. This problem will be solved -+/// in the near future. -+/// -+/// WARNING: scx_rusty currently assumes that all domains have equal -+/// processing power and at similar distances from each other. This -+/// limitation will be removed in the future. -+#[derive(Debug, Parser)] -+struct Opts { -+ /// Scheduling slice duration in microseconds. -+ #[clap(short = 's', long, default_value = "20000")] -+ slice_us: u64, -+ -+ /// Monitoring and load balance interval in seconds. -+ #[clap(short = 'i', long, default_value = "2.0")] -+ interval: f64, -+ -+ /// Tuner runs at higher frequency than the load balancer to dynamically -+ /// tune scheduling behavior. Tuning interval in seconds. -+ #[clap(short = 'I', long, default_value = "0.1")] -+ tune_interval: f64, -+ -+ /// The half-life of task and domain load running averages in seconds. 
-+ #[clap(short = 'l', long, default_value = "1.0")] -+ load_half_life: f64, -+ -+ /// Build domains according to how CPUs are grouped at this cache level -+ /// as determined by /sys/devices/system/cpu/cpuX/cache/indexI/id. -+ #[clap(short = 'c', long, default_value = "3")] -+ cache_level: u32, -+ -+ /// Instead of using cache locality, set the cpumask for each domain -+ /// manually, provide multiple --cpumasks, one for each domain. E.g. -+ /// --cpumasks 0xff_00ff --cpumasks 0xff00 will create two domains with -+ /// the corresponding CPUs belonging to each domain. Each CPU must -+ /// belong to precisely one domain. -+ #[clap(short = 'C', long, num_args = 1.., conflicts_with = "cache_level")] -+ cpumasks: Vec, -+ -+ /// When non-zero, enable greedy task stealing. When a domain is idle, a -+ /// cpu will attempt to steal tasks from a domain with at least -+ /// greedy_threshold tasks enqueued. These tasks aren't permanently -+ /// stolen from the domain. -+ #[clap(short = 'g', long, default_value = "1")] -+ greedy_threshold: u32, -+ -+ /// Disable load balancing. Unless disabled, periodically userspace will -+ /// calculate the load factor of each domain and instruct BPF which -+ /// processes to move. -+ #[clap(long, action = clap::ArgAction::SetTrue)] -+ no_load_balance: bool, -+ -+ /// Put per-cpu kthreads directly into local dsq's. -+ #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)] -+ kthreads_local: bool, -+ -+ /// In recent kernels (>=v6.6), the kernel is responsible for balancing -+ /// kworkers across L3 cache domains. Exclude them from load-balancing -+ /// to avoid conflicting operations. Greedy executions still apply. -+ #[clap(short = 'b', long, action = clap::ArgAction::SetTrue)] -+ balanced_kworkers: bool, -+ -+ /// Use FIFO scheduling instead of weighted vtime scheduling. -+ #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)] -+ fifo_sched: bool, -+ -+ /// Idle CPUs with utilization lower than this will get remote tasks -+ /// directly pushed on them. 0 disables, 100 enables always. -+ #[clap(short = 'D', long, default_value = "90.0")] -+ direct_greedy_under: f64, -+ -+ /// Idle CPUs with utilization lower than this may get kicked to -+ /// accelerate stealing when a task is queued on a saturated remote -+ /// domain. 0 disables, 100 enables always. -+ #[clap(short = 'K', long, default_value = "100.0")] -+ kick_greedy_under: f64, -+ -+ /// If specified, only tasks which have their scheduling policy set to -+ /// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all -+ /// tasks are switched. -+ #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)] -+ partial: bool, -+ -+ /// Enable verbose output including libbpf details. Specify multiple -+ /// times to increase verbosity. 
-+ #[clap(short = 'v', long, action = clap::ArgAction::Count)] -+ verbose: u8, -+} -+ -+fn now_monotonic() -> u64 { -+ let mut time = libc::timespec { -+ tv_sec: 0, -+ tv_nsec: 0, -+ }; -+ let ret = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut time) }; -+ assert!(ret == 0); -+ time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 -+} -+ -+fn clear_map(map: &libbpf_rs::Map) { -+ for key in map.keys() { -+ let _ = map.delete(&key); -+ } -+} -+ -+fn format_cpumask(cpumask: &[u64], nr_cpus: usize) -> String { -+ cpumask -+ .iter() -+ .take((nr_cpus + 64) / 64) -+ .rev() -+ .fold(String::new(), |acc, x| format!("{} {:016X}", acc, x)) -+} -+ -+fn read_total_cpu(reader: &procfs::ProcReader) -> Result { -+ reader -+ .read_stat() -+ .context("Failed to read procfs")? -+ .total_cpu -+ .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) -+} -+ -+fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { -+ match (curr, prev) { -+ ( -+ procfs::CpuStat { -+ user_usec: Some(prev_user), -+ nice_usec: Some(prev_nice), -+ system_usec: Some(prev_system), -+ idle_usec: Some(prev_idle), -+ iowait_usec: Some(prev_iowait), -+ irq_usec: Some(prev_irq), -+ softirq_usec: Some(prev_softirq), -+ stolen_usec: Some(prev_stolen), -+ .. -+ }, -+ procfs::CpuStat { -+ user_usec: Some(curr_user), -+ nice_usec: Some(curr_nice), -+ system_usec: Some(curr_system), -+ idle_usec: Some(curr_idle), -+ iowait_usec: Some(curr_iowait), -+ irq_usec: Some(curr_irq), -+ softirq_usec: Some(curr_softirq), -+ stolen_usec: Some(curr_stolen), -+ .. -+ }, -+ ) => { -+ let idle_usec = curr_idle - prev_idle; -+ let iowait_usec = curr_iowait - prev_iowait; -+ let user_usec = curr_user - prev_user; -+ let system_usec = curr_system - prev_system; -+ let nice_usec = curr_nice - prev_nice; -+ let irq_usec = curr_irq - prev_irq; -+ let softirq_usec = curr_softirq - prev_softirq; -+ let stolen_usec = curr_stolen - prev_stolen; -+ -+ let busy_usec = -+ user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; -+ let total_usec = idle_usec + busy_usec + iowait_usec; -+ if total_usec > 0 { -+ Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)) -+ } else { -+ Ok(1.0) -+ } -+ } -+ _ => { -+ bail!("Missing stats in cpustat"); -+ } -+ } -+} -+ -+#[derive(Debug)] -+struct Topology { -+ nr_cpus: usize, -+ nr_doms: usize, -+ dom_cpus: Vec>, -+ cpu_dom: Vec>, -+} -+ -+impl Topology { -+ fn from_cpumasks(cpumasks: &[String], nr_cpus: usize) -> Result { -+ if cpumasks.len() > MAX_DOMS { -+ bail!( -+ "Number of requested domains ({}) is greater than MAX_DOMS ({})", -+ cpumasks.len(), -+ MAX_DOMS -+ ); -+ } -+ let mut cpu_dom = vec![None; nr_cpus]; -+ let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; MAX_CPUS]; cpumasks.len()]; -+ for (dom, cpumask) in cpumasks.iter().enumerate() { -+ let hex_str = { -+ let mut tmp_str = cpumask -+ .strip_prefix("0x") -+ .unwrap_or(cpumask) -+ .replace('_', ""); -+ if tmp_str.len() % 2 != 0 { -+ tmp_str = "0".to_string() + &tmp_str; -+ } -+ tmp_str -+ }; -+ let byte_vec = hex::decode(&hex_str) -+ .with_context(|| format!("Failed to parse cpumask: {}", cpumask))?; -+ -+ for (index, &val) in byte_vec.iter().rev().enumerate() { -+ let mut v = val; -+ while v != 0 { -+ let lsb = v.trailing_zeros() as usize; -+ v &= !(1 << lsb); -+ let cpu = index * 8 + lsb; -+ if cpu > nr_cpus { -+ bail!( -+ concat!( -+ "Found cpu ({}) in cpumask ({}) which is larger", -+ " than the number of cpus on the machine ({})" -+ ), -+ cpu, -+ cpumask, -+ nr_cpus -+ ); -+ } -+ if let 
Some(other_dom) = cpu_dom[cpu] { -+ bail!( -+ "Found cpu ({}) with domain ({}) but also in cpumask ({})", -+ cpu, -+ other_dom, -+ cpumask -+ ); -+ } -+ cpu_dom[cpu] = Some(dom); -+ dom_cpus[dom].set(cpu, true); -+ } -+ } -+ dom_cpus[dom].set_uninitialized(false); -+ } -+ -+ for (cpu, dom) in cpu_dom.iter().enumerate() { -+ if dom.is_none() { -+ bail!( -+ "CPU {} not assigned to any domain. Make sure it is covered by some --cpumasks argument.", -+ cpu -+ ); -+ } -+ } -+ -+ Ok(Self { -+ nr_cpus, -+ nr_doms: dom_cpus.len(), -+ dom_cpus, -+ cpu_dom, -+ }) -+ } -+ -+ fn from_cache_level(level: u32, nr_cpus: usize) -> Result { -+ let mut cpu_to_cache = vec![]; // (cpu_id, Option) -+ let mut cache_ids = BTreeSet::::new(); -+ let mut nr_offline = 0; -+ -+ // Build cpu -> cache ID mapping. -+ for cpu in 0..nr_cpus { -+ let path = format!("/sys/devices/system/cpu/cpu{}/cache/index{}/id", cpu, level); -+ let id = match std::fs::read_to_string(&path) { -+ Ok(val) => Some(val.trim().parse::().with_context(|| { -+ format!("Failed to parse {:?}'s content {:?}", &path, &val) -+ })?), -+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => { -+ nr_offline += 1; -+ None -+ } -+ Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), -+ }; -+ -+ cpu_to_cache.push(id); -+ if let Some(id) = id { -+ cache_ids.insert(id); -+ } -+ } -+ -+ info!( -+ "CPUs: online/possible = {}/{}", -+ nr_cpus - nr_offline, -+ nr_cpus -+ ); -+ -+ // Cache IDs may have holes. Assign consecutive domain IDs to -+ // existing cache IDs. -+ let mut cache_to_dom = BTreeMap::::new(); -+ let mut nr_doms = 0; -+ for cache_id in cache_ids.iter() { -+ cache_to_dom.insert(*cache_id, nr_doms); -+ nr_doms += 1; -+ } -+ -+ if nr_doms > MAX_DOMS { -+ bail!( -+ "Total number of doms {} is greater than MAX_DOMS ({})", -+ nr_doms, -+ MAX_DOMS -+ ); -+ } -+ -+ // Build and return dom -> cpumask and cpu -> dom mappings. -+ let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; MAX_CPUS]; nr_doms]; -+ let mut cpu_dom = vec![]; -+ -+ for (cpu, cache) in cpu_to_cache.iter().enumerate().take(nr_cpus) { -+ match cache { -+ Some(cache_id) => { -+ let dom_id = cache_to_dom[cache_id]; -+ dom_cpus[dom_id].set(cpu, true); -+ cpu_dom.push(Some(dom_id)); -+ } -+ None => { -+ dom_cpus[0].set(cpu, true); -+ cpu_dom.push(None); -+ } -+ } -+ } -+ -+ Ok(Self { -+ nr_cpus, -+ nr_doms: dom_cpus.len(), -+ dom_cpus, -+ cpu_dom, -+ }) -+ } -+} -+ -+struct Tuner { -+ top: Arc, -+ direct_greedy_under: f64, -+ kick_greedy_under: f64, -+ proc_reader: procfs::ProcReader, -+ prev_cpu_stats: BTreeMap, -+ dom_utils: Vec, -+} -+ -+impl Tuner { -+ fn new(top: Arc, opts: &Opts) -> Result { -+ let proc_reader = procfs::ProcReader::new(); -+ let prev_cpu_stats = proc_reader -+ .read_stat()? -+ .cpus_map -+ .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; -+ Ok(Self { -+ direct_greedy_under: opts.direct_greedy_under / 100.0, -+ kick_greedy_under: opts.kick_greedy_under / 100.0, -+ proc_reader, -+ prev_cpu_stats, -+ dom_utils: vec![0.0; top.nr_doms], -+ top, -+ }) -+ } -+ -+ fn step(&mut self, skel: &mut RustySkel) -> Result<()> { -+ let curr_cpu_stats = self -+ .proc_reader -+ .read_stat()? 
-+ .cpus_map -+ .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; -+ let ti = &mut skel.bss().tune_input; -+ let mut dom_nr_cpus = vec![0; self.top.nr_doms]; -+ let mut dom_util_sum = vec![0.0; self.top.nr_doms]; -+ -+ for cpu in 0..self.top.nr_cpus { -+ let cpu32 = cpu as u32; -+ // None domain indicates the CPU was offline during -+ // initialization and None CpuStat indicates the CPU has gone -+ // down since then. Ignore both. -+ if let (Some(dom), Some(curr), Some(prev)) = ( -+ self.top.cpu_dom[cpu], -+ curr_cpu_stats.get(&cpu32), -+ self.prev_cpu_stats.get(&cpu32), -+ ) { -+ dom_nr_cpus[dom] += 1; -+ dom_util_sum[dom] += calc_util(curr, prev)?; -+ } -+ } -+ -+ for dom in 0..self.top.nr_doms { -+ // Calculate the domain avg util. If there are no active CPUs, -+ // it doesn't really matter. Go with 0.0 as that's less likely -+ // to confuse users. -+ let util = match dom_nr_cpus[dom] { -+ 0 => 0.0, -+ nr => dom_util_sum[dom] / nr as f64, -+ }; -+ -+ self.dom_utils[dom] = util; -+ -+ // This could be implemented better. -+ let update_dom_bits = |target: &mut [u64; 8], val: bool| { -+ for cpu in 0..self.top.nr_cpus { -+ if let Some(cdom) = self.top.cpu_dom[cpu] { -+ if cdom == dom { -+ if val { -+ target[cpu / 64] |= 1u64 << (cpu % 64); -+ } else { -+ target[cpu / 64] &= !(1u64 << (cpu % 64)); -+ } -+ } -+ } -+ } -+ }; -+ -+ update_dom_bits( -+ &mut ti.direct_greedy_cpumask, -+ self.direct_greedy_under > 0.99999 || util < self.direct_greedy_under, -+ ); -+ update_dom_bits( -+ &mut ti.kick_greedy_cpumask, -+ self.kick_greedy_under > 0.99999 || util < self.kick_greedy_under, -+ ); -+ } -+ -+ ti.gen += 1; -+ self.prev_cpu_stats = curr_cpu_stats; -+ Ok(()) -+ } -+} -+ -+#[derive(Debug)] -+struct TaskInfo { -+ pid: i32, -+ dom_mask: u64, -+ migrated: Cell, -+ is_kworker: bool, -+} -+ -+struct LoadBalancer<'a, 'b, 'c> { -+ skel: &'a mut RustySkel<'b>, -+ top: Arc, -+ skip_kworkers: bool, -+ -+ tasks_by_load: Vec, TaskInfo>>>, -+ load_avg: f64, -+ dom_loads: Vec, -+ -+ imbal: Vec, -+ doms_to_push: BTreeMap, u32>, -+ doms_to_pull: BTreeMap, u32>, -+ -+ nr_lb_data_errors: &'c mut u64, -+} -+ -+impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { -+ // If imbalance gets higher than this ratio, try to balance the loads. -+ const LOAD_IMBAL_HIGH_RATIO: f64 = 0.10; -+ -+ // Aim to transfer this fraction of the imbalance on each round. We want -+ // to be gradual to avoid unnecessary oscillations. While this can delay -+ // convergence, greedy execution should be able to bridge the temporary -+ // gap. -+ const LOAD_IMBAL_XFER_TARGET_RATIO: f64 = 0.50; -+ -+ // Don't push out more than this ratio of load on each round. While this -+ // overlaps with XFER_TARGET_RATIO, XFER_TARGET_RATIO only defines the -+ // target and doesn't limit the total load. As long as the transfer -+ // reduces load imbalance between the two involved domains, it'd happily -+ // transfer whatever amount that can be transferred. This limit is used -+ // as the safety cap to avoid draining a given domain too much in a -+ // single round. 
-+ const LOAD_IMBAL_PUSH_MAX_RATIO: f64 = 0.50; -+ -+ fn new( -+ skel: &'a mut RustySkel<'b>, -+ top: Arc, -+ skip_kworkers: bool, -+ nr_lb_data_errors: &'c mut u64, -+ ) -> Self { -+ Self { -+ skel, -+ skip_kworkers, -+ -+ tasks_by_load: (0..top.nr_doms).map(|_| None).collect(), -+ load_avg: 0f64, -+ dom_loads: vec![0.0; top.nr_doms], -+ -+ imbal: vec![0.0; top.nr_doms], -+ doms_to_pull: BTreeMap::new(), -+ doms_to_push: BTreeMap::new(), -+ -+ nr_lb_data_errors, -+ -+ top, -+ } -+ } -+ -+ fn read_dom_loads(&mut self) -> Result<()> { -+ let now_mono = now_monotonic(); -+ let load_half_life = self.skel.rodata().load_half_life; -+ let maps = self.skel.maps(); -+ let dom_data = maps.dom_data(); -+ let mut load_sum = 0.0f64; -+ -+ for i in 0..self.top.nr_doms { -+ let key = unsafe { std::mem::transmute::(i as u32) }; -+ -+ if let Some(dom_ctx_map_elem) = dom_data -+ .lookup(&key, libbpf_rs::MapFlags::ANY) -+ .context("Failed to lookup dom_ctx")? -+ { -+ let dom_ctx = unsafe { -+ &*(dom_ctx_map_elem.as_slice().as_ptr() as *const rusty_sys::dom_ctx) -+ }; -+ -+ let rd = &dom_ctx.load_rd; -+ self.dom_loads[i] = ravg_read( -+ rd.val, -+ rd.val_at, -+ rd.old, -+ rd.cur, -+ now_mono, -+ load_half_life, -+ RAVG_FRAC_BITS, -+ ); -+ -+ load_sum += self.dom_loads[i]; -+ } -+ } -+ -+ self.load_avg = load_sum / self.top.nr_doms as f64; -+ -+ Ok(()) -+ } -+ -+ /// To balance dom loads, identify doms with lower and higher load than -+ /// average. -+ fn calculate_dom_load_balance(&mut self) -> Result<()> { -+ for (dom, dom_load) in self.dom_loads.iter().enumerate() { -+ let imbal = dom_load - self.load_avg; -+ if imbal.abs() >= self.load_avg * Self::LOAD_IMBAL_HIGH_RATIO { -+ if imbal > 0f64 { -+ self.doms_to_push.insert(OrderedFloat(imbal), dom as u32); -+ } else { -+ self.doms_to_pull.insert(OrderedFloat(-imbal), dom as u32); -+ } -+ self.imbal[dom] = imbal; -+ } -+ } -+ Ok(()) -+ } -+ -+ /// @dom needs to push out tasks to balance loads. Make sure its -+ /// tasks_by_load is populated so that the victim tasks can be picked. -+ fn populate_tasks_by_load(&mut self, dom: u32) -> Result<()> { -+ if self.tasks_by_load[dom as usize].is_some() { -+ return Ok(()); -+ } -+ -+ // Read active_pids and update write_idx and gen. -+ // -+ // XXX - We can't read task_ctx inline because self.skel.bss() -+ // borrows mutably and thus conflicts with self.skel.maps(). -+ const MAX_PIDS: u64 = rusty_sys::consts_MAX_DOM_ACTIVE_PIDS as u64; -+ let active_pids = &mut self.skel.bss().dom_active_pids[dom as usize]; -+ let mut pids = vec![]; -+ -+ let (mut ridx, widx) = (active_pids.read_idx, active_pids.write_idx); -+ if widx - ridx > MAX_PIDS { -+ ridx = widx - MAX_PIDS; -+ } -+ -+ for idx in ridx..widx { -+ let pid = active_pids.pids[(idx % MAX_PIDS) as usize]; -+ pids.push(pid); -+ } -+ -+ active_pids.read_idx = active_pids.write_idx; -+ active_pids.gen += 1; -+ -+ // Read task_ctx and load. -+ let load_half_life = self.skel.rodata().load_half_life; -+ let maps = self.skel.maps(); -+ let task_data = maps.task_data(); -+ let now_mono = now_monotonic(); -+ let mut tasks_by_load = BTreeMap::new(); -+ -+ for pid in pids.iter() { -+ let key = unsafe { std::mem::transmute::(*pid) }; -+ -+ if let Some(task_data_elem) = task_data.lookup(&key, libbpf_rs::MapFlags::ANY)? 
{ -+ let task_ctx = -+ unsafe { &*(task_data_elem.as_slice().as_ptr() as *const rusty_sys::task_ctx) }; -+ -+ if task_ctx.dom_id != dom { -+ continue; -+ } -+ -+ let rd = &task_ctx.dcyc_rd; -+ let load = task_ctx.weight as f64 -+ * ravg_read( -+ rd.val, -+ rd.val_at, -+ rd.old, -+ rd.cur, -+ now_mono, -+ load_half_life, -+ RAVG_FRAC_BITS, -+ ); -+ -+ tasks_by_load.insert( -+ OrderedFloat(load), -+ TaskInfo { -+ pid: *pid, -+ dom_mask: task_ctx.dom_mask, -+ migrated: Cell::new(false), -+ is_kworker: task_ctx.is_kworker, -+ }, -+ ); -+ } -+ } -+ -+ debug!( -+ "DOM[{:02}] read load for {} tasks", -+ dom, -+ &tasks_by_load.len(), -+ ); -+ trace!("DOM[{:02}] tasks_by_load={:?}", dom, &tasks_by_load); -+ -+ self.tasks_by_load[dom as usize] = Some(tasks_by_load); -+ Ok(()) -+ } -+ -+ // Find the first candidate pid which hasn't already been migrated and -+ // can run in @pull_dom. -+ fn find_first_candidate<'d, I>( -+ tasks_by_load: I, -+ pull_dom: u32, -+ skip_kworkers: bool, -+ ) -> Option<(f64, &'d TaskInfo)> -+ where -+ I: IntoIterator, &'d TaskInfo)>, -+ { -+ match tasks_by_load -+ .into_iter() -+ .skip_while(|(_, task)| { -+ task.migrated.get() -+ || (task.dom_mask & (1 << pull_dom) == 0) -+ || (skip_kworkers && task.is_kworker) -+ }) -+ .next() -+ { -+ Some((OrderedFloat(load), task)) => Some((*load, task)), -+ None => None, -+ } -+ } -+ -+ fn pick_victim( -+ &mut self, -+ (push_dom, to_push): (u32, f64), -+ (pull_dom, to_pull): (u32, f64), -+ ) -> Result> { -+ let to_xfer = to_pull.min(to_push) * Self::LOAD_IMBAL_XFER_TARGET_RATIO; -+ -+ debug!( -+ "considering dom {}@{:.2} -> {}@{:.2}", -+ push_dom, to_push, pull_dom, to_pull -+ ); -+ -+ let calc_new_imbal = |xfer: f64| (to_push - xfer).abs() + (to_pull - xfer).abs(); -+ -+ self.populate_tasks_by_load(push_dom)?; -+ -+ // We want to pick a task to transfer from push_dom to pull_dom to -+ // reduce the load imbalance between the two closest to $to_xfer. -+ // IOW, pick a task which has the closest load value to $to_xfer -+ // that can be migrated. Find such task by locating the first -+ // migratable task while scanning left from $to_xfer and the -+ // counterpart while scanning right and picking the better of the -+ // two. -+ let (load, task, new_imbal) = match ( -+ Self::find_first_candidate( -+ self.tasks_by_load[push_dom as usize] -+ .as_ref() -+ .unwrap() -+ .range((Unbounded, Included(&OrderedFloat(to_xfer)))) -+ .rev(), -+ pull_dom, -+ self.skip_kworkers, -+ ), -+ Self::find_first_candidate( -+ self.tasks_by_load[push_dom as usize] -+ .as_ref() -+ .unwrap() -+ .range((Included(&OrderedFloat(to_xfer)), Unbounded)), -+ pull_dom, -+ self.skip_kworkers, -+ ), -+ ) { -+ (None, None) => return Ok(None), -+ (Some((load, task)), None) | (None, Some((load, task))) => { -+ (load, task, calc_new_imbal(load)) -+ } -+ (Some((load0, task0)), Some((load1, task1))) => { -+ let (new_imbal0, new_imbal1) = (calc_new_imbal(load0), calc_new_imbal(load1)); -+ if new_imbal0 <= new_imbal1 { -+ (load0, task0, new_imbal0) -+ } else { -+ (load1, task1, new_imbal1) -+ } -+ } -+ }; -+ -+ // If the best candidate can't reduce the imbalance, there's nothing -+ // to do for this pair. 
-+ let old_imbal = to_push + to_pull; -+ if old_imbal < new_imbal { -+ debug!( -+ "skipping pid {}, dom {} -> {} won't improve imbal {:.2} -> {:.2}", -+ task.pid, push_dom, pull_dom, old_imbal, new_imbal -+ ); -+ return Ok(None); -+ } -+ -+ debug!( -+ "migrating pid {}, dom {} -> {}, imbal={:.2} -> {:.2}", -+ task.pid, push_dom, pull_dom, old_imbal, new_imbal, -+ ); -+ -+ Ok(Some((task, load))) -+ } -+ -+ // Actually execute the load balancing. Concretely this writes pid -> dom -+ // entries into the lb_data map for bpf side to consume. -+ fn load_balance(&mut self) -> Result<()> { -+ clear_map(self.skel.maps().lb_data()); -+ -+ debug!("imbal={:?}", &self.imbal); -+ debug!("doms_to_push={:?}", &self.doms_to_push); -+ debug!("doms_to_pull={:?}", &self.doms_to_pull); -+ -+ // Push from the most imbalanced to least. -+ while let Some((OrderedFloat(mut to_push), push_dom)) = self.doms_to_push.pop_last() { -+ let push_max = self.dom_loads[push_dom as usize] * Self::LOAD_IMBAL_PUSH_MAX_RATIO; -+ let mut pushed = 0f64; -+ -+ // Transfer tasks from push_dom to reduce imbalance. -+ loop { -+ let last_pushed = pushed; -+ -+ // Pull from the most imbalaned to least. -+ let mut doms_to_pull = BTreeMap::<_, _>::new(); -+ std::mem::swap(&mut self.doms_to_pull, &mut doms_to_pull); -+ let mut pull_doms = doms_to_pull.into_iter().rev().collect::>(); -+ -+ for (to_pull, pull_dom) in pull_doms.iter_mut() { -+ if let Some((task, load)) = -+ self.pick_victim((push_dom, to_push), (*pull_dom, f64::from(*to_pull)))? -+ { -+ // Execute migration. -+ task.migrated.set(true); -+ to_push -= load; -+ *to_pull -= load; -+ pushed += load; -+ -+ // Ask BPF code to execute the migration. -+ let pid = task.pid; -+ let cpid = (pid as libc::pid_t).to_ne_bytes(); -+ if let Err(e) = self.skel.maps_mut().lb_data().update( -+ &cpid, -+ &pull_dom.to_ne_bytes(), -+ libbpf_rs::MapFlags::NO_EXIST, -+ ) { -+ warn!( -+ "Failed to update lb_data map for pid={} error={:?}", -+ pid, &e -+ ); -+ *self.nr_lb_data_errors += 1; -+ } -+ -+ // Always break after a successful migration so that -+ // the pulling domains are always considered in the -+ // descending imbalance order. -+ break; -+ } -+ } -+ -+ pull_doms -+ .into_iter() -+ .map(|(k, v)| self.doms_to_pull.insert(k, v)) -+ .count(); -+ -+ // Stop repeating if nothing got transferred or pushed enough. -+ if pushed == last_pushed || pushed >= push_max { -+ break; -+ } -+ } -+ } -+ Ok(()) -+ } -+} -+ -+struct Scheduler<'a> { -+ skel: RustySkel<'a>, -+ struct_ops: Option, -+ -+ sched_interval: Duration, -+ tune_interval: Duration, -+ balance_load: bool, -+ balanced_kworkers: bool, -+ -+ top: Arc, -+ proc_reader: procfs::ProcReader, -+ -+ prev_at: Instant, -+ prev_total_cpu: procfs::CpuStat, -+ -+ nr_lb_data_errors: u64, -+ -+ tuner: Tuner, -+} -+ -+impl<'a> Scheduler<'a> { -+ fn init(opts: &Opts) -> Result { -+ // Open the BPF prog first for verification. -+ let mut skel_builder = RustySkelBuilder::default(); -+ skel_builder.obj_builder.debug(opts.verbose > 0); -+ let mut skel = skel_builder.open().context("Failed to open BPF program")?; -+ -+ let nr_cpus = libbpf_rs::num_possible_cpus().unwrap(); -+ if nr_cpus > MAX_CPUS { -+ bail!( -+ "nr_cpus ({}) is greater than MAX_CPUS ({})", -+ nr_cpus, -+ MAX_CPUS -+ ); -+ } -+ -+ // Initialize skel according to @opts. -+ let top = Arc::new(if !opts.cpumasks.is_empty() { -+ Topology::from_cpumasks(&opts.cpumasks, nr_cpus)? -+ } else { -+ Topology::from_cache_level(opts.cache_level, nr_cpus)? 
-+ }); -+ -+ skel.rodata().nr_doms = top.nr_doms as u32; -+ skel.rodata().nr_cpus = top.nr_cpus as u32; -+ -+ for (cpu, dom) in top.cpu_dom.iter().enumerate() { -+ skel.rodata().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; -+ } -+ -+ for (dom, cpus) in top.dom_cpus.iter().enumerate() { -+ let raw_cpus_slice = cpus.as_raw_slice(); -+ let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom]; -+ let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpus_slice.len()); -+ left.clone_from_slice(cpus.as_raw_slice()); -+ info!( -+ "DOM[{:02}] cpumask{} ({} cpus)", -+ dom, -+ &format_cpumask(dom_cpumask_slice, nr_cpus), -+ cpus.count_ones() -+ ); -+ } -+ -+ skel.rodata().slice_ns = opts.slice_us * 1000; -+ skel.rodata().load_half_life = (opts.load_half_life * 1000000000.0) as u32; -+ skel.rodata().kthreads_local = opts.kthreads_local; -+ skel.rodata().fifo_sched = opts.fifo_sched; -+ skel.rodata().switch_partial = opts.partial; -+ skel.rodata().greedy_threshold = opts.greedy_threshold; -+ skel.rodata().debug = opts.verbose as u32; -+ -+ // Attach. -+ let mut skel = skel.load().context("Failed to load BPF program")?; -+ skel.attach().context("Failed to attach BPF program")?; -+ let struct_ops = Some( -+ skel.maps_mut() -+ .rusty() -+ .attach_struct_ops() -+ .context("Failed to attach rusty struct ops")?, -+ ); -+ info!("Rusty Scheduler Attached"); -+ -+ // Other stuff. -+ let proc_reader = procfs::ProcReader::new(); -+ let prev_total_cpu = read_total_cpu(&proc_reader)?; -+ -+ Ok(Self { -+ skel, -+ struct_ops, // should be held to keep it attached -+ -+ sched_interval: Duration::from_secs_f64(opts.interval), -+ tune_interval: Duration::from_secs_f64(opts.tune_interval), -+ balance_load: !opts.no_load_balance, -+ balanced_kworkers: opts.balanced_kworkers, -+ -+ top: top.clone(), -+ proc_reader, -+ -+ prev_at: Instant::now(), -+ prev_total_cpu, -+ -+ nr_lb_data_errors: 0, -+ -+ tuner: Tuner::new(top, opts)?, -+ }) -+ } -+ -+ fn get_cpu_busy(&mut self) -> Result { -+ let total_cpu = read_total_cpu(&self.proc_reader)?; -+ let busy = match (&self.prev_total_cpu, &total_cpu) { -+ ( -+ procfs::CpuStat { -+ user_usec: Some(prev_user), -+ nice_usec: Some(prev_nice), -+ system_usec: Some(prev_system), -+ idle_usec: Some(prev_idle), -+ iowait_usec: Some(prev_iowait), -+ irq_usec: Some(prev_irq), -+ softirq_usec: Some(prev_softirq), -+ stolen_usec: Some(prev_stolen), -+ guest_usec: _, -+ guest_nice_usec: _, -+ }, -+ procfs::CpuStat { -+ user_usec: Some(curr_user), -+ nice_usec: Some(curr_nice), -+ system_usec: Some(curr_system), -+ idle_usec: Some(curr_idle), -+ iowait_usec: Some(curr_iowait), -+ irq_usec: Some(curr_irq), -+ softirq_usec: Some(curr_softirq), -+ stolen_usec: Some(curr_stolen), -+ guest_usec: _, -+ guest_nice_usec: _, -+ }, -+ ) => { -+ let idle_usec = curr_idle - prev_idle; -+ let iowait_usec = curr_iowait - prev_iowait; -+ let user_usec = curr_user - prev_user; -+ let system_usec = curr_system - prev_system; -+ let nice_usec = curr_nice - prev_nice; -+ let irq_usec = curr_irq - prev_irq; -+ let softirq_usec = curr_softirq - prev_softirq; -+ let stolen_usec = curr_stolen - prev_stolen; -+ -+ let busy_usec = -+ user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; -+ let total_usec = idle_usec + busy_usec + iowait_usec; -+ busy_usec as f64 / total_usec as f64 -+ } -+ _ => { -+ bail!("Some procfs stats are not populated!"); -+ } -+ }; -+ -+ self.prev_total_cpu = total_cpu; -+ Ok(busy) -+ } -+ -+ fn read_bpf_stats(&mut self) -> Result> { -+ let mut maps = 
self.skel.maps_mut(); -+ let stats_map = maps.stats(); -+ let mut stats: Vec = Vec::new(); -+ let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; self.top.nr_cpus]; -+ -+ for stat in 0..rusty_sys::stat_idx_RUSTY_NR_STATS { -+ let cpu_stat_vec = stats_map -+ .lookup_percpu(&stat.to_ne_bytes(), libbpf_rs::MapFlags::ANY) -+ .with_context(|| format!("Failed to lookup stat {}", stat))? -+ .expect("per-cpu stat should exist"); -+ let sum = cpu_stat_vec -+ .iter() -+ .map(|val| { -+ u64::from_ne_bytes( -+ val.as_slice() -+ .try_into() -+ .expect("Invalid value length in stat map"), -+ ) -+ }) -+ .sum(); -+ stats_map -+ .update_percpu(&stat.to_ne_bytes(), &zero_vec, libbpf_rs::MapFlags::ANY) -+ .context("Failed to zero stat")?; -+ stats.push(sum); -+ } -+ Ok(stats) -+ } -+ -+ fn report( -+ &mut self, -+ stats: &[u64], -+ cpu_busy: f64, -+ processing_dur: Duration, -+ load_avg: f64, -+ dom_loads: &[f64], -+ imbal: &[f64], -+ ) { -+ let stat = |idx| stats[idx as usize]; -+ let total = stat(rusty_sys::stat_idx_RUSTY_STAT_WAKE_SYNC) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_PREV_IDLE) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_GREEDY_IDLE) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_PINNED) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_DISPATCH) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_DSQ_DISPATCH) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_GREEDY); -+ -+ info!( -+ "cpu={:7.2} bal={} load_avg={:8.2} task_err={} lb_data_err={} proc={:?}ms", -+ cpu_busy * 100.0, -+ stats[rusty_sys::stat_idx_RUSTY_STAT_LOAD_BALANCE as usize], -+ load_avg, -+ stats[rusty_sys::stat_idx_RUSTY_STAT_TASK_GET_ERR as usize], -+ self.nr_lb_data_errors, -+ processing_dur.as_millis(), -+ ); -+ -+ let stat_pct = |idx| stat(idx) as f64 / total as f64 * 100.0; -+ -+ info!( -+ "tot={:7} wsync={:5.2} prev_idle={:5.2} greedy_idle={:5.2} pin={:5.2}", -+ total, -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_WAKE_SYNC), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_PREV_IDLE), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_GREEDY_IDLE), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_PINNED), -+ ); -+ -+ info!( -+ "dir={:5.2} dir_greedy={:5.2} dir_greedy_far={:5.2}", -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_DISPATCH), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR), -+ ); -+ -+ info!( -+ "dsq={:5.2} greedy={:5.2} kick_greedy={:5.2} rep={:5.2}", -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DSQ_DISPATCH), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_GREEDY), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_KICK_GREEDY), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_REPATRIATE), -+ ); -+ -+ let ti = &self.skel.bss().tune_input; -+ info!( -+ "direct_greedy_cpumask={}", -+ format_cpumask(&ti.direct_greedy_cpumask, self.top.nr_cpus) -+ ); -+ info!( -+ " kick_greedy_cpumask={}", -+ format_cpumask(&ti.kick_greedy_cpumask, self.top.nr_cpus) -+ ); -+ -+ for i in 0..self.top.nr_doms { -+ info!( -+ "DOM[{:02}] util={:6.2} load={:8.2} imbal={}", -+ i, -+ self.tuner.dom_utils[i] * 100.0, -+ dom_loads[i], -+ if imbal[i] == 0.0 { -+ format!("{:9.2}", 0.0) -+ } else { -+ format!("{:+9.2}", imbal[i]) -+ }, -+ ); -+ } -+ } -+ -+ fn lb_step(&mut self) -> Result<()> { -+ let started_at = Instant::now(); -+ let bpf_stats = self.read_bpf_stats()?; -+ let cpu_busy = self.get_cpu_busy()?; -+ -+ let mut lb = LoadBalancer::new( -+ &mut self.skel, -+ self.top.clone(), -+ self.balanced_kworkers, 
-+ &mut self.nr_lb_data_errors, -+ ); -+ -+ lb.read_dom_loads()?; -+ lb.calculate_dom_load_balance()?; -+ -+ if self.balance_load { -+ lb.load_balance()?; -+ } -+ -+ // Extract fields needed for reporting and drop lb to release -+ // mutable borrows. -+ let (load_avg, dom_loads, imbal) = (lb.load_avg, lb.dom_loads, lb.imbal); -+ -+ self.report( -+ &bpf_stats, -+ cpu_busy, -+ Instant::now().duration_since(started_at), -+ load_avg, -+ &dom_loads, -+ &imbal, -+ ); -+ -+ self.prev_at = started_at; -+ Ok(()) -+ } -+ -+ fn read_bpf_exit_kind(&mut self) -> i32 { -+ unsafe { std::ptr::read_volatile(&self.skel.bss().exit_kind as *const _) } -+ } -+ -+ fn report_bpf_exit_kind(&mut self) -> Result<()> { -+ // Report msg if EXT_OPS_EXIT_ERROR. -+ match self.read_bpf_exit_kind() { -+ 0 => Ok(()), -+ etype if etype == 2 => { -+ let cstr = unsafe { CStr::from_ptr(self.skel.bss().exit_msg.as_ptr() as *const _) }; -+ let msg = cstr -+ .to_str() -+ .context("Failed to convert exit msg to string") -+ .unwrap(); -+ bail!("BPF exit_kind={} msg={}", etype, msg); -+ } -+ etype => { -+ info!("BPF exit_kind={}", etype); -+ Ok(()) -+ } -+ } -+ } -+ -+ fn run(&mut self, shutdown: Arc) -> Result<()> { -+ let now = Instant::now(); -+ let mut next_tune_at = now + self.tune_interval; -+ let mut next_sched_at = now + self.sched_interval; -+ -+ while !shutdown.load(Ordering::Relaxed) && self.read_bpf_exit_kind() == 0 { -+ let now = Instant::now(); -+ -+ if now >= next_tune_at { -+ self.tuner.step(&mut self.skel)?; -+ next_tune_at += self.tune_interval; -+ if next_tune_at < now { -+ next_tune_at = now + self.tune_interval; -+ } -+ } -+ -+ if now >= next_sched_at { -+ self.lb_step()?; -+ next_sched_at += self.sched_interval; -+ if next_sched_at < now { -+ next_sched_at = now + self.sched_interval; -+ } -+ } -+ -+ std::thread::sleep( -+ next_sched_at -+ .min(next_tune_at) -+ .duration_since(Instant::now()), -+ ); -+ } -+ -+ self.report_bpf_exit_kind() -+ } -+} -+ -+impl<'a> Drop for Scheduler<'a> { -+ fn drop(&mut self) { -+ if let Some(struct_ops) = self.struct_ops.take() { -+ drop(struct_ops); -+ } -+ } -+} -+ -+fn main() -> Result<()> { -+ let opts = Opts::parse(); -+ -+ let llv = match opts.verbose { -+ 0 => simplelog::LevelFilter::Info, -+ 1 => simplelog::LevelFilter::Debug, -+ _ => simplelog::LevelFilter::Trace, -+ }; -+ let mut lcfg = simplelog::ConfigBuilder::new(); -+ lcfg.set_time_level(simplelog::LevelFilter::Error) -+ .set_location_level(simplelog::LevelFilter::Off) -+ .set_target_level(simplelog::LevelFilter::Off) -+ .set_thread_level(simplelog::LevelFilter::Off); -+ simplelog::TermLogger::init( -+ llv, -+ lcfg.build(), -+ simplelog::TerminalMode::Stderr, -+ simplelog::ColorChoice::Auto, -+ )?; -+ -+ let mut sched = Scheduler::init(&opts)?; -+ -+ let shutdown = Arc::new(AtomicBool::new(false)); -+ let shutdown_clone = shutdown.clone(); -+ ctrlc::set_handler(move || { -+ shutdown_clone.store(true, Ordering::Relaxed); -+ }) -+ .context("Error setting Ctrl-C handler")?; -+ -+ sched.run(shutdown) -+} -diff --git a/tools/sched_ext/scx_rusty/src/rusty_sys.rs b/tools/sched_ext/scx_rusty/src/rusty_sys.rs -new file mode 100644 -index 000000000..e948d81e7 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/src/rusty_sys.rs -@@ -0,0 +1,10 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. 
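[Editor's aside, not part of the patch] Scheduler::run() above interleaves two periodic jobs -- the high-frequency tuner and the slower load balancer -- by keeping a separate next-deadline for each, running whichever is due, and sleeping until the earlier of the two deadlines; a deadline is resynced if a step overran it so the loop does not fire repeatedly to "catch up". A minimal standalone Rust sketch of that pattern follows; the intervals and the iteration bound are illustrative values chosen here, not the patch's defaults.

    use std::time::{Duration, Instant};

    // Standalone sketch (not part of the patch) of the two-timer loop used by
    // Scheduler::run(): one deadline per job, run what is due, sleep until the
    // nearer deadline.
    fn main() {
        let tune_interval = Duration::from_millis(100);
        let sched_interval = Duration::from_millis(500);

        let now = Instant::now();
        let mut next_tune_at = now + tune_interval;
        let mut next_sched_at = now + sched_interval;

        for _ in 0..20 {
            let now = Instant::now();

            if now >= next_tune_at {
                println!("tuner step");
                next_tune_at += tune_interval;
                // If a long step pushed us past the deadline, resync instead
                // of firing back-to-back.
                if next_tune_at < now {
                    next_tune_at = now + tune_interval;
                }
            }

            if now >= next_sched_at {
                println!("load-balance step");
                next_sched_at += sched_interval;
                if next_sched_at < now {
                    next_sched_at = now + sched_interval;
                }
            }

            // Sleep until whichever job is due first.
            let wake = next_sched_at.min(next_tune_at);
            std::thread::sleep(wake.saturating_duration_since(Instant::now()));
        }
    }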
-+#![allow(non_upper_case_globals)] -+#![allow(non_camel_case_types)] -+#![allow(non_snake_case)] -+#![allow(dead_code)] -+ -+include!(concat!(env!("OUT_DIR"), "/rusty_sys.rs")); -diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c -new file mode 100644 -index 000000000..56b589d7f ---- /dev/null -+++ b/tools/sched_ext/scx_simple.bpf.c -@@ -0,0 +1,143 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple scheduler. -+ * -+ * By default, it operates as a simple global weighted vtime scheduler and can -+ * be switched to FIFO scheduling. It also demonstrates the following niceties. -+ * -+ * - Statistics tracking how many tasks are queued to local and global dsq's. -+ * - Termination notification for userspace. -+ * -+ * While very simple, this scheduler should work reasonably well on CPUs with a -+ * uniform L3 cache topology. While preemption is not implemented, the fact that -+ * the scheduling queue is shared across all CPUs means that whatever is at the -+ * front of the queue is likely to be executed fairly quickly given enough -+ * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads -+ * but comes with the usual problems with FIFO scheduling where saturating -+ * threads can easily drown out interactive ones. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include "scx_common.bpf.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool fifo_sched; -+const volatile bool switch_partial; -+ -+static u64 vtime_now; -+struct user_exit_info uei; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, 2); /* [local, global] */ -+} stats SEC(".maps"); -+ -+static void stat_inc(u32 idx) -+{ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ /* -+ * If scx_select_cpu_dfl() is setting %SCX_ENQ_LOCAL, it indicates that -+ * running @p on its CPU directly shouldn't affect fairness. Just queue -+ * it on the local FIFO. -+ */ -+ if (enq_flags & SCX_ENQ_LOCAL) { -+ stat_inc(0); /* count local queueing */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ return; -+ } -+ -+ stat_inc(1); /* count global queueing */ -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 vtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) -+ vtime = vtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, vtime, -+ enq_flags); -+ } -+} -+ -+void BPF_STRUCT_OPS(simple_running, struct task_struct *p) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. 
-+ */ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. -+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. -+ */ -+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(simple_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+s32 BPF_STRUCT_OPS(simple_init) -+{ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops simple_ops = { -+ .enqueue = (void *)simple_enqueue, -+ .running = (void *)simple_running, -+ .stopping = (void *)simple_stopping, -+ .enable = (void *)simple_enable, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple", -+}; -diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c -new file mode 100644 -index 000000000..900f1c3e7 ---- /dev/null -+++ b/tools/sched_ext/scx_simple.c -@@ -0,0 +1,99 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_simple.skel.h" -+ -+const char help_fmt[] = -+"A simple sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-f] [-p]\n" -+"\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int simple) -+{ -+ exit_req = 1; -+} -+ -+static void read_stats(struct scx_simple *skel, __u64 *stats) -+{ -+ int nr_cpus = libbpf_num_possible_cpus(); -+ __u64 cnts[2][nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * 2); -+ -+ for (idx = 0; idx < 2; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_simple *skel; -+ struct bpf_link *link; -+ __u32 opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_simple__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ while ((opt = getopt(argc, argv, "fph")) != -1) { -+ switch (opt) { -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_BUG_ON(scx_simple__load(skel), "Failed to load skel"); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.simple_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && 
!uei_exited(&skel->bss->uei)) { -+ __u64 stats[2]; -+ -+ read_stats(skel, stats); -+ printf("local=%llu global=%llu\n", stats[0], stats[1]); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_simple__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c -new file mode 100644 -index 000000000..9e107a874 ---- /dev/null -+++ b/tools/sched_ext/scx_userland.bpf.c -@@ -0,0 +1,262 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A minimal userland scheduler. -+ * -+ * In terms of scheduling, this provides two different types of behaviors: -+ * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. -+ * All such tasks are direct-dispatched from the kernel, and are never -+ * enqueued in user space. -+ * 2. A primitive vruntime scheduler that is implemented in user space, for all -+ * other tasks. -+ * -+ * Some parts of this example user space scheduler could be implemented more -+ * efficiently using more complex and sophisticated data structures. For -+ * example, rather than using BPF_MAP_TYPE_QUEUE's, -+ * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between -+ * user space and kernel space. Similarly, we use a simple vruntime-sorted list -+ * in user space, but an rbtree could be used instead. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include "scx_common.bpf.h" -+#include "scx_userland.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool switch_partial; -+const volatile s32 usersched_pid; -+ -+/* !0 for veristat, set during init */ -+const volatile u32 num_possible_cpus = 64; -+ -+/* Stats that are printed by user space. */ -+u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; -+ -+struct user_exit_info uei; -+ -+/* -+ * Whether the user space scheduler needs to be scheduled due to a task being -+ * enqueued in user space. -+ */ -+static bool usersched_needed; -+ -+/* -+ * The map containing tasks that are enqueued in user space from the kernel. -+ * -+ * This map is drained by the user space scheduler. -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, USERLAND_MAX_TASKS); -+ __type(value, struct scx_userland_enqueued_task); -+} enqueued SEC(".maps"); -+ -+/* -+ * The map containing tasks that are dispatched to the kernel from user space. -+ * -+ * Drained by the kernel in userland_dispatch(). -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, USERLAND_MAX_TASKS); -+ __type(value, s32); -+} dispatched SEC(".maps"); -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* Dispatch directly to local DSQ */ -+}; -+ -+/* Map that contains task-local storage. */ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+static bool is_usersched_task(const struct task_struct *p) -+{ -+ return p->pid == usersched_pid; -+} -+ -+static bool keep_in_kernel(const struct task_struct *p) -+{ -+ return p->nr_cpus_allowed < num_possible_cpus; -+} -+ -+static struct task_struct *usersched_task(void) -+{ -+ struct task_struct *p; -+ -+ p = bpf_task_from_pid(usersched_pid); -+ /* -+ * Should never happen -- the usersched task should always be managed -+ * by sched_ext. 
-+ */ -+ if (!p) -+ scx_bpf_error("Failed to find usersched task %d", usersched_pid); -+ -+ return p; -+} -+ -+s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ if (keep_in_kernel(p)) { -+ s32 cpu; -+ struct task_ctx *tctx; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("Failed to look up task-local storage for %s", p->comm); -+ return -ESRCH; -+ } -+ -+ if (p->nr_cpus_allowed == 1 || -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ tctx->force_local = true; -+ return prev_cpu; -+ } -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) { -+ tctx->force_local = true; -+ return cpu; -+ } -+ } -+ -+ return prev_cpu; -+} -+ -+static void dispatch_user_scheduler(void) -+{ -+ struct task_struct *p; -+ -+ usersched_needed = false; -+ p = usersched_task(); -+ if (p) { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+ } -+} -+ -+static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) -+{ -+ struct scx_userland_enqueued_task task; -+ -+ memset(&task, 0, sizeof(task)); -+ task.pid = p->pid; -+ task.sum_exec_runtime = p->se.sum_exec_runtime; -+ task.weight = p->scx.weight; -+ -+ if (bpf_map_push_elem(&enqueued, &task, 0)) { -+ /* -+ * If we fail to enqueue the task in user space, put it -+ * directly on the global DSQ. -+ */ -+ __sync_fetch_and_add(&nr_failed_enqueues, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ __sync_fetch_and_add(&nr_user_enqueues, 1); -+ usersched_needed = true; -+ } -+} -+ -+void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ if (keep_in_kernel(p)) { -+ u64 dsq_id = SCX_DSQ_GLOBAL; -+ struct task_ctx *tctx; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("Failed to lookup task ctx for %s", p->comm); -+ return; -+ } -+ -+ if (tctx->force_local) -+ dsq_id = SCX_DSQ_LOCAL; -+ tctx->force_local = false; -+ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); -+ __sync_fetch_and_add(&nr_kernel_enqueues, 1); -+ return; -+ } else if (!is_usersched_task(p)) { -+ enqueue_task_in_user_space(p, enq_flags); -+ } -+} -+ -+void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ if (usersched_needed) -+ dispatch_user_scheduler(); -+ -+ bpf_repeat(4096) { -+ s32 pid; -+ struct task_struct *p; -+ -+ if (bpf_map_pop_elem(&dispatched, &pid)) -+ break; -+ -+ /* -+ * The task could have exited by the time we get around to -+ * dispatching it. Treat this as a normal occurrence, and simply -+ * move onto the next iteration. 
-+ */ -+ p = bpf_task_from_pid(pid); -+ if (!p) -+ continue; -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+ } -+} -+ -+s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+s32 BPF_STRUCT_OPS(userland_init) -+{ -+ if (num_possible_cpus == 0) { -+ scx_bpf_error("User scheduler # CPUs uninitialized (%d)", -+ num_possible_cpus); -+ return -EINVAL; -+ } -+ -+ if (usersched_pid <= 0) { -+ scx_bpf_error("User scheduler pid uninitialized (%d)", -+ usersched_pid); -+ return -EINVAL; -+ } -+ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops userland_ops = { -+ .select_cpu = (void *)userland_select_cpu, -+ .enqueue = (void *)userland_enqueue, -+ .dispatch = (void *)userland_dispatch, -+ .prep_enable = (void *)userland_prep_enable, -+ .init = (void *)userland_init, -+ .exit = (void *)userland_exit, -+ .timeout_ms = 3000, -+ .name = "userland", -+}; -diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c -new file mode 100644 -index 000000000..a750f10df ---- /dev/null -+++ b/tools/sched_ext/scx_userland.c -@@ -0,0 +1,366 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext user space scheduler which provides vruntime semantics -+ * using a simple ordered-list implementation. -+ * -+ * Each CPU in the system resides in a single, global domain. This precludes -+ * the need to do any load balancing between domains. The scheduler could -+ * easily be extended to support multiple domains, with load balancing -+ * happening in user space. -+ * -+ * Any task which has any CPU affinity is scheduled entirely in BPF. This -+ * program only schedules tasks which may run on any CPU. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "scx_common.h" -+#include "scx_userland.h" -+#include "scx_userland.skel.h" -+ -+const char help_fmt[] = -+"A minimal userland sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-b BATCH] [-p]\n" -+"\n" -+" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" -+" -p Don't switch all, switch only tasks on SCHED_EXT policy\n" -+" -h Display this help and exit\n"; -+ -+/* Defined in UAPI */ -+#define SCHED_EXT 7 -+ -+/* Number of tasks to batch when dispatching to user space. */ -+static __u32 batch_size = 8; -+ -+static volatile int exit_req; -+static int enqueued_fd, dispatched_fd; -+ -+static struct scx_userland *skel; -+static struct bpf_link *ops_link; -+ -+/* Stats collected in user space. */ -+static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; -+ -+/* The data structure containing tasks that are enqueued in user space. */ -+struct enqueued_task { -+ LIST_ENTRY(enqueued_task) entries; -+ __u64 sum_exec_runtime; -+ double vruntime; -+}; -+ -+/* -+ * Use a vruntime-sorted list to store tasks. This could easily be extended to -+ * a more optimal data structure, such as an rbtree as is done in CFS. 
We -+ * currently elect to use a sorted list to simplify the example for -+ * illustrative purposes. -+ */ -+LIST_HEAD(listhead, enqueued_task); -+ -+/* -+ * A vruntime-sorted list of tasks. The head of the list contains the task with -+ * the lowest vruntime. That is, the task that has the "highest" claim to be -+ * scheduled. -+ */ -+static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); -+ -+/* -+ * The statically allocated array of tasks. We use a statically allocated list -+ * here to avoid having to allocate on the enqueue path, which could cause a -+ * deadlock. A more substantive user space scheduler could e.g. provide a hook -+ * for newly enabled tasks that are passed to the scheduler from the -+ * .prep_enable() callback to allows the scheduler to allocate on safe paths. -+ */ -+struct enqueued_task tasks[USERLAND_MAX_TASKS]; -+ -+static double min_vruntime; -+ -+static void sigint_handler(int userland) -+{ -+ exit_req = 1; -+} -+ -+static __u32 task_pid(const struct enqueued_task *task) -+{ -+ return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); -+} -+ -+static int dispatch_task(__s32 pid) -+{ -+ int err; -+ -+ err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); -+ if (err) { -+ fprintf(stderr, "Failed to dispatch task %d\n", pid); -+ exit_req = 1; -+ } else { -+ nr_vruntime_dispatches++; -+ } -+ -+ return err; -+} -+ -+static struct enqueued_task *get_enqueued_task(__s32 pid) -+{ -+ if (pid >= USERLAND_MAX_TASKS) -+ return NULL; -+ -+ return &tasks[pid]; -+} -+ -+static double calc_vruntime_delta(__u64 weight, __u64 delta) -+{ -+ double weight_f = (double)weight / 100.0; -+ double delta_f = (double)delta; -+ -+ return delta_f / weight_f; -+} -+ -+static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) -+{ -+ __u64 delta; -+ -+ delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; -+ -+ enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); -+ if (min_vruntime > enqueued->vruntime) -+ enqueued->vruntime = min_vruntime; -+ enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; -+} -+ -+static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) -+{ -+ struct enqueued_task *curr, *enqueued, *prev; -+ -+ curr = get_enqueued_task(bpf_task->pid); -+ if (!curr) -+ return ENOENT; -+ -+ update_enqueued(curr, bpf_task); -+ nr_vruntime_enqueues++; -+ -+ /* -+ * Enqueue the task in a vruntime-sorted list. A more optimal data -+ * structure such as an rbtree could easily be used as well. We elect -+ * to use a list here simply because it's less code, and thus the -+ * example is less convoluted and better serves to illustrate what a -+ * user space scheduler could look like. 
-+ */ -+ -+ if (LIST_EMPTY(&vruntime_head)) { -+ LIST_INSERT_HEAD(&vruntime_head, curr, entries); -+ return 0; -+ } -+ -+ LIST_FOREACH(enqueued, &vruntime_head, entries) { -+ if (curr->vruntime <= enqueued->vruntime) { -+ LIST_INSERT_BEFORE(enqueued, curr, entries); -+ return 0; -+ } -+ prev = enqueued; -+ } -+ -+ LIST_INSERT_AFTER(prev, curr, entries); -+ -+ return 0; -+} -+ -+static void drain_enqueued_map(void) -+{ -+ while (1) { -+ struct scx_userland_enqueued_task task; -+ int err; -+ -+ if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) -+ return; -+ -+ err = vruntime_enqueue(&task); -+ if (err) { -+ fprintf(stderr, "Failed to enqueue task %d: %s\n", -+ task.pid, strerror(err)); -+ exit_req = 1; -+ return; -+ } -+ } -+} -+ -+static void dispatch_batch(void) -+{ -+ __u32 i; -+ -+ for (i = 0; i < batch_size; i++) { -+ struct enqueued_task *task; -+ int err; -+ __s32 pid; -+ -+ task = LIST_FIRST(&vruntime_head); -+ if (!task) -+ return; -+ -+ min_vruntime = task->vruntime; -+ pid = task_pid(task); -+ LIST_REMOVE(task, entries); -+ err = dispatch_task(pid); -+ if (err) { -+ fprintf(stderr, "Failed to dispatch task %d in %u\n", -+ pid, i); -+ return; -+ } -+ } -+} -+ -+static void *run_stats_printer(void *arg) -+{ -+ while (!exit_req) { -+ __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; -+ -+ nr_failed_enqueues = skel->bss->nr_failed_enqueues; -+ nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; -+ nr_user_enqueues = skel->bss->nr_user_enqueues; -+ total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; -+ -+ printf("o-----------------------o\n"); -+ printf("| BPF ENQUEUES |\n"); -+ printf("|-----------------------|\n"); -+ printf("| kern: %10llu |\n", nr_kernel_enqueues); -+ printf("| user: %10llu |\n", nr_user_enqueues); -+ printf("| failed: %10llu |\n", nr_failed_enqueues); -+ printf("| -------------------- |\n"); -+ printf("| total: %10llu |\n", total); -+ printf("| |\n"); -+ printf("|-----------------------|\n"); -+ printf("| VRUNTIME / USER |\n"); -+ printf("|-----------------------|\n"); -+ printf("| enq: %10llu |\n", nr_vruntime_enqueues); -+ printf("| disp: %10llu |\n", nr_vruntime_dispatches); -+ printf("o-----------------------o\n"); -+ printf("\n\n"); -+ sleep(1); -+ } -+ -+ return NULL; -+} -+ -+static int spawn_stats_thread(void) -+{ -+ pthread_t stats_printer; -+ -+ return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); -+} -+ -+static void bootstrap(int argc, char **argv) -+{ -+ int err; -+ __u32 opt; -+ struct sched_param sched_param = { -+ .sched_priority = sched_get_priority_max(SCHED_EXT), -+ }; -+ bool switch_partial = false; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ /* -+ * Enforce that the user scheduler task is managed by sched_ext. The -+ * task eagerly drains the list of enqueued tasks in its main work -+ * loop, and then yields the CPU. The BPF scheduler only schedules the -+ * user space scheduler task when at least one other task in the system -+ * needs to be scheduled. 
-+ */ -+ err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); -+ SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); -+ -+ while ((opt = getopt(argc, argv, "b:ph")) != -1) { -+ switch (opt) { -+ case 'b': -+ batch_size = strtoul(optarg, NULL, 0); -+ break; -+ case 'p': -+ switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ exit(opt != 'h'); -+ } -+ } -+ -+ /* -+ * It's not always safe to allocate in a user space scheduler, as an -+ * enqueued task could hold a lock that we require in order to be able -+ * to allocate. -+ */ -+ err = mlockall(MCL_CURRENT | MCL_FUTURE); -+ SCX_BUG_ON(err, "Failed to prefault and lock address space"); -+ -+ skel = scx_userland__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); -+ assert(skel->rodata->num_possible_cpus > 0); -+ skel->rodata->usersched_pid = getpid(); -+ assert(skel->rodata->usersched_pid > 0); -+ skel->rodata->switch_partial = switch_partial; -+ -+ SCX_BUG_ON(scx_userland__load(skel), "Failed to load skel"); -+ -+ enqueued_fd = bpf_map__fd(skel->maps.enqueued); -+ dispatched_fd = bpf_map__fd(skel->maps.dispatched); -+ assert(enqueued_fd > 0); -+ assert(dispatched_fd > 0); -+ -+ SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); -+ -+ ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops); -+ SCX_BUG_ON(!ops_link, "Failed to attach struct_ops"); -+} -+ -+static void sched_main_loop(void) -+{ -+ while (!exit_req) { -+ /* -+ * Perform the following work in the main user space scheduler -+ * loop: -+ * -+ * 1. Drain all tasks from the enqueued map, and enqueue them -+ * to the vruntime sorted list. -+ * -+ * 2. Dispatch a batch of tasks from the vruntime sorted list -+ * down to the kernel. -+ * -+ * 3. Yield the CPU back to the system. The BPF scheduler will -+ * reschedule the user space scheduler once another task has -+ * been enqueued to user space. -+ */ -+ drain_enqueued_map(); -+ dispatch_batch(); -+ sched_yield(); -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ bootstrap(argc, argv); -+ sched_main_loop(); -+ -+ exit_req = 1; -+ bpf_link__destroy(ops_link); -+ uei_print(&skel->bss->uei); -+ scx_userland__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h -new file mode 100644 -index 000000000..639c6809c ---- /dev/null -+++ b/tools/sched_ext/scx_userland.h -@@ -0,0 +1,19 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* Copyright (c) 2022 Meta, Inc */ -+ -+#ifndef __SCX_USERLAND_COMMON_H -+#define __SCX_USERLAND_COMMON_H -+ -+#define USERLAND_MAX_TASKS 8192 -+ -+/* -+ * An instance of a task that has been enqueued by the kernel for consumption -+ * by a user space global scheduler thread. -+ */ -+struct scx_userland_enqueued_task { -+ __s32 pid; -+ u64 sum_exec_runtime; -+ u64 weight; -+}; -+ -+#endif // __SCX_USERLAND_COMMON_H -diff --git a/tools/sched_ext/user_exit_info.h b/tools/sched_ext/user_exit_info.h -new file mode 100644 -index 000000000..f0e45bf3c ---- /dev/null -+++ b/tools/sched_ext/user_exit_info.h -@@ -0,0 +1,50 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Define struct user_exit_info which is shared between BPF and userspace parts -+ * to communicate exit status and other information. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __USER_EXIT_INFO_H -+#define __USER_EXIT_INFO_H -+ -+struct user_exit_info { -+ int kind; -+ char reason[128]; -+ char msg[1024]; -+}; -+ -+#ifdef __bpf__ -+ -+#include "vmlinux.h" -+#include -+ -+static inline void uei_record(struct user_exit_info *uei, -+ const struct scx_exit_info *ei) -+{ -+ bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); -+ bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); -+ /* use __sync to force memory barrier */ -+ __sync_val_compare_and_swap(&uei->kind, uei->kind, ei->kind); -+} -+ -+#else /* !__bpf__ */ -+ -+static inline bool uei_exited(struct user_exit_info *uei) -+{ -+ /* use __sync to force memory barrier */ -+ return __sync_val_compare_and_swap(&uei->kind, -1, -1); -+} -+ -+static inline void uei_print(const struct user_exit_info *uei) -+{ -+ fprintf(stderr, "EXIT: %s", uei->reason); -+ if (uei->msg[0] != '\0') -+ fprintf(stderr, " (%s)", uei->msg); -+ fputs("\n", stderr); -+} -+ -+#endif /* __bpf__ */ -+#endif /* __USER_EXIT_INFO_H */ --- -2.43.0.rc2 - diff --git a/sys-kernel/git-sources/0001-more-uarches-for-kernel-6.8-rc4+.patch b/sys-kernel/gentoo-sources-6.6/0100-more-ISA-levels-and-uarches-for-kernel-6.1.79-6.8-rc3.patch similarity index 61% rename from sys-kernel/git-sources/0001-more-uarches-for-kernel-6.8-rc4+.patch rename to sys-kernel/gentoo-sources-6.6/0100-more-ISA-levels-and-uarches-for-kernel-6.1.79-6.8-rc3.patch index 75c48bf..50f27db 100644 --- a/sys-kernel/git-sources/0001-more-uarches-for-kernel-6.8-rc4+.patch +++ b/sys-kernel/gentoo-sources-6.6/0100-more-ISA-levels-and-uarches-for-kernel-6.1.79-6.8-rc3.patch @@ -1,26 +1,37 @@ -From 71dd30c3e2ab2852b0290ae1f34ce1c7f8655040 Mon Sep 17 00:00:00 2001 -From: graysky -Date: Wed, 21 Feb 2024 08:38:13 -0500 +From a4ebe91654460da51b0327f3d0a051aaeab2d423 Mon Sep 17 00:00:00 2001 +From: graysky +Date: Mon, 16 Sep 2024 05:55:58 -0400 FEATURES -This patch adds additional CPU options to the Linux kernel accessible under: - Processor type and features ---> - Processor family ---> +This patch adds additional tunings via new x86-64 ISA levels and +more micro-architecture options to the Linux kernel in three classes. -With the release of gcc 11.1 and clang 12.0, several generic 64-bit levels are -offered which are good for supported Intel or AMD CPUs: -• x86-64-v2 -• x86-64-v3 -• x86-64-v4 +1. New generic x86-64 ISA levels + +These are selectable under: + Processor type and features ---> x86-64 compiler ISA level + +• x86-64 A value of (1) is the default +• x86-64-v2 A value of (2) brings support for vector + instructions up to Streaming SIMD Extensions 4.2 (SSE4.2) + and Supplemental Streaming SIMD Extensions 3 (SSSE3), the + POPCNT instruction, and CMPXCHG16B. +• x86-64-v3 A value of (3) adds vector instructions up to AVX2, MOVBE, + and additional bit-manipulation instructions. + +There is also x86-64-v4 but including this makes little sense as +the kernel does not use any of the AVX512 instructions anyway. Users of glibc 2.33 and above can see which level is supported by running: - /lib/ld-linux-x86-64.so.2 --help | grep supported + /lib/ld-linux-x86-64.so.2 --help | grep supported Or - /lib64/ld-linux-x86-64.so.2 --help | grep supported + /lib64/ld-linux-x86-64.so.2 --help | grep supported + +2. 
New micro-architectures -Alternatively, compare the flags from /proc/cpuinfo to this list.[1] +These are selectable under: + Processor type and features ---> Processor family -CPU-specific microarchitectures include: • AMD Improved K8-family • AMD K10-family • AMD Family 10h (Barcelona) @@ -32,8 +43,9 @@ CPU-specific microarchitectures include: • AMD Family 15h (Excavator) • AMD Family 17h (Zen) • AMD Family 17h (Zen 2) -• AMD Family 19h (Zen 3)† -• AMD Family 19h (Zen 4)§ +• AMD Family 19h (Zen 3)** +• AMD Family 19h (Zen 4)‡ +• AMD Family 1Ah (Zen 5)§ • Intel Silvermont low-power processors • Intel Goldmont low-power processors (Apollo Lake and Denverton) • Intel Goldmont Plus low-power processors (Gemini Lake) @@ -50,24 +62,27 @@ CPU-specific microarchitectures include: • Intel Xeon (Cascade Lake) • Intel Xeon (Cooper Lake)* • Intel 3rd Gen 10nm++ i3/i5/i7/i9-family (Tiger Lake)* -• Intel 4th Gen 10nm++ Xeon (Sapphire Rapids)‡ -• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)‡ -• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)‡ -• Intel 13th Gen i3/i5/i7/i9-family (Raptor Lake)§ -• Intel 14th Gen i3/i5/i7/i9-family (Meteor Lake)§ -• Intel 5th Gen 10nm++ Xeon (Emerald Rapids)§ +• Intel 4th Gen 10nm++ Xeon (Sapphire Rapids)† +• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)† +• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)† +• Intel 13th Gen i3/i5/i7/i9-family (Raptor Lake)‡ +• Intel 14th Gen i3/i5/i7/i9-family (Meteor Lake)‡ +• Intel 5th Gen 10nm++ Xeon (Emerald Rapids)‡ Notes: If not otherwise noted, gcc >=9.1 is required for support. *Requires gcc >=10.1 or clang >=10.0 - †Required gcc >=10.3 or clang >=12.0 - ‡Required gcc >=11.1 or clang >=12.0 - §Required gcc >=13.0 or clang >=15.0.5 + **Required gcc >=10.3 or clang >=12.0 + †Required gcc >=11.1 or clang >=12.0 + ‡Required gcc >=13.0 or clang >=15.0.5 + §Required gcc >14.0 or clang >=19.0? -It also offers to compile passing the 'native' option which, "selects the CPU +3. Auto-detected micro-architecture levels + +Compile by passing the '-march=native' option which, "selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. Using -march=native enables all instruction subsets supported by the local machine and will produce code optimized for the local -machine under the constraints of the selected instruction set."[2] +machine under the constraints of the selected instruction set."[1] Users of Intel CPUs should select the 'Intel-Native' option and users of AMD CPUs should select the 'AMD-Native' option. @@ -75,9 +90,9 @@ CPUs should select the 'AMD-Native' option. MINOR NOTES RELATING TO INTEL ATOM PROCESSORS This patch also changes -march=atom to -march=bonnell in accordance with the gcc v4.9 changes. Upstream is using the deprecated -match=atom flags when I -believe it should use the newer -march=bonnell flag for atom processors.[3] +believe it should use the newer -march=bonnell flag for atom processors.[2] -It is not recommended to compile on Atom-CPUs with the 'native' option.[4] The +It is not recommended to compile on Atom-CPUs with the 'native' option.[3] The recommendation is to use the 'atom' option instead. BENEFITS @@ -85,41 +100,43 @@ Small but real speed increases are measurable using a make endpoint comparing a generic kernel to one built with one of the respective microarchs. 
See the following experimental evidence supporting this statement: -https://github.com/graysky2/kernel_gcc_patch +https://github.com/graysky2/kernel_compiler_patch?tab=readme-ov-file#benchmarks REQUIREMENTS -linux version 5.17+ +linux version 6.8-rc3+ gcc version >=9.0 or clang version >=9.0 ACKNOWLEDGMENTS -This patch builds on the seminal work by Jeroen.[5] +This patch builds on the seminal work by Jeroen.[4] REFERENCES -1. https://gitlab.com/x86-psABIs/x86-64-ABI/-/commit/77566eb03bc6a326811cb7e9 -2. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options -3. https://bugzilla.kernel.org/show_bug.cgi?id=77461 -4. https://github.com/graysky2/kernel_gcc_patch/issues/15 -5. http://www.linuxforge.net/docs/linux/linux-gcc.php +1. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options +2. https://bugzilla.kernel.org/show_bug.cgi?id=77461 +3. https://github.com/graysky2/kernel_gcc_patch/issues/15 +4. http://www.linuxforge.net/docs/linux/linux-gcc.php + --- - arch/x86/Kconfig.cpu | 424 ++++++++++++++++++++++++++++++-- - arch/x86/Makefile | 44 +++- - arch/x86/include/asm/vermagic.h | 74 ++++++ - 3 files changed, 526 insertions(+), 16 deletions(-) + arch/x86/Kconfig.cpu | 359 ++++++++++++++++++++++++++++++-- + arch/x86/Makefile | 89 +++++++- + arch/x86/include/asm/vermagic.h | 70 +++++++ + 3 files changed, 500 insertions(+), 18 deletions(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu -index 2a7279d80460a..6924a0f5f1c26 100644 +index ce5ed2c2db0c..1cd49fac2ac9 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu -@@ -157,7 +157,7 @@ config MPENTIUM4 - +@@ -155,9 +155,8 @@ config MPENTIUM4 + -Paxville + -Dempsey +- config MK6 - bool "K6/K6-II/K6-III" + bool "AMD K6/K6-II/K6-III" depends on X86_32 help Select this for an AMD K6-family processor. Enables use of -@@ -165,7 +165,7 @@ config MK6 +@@ -165,7 +164,7 @@ config MK6 flags to GCC. config MK7 @@ -128,7 +145,7 @@ index 2a7279d80460a..6924a0f5f1c26 100644 depends on X86_32 help Select this for an AMD Athlon K7-family processor. Enables use of -@@ -173,12 +173,106 @@ config MK7 +@@ -173,12 +172,114 @@ config MK7 flags to GCC. config MK8 @@ -232,44 +249,59 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + Select this for AMD Family 19h Zen 4 processors. + + Enables -march=znver4 ++ ++config MZEN5 ++ bool "AMD Zen 5" ++ depends on (CC_IS_GCC && GCC_VERSION > 140000) || (CC_IS_CLANG && CLANG_VERSION >= 191000) ++ help ++ Select this for AMD Family 19h Zen 5 processors. ++ ++ Enables -march=znver5 + config MCRUSOE bool "Crusoe" depends on X86_32 -@@ -270,7 +364,7 @@ config MPSC +@@ -269,8 +370,17 @@ config MPSC + using the cpu family field in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. ++config MATOM ++ bool "Intel Atom" ++ help ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ config MCORE2 - bool "Core 2/newer Xeon" + bool "Intel Core 2" help Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and -@@ -278,6 +372,8 @@ config MCORE2 +@@ -278,14 +388,191 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) +-config MATOM +- bool "Intel Atom" + Enables -march=core2 + - config MATOM - bool "Intel Atom" - help -@@ -287,6 +383,212 @@ config MATOM - accordingly optimized code. 
Use a recent GCC with specific Atom - support in order to fully benefit from selecting this option. - +config MNEHALEM + bool "Intel Nehalem" -+ select X86_P6_NOP -+ help -+ + help + +- Select this for the Intel Atom platform. Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. + Select this for 1st Gen Core processors in the Nehalem family. + + Enables -march=nehalem + +config MWESTMERE + bool "Intel Westmere" -+ select X86_P6_NOP + help + + Select this for the Intel Westmere formerly Nehalem-C family. @@ -278,7 +310,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MSILVERMONT + bool "Intel Silvermont" -+ select X86_P6_NOP + help + + Select this for the Intel Silvermont platform. @@ -287,7 +318,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MGOLDMONT + bool "Intel Goldmont" -+ select X86_P6_NOP + help + + Select this for the Intel Goldmont platform including Apollo Lake and Denverton. @@ -296,7 +326,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MGOLDMONTPLUS + bool "Intel Goldmont Plus" -+ select X86_P6_NOP + help + + Select this for the Intel Goldmont Plus platform including Gemini Lake. @@ -305,7 +334,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MSANDYBRIDGE + bool "Intel Sandy Bridge" -+ select X86_P6_NOP + help + + Select this for 2nd Gen Core processors in the Sandy Bridge family. @@ -314,7 +342,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MIVYBRIDGE + bool "Intel Ivy Bridge" -+ select X86_P6_NOP + help + + Select this for 3rd Gen Core processors in the Ivy Bridge family. @@ -323,7 +350,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MHASWELL + bool "Intel Haswell" -+ select X86_P6_NOP + help + + Select this for 4th Gen Core processors in the Haswell family. @@ -332,7 +358,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MBROADWELL + bool "Intel Broadwell" -+ select X86_P6_NOP + help + + Select this for 5th Gen Core processors in the Broadwell family. @@ -341,7 +366,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MSKYLAKE + bool "Intel Skylake" -+ select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake family. @@ -350,7 +374,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MSKYLAKEX + bool "Intel Skylake X" -+ select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake X family. @@ -359,7 +382,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MCANNONLAKE + bool "Intel Cannon Lake" -+ select X86_P6_NOP + help + + Select this for 8th Gen Core processors @@ -368,7 +390,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MICELAKE + bool "Intel Ice Lake" -+ select X86_P6_NOP + help + + Select this for 10th Gen Core processors in the Ice Lake family. @@ -377,7 +398,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MCASCADELAKE + bool "Intel Cascade Lake" -+ select X86_P6_NOP + help + + Select this for Xeon processors in the Cascade Lake family. @@ -387,7 +407,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MCOOPERLAKE + bool "Intel Cooper Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) -+ select X86_P6_NOP + help + + Select this for Xeon processors in the Cooper Lake family. 
@@ -397,7 +416,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MTIGERLAKE + bool "Intel Tiger Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) -+ select X86_P6_NOP + help + + Select this for third-generation 10 nm process processors in the Tiger Lake family. @@ -407,7 +425,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MSAPPHIRERAPIDS + bool "Intel Sapphire Rapids" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP + help + + Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family. @@ -417,7 +434,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MROCKETLAKE + bool "Intel Rocket Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP + help + + Select this for eleventh-generation processors in the Rocket Lake family. @@ -427,7 +443,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MALDERLAKE + bool "Intel Alder Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP + help + + Select this for twelfth-generation processors in the Alder Lake family. @@ -437,7 +452,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MRAPTORLAKE + bool "Intel Raptor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) -+ select X86_P6_NOP + help + + Select this for thirteenth-generation processors in the Raptor Lake family. @@ -447,7 +461,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MMETEORLAKE + bool "Intel Meteor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) -+ select X86_P6_NOP + help + + Select this for fourteenth-generation processors in the Meteor Lake family. @@ -457,44 +470,18 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MEMERALDRAPIDS + bool "Intel Emerald Rapids" + depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) -+ select X86_P6_NOP + help + + Select this for fifth-generation 10 nm process processors in the Emerald Rapids family. + + Enables -march=emeraldrapids -+ + config GENERIC_CPU bool "Generic-x86-64" - depends on X86_64 -@@ -294,6 +596,50 @@ config GENERIC_CPU +@@ -294,6 +581,26 @@ config GENERIC_CPU Generic x86-64 CPU. Run equally well on all x86-64 CPUs. -+config GENERIC_CPU2 -+ bool "Generic-x86-64-v2" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64 CPU. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v2. -+ -+config GENERIC_CPU3 -+ bool "Generic-x86-64-v3" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64-v3 CPU with v3 instructions. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v3. -+ -+config GENERIC_CPU4 -+ bool "Generic-x86-64-v4" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64 CPU with v4 instructions. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v4. 
-+ +config MNATIVE_INTEL + bool "Intel-Native optimizations autodetected by the compiler" + help @@ -518,133 +505,78 @@ index 2a7279d80460a..6924a0f5f1c26 100644 endchoice config X86_GENERIC -@@ -318,9 +664,17 @@ config X86_INTERNODE_CACHE_SHIFT +@@ -308,6 +615,30 @@ config X86_GENERIC + This is really intended for distributors who need more + generic optimizations. + ++config X86_64_VERSION ++ int "x86-64 compiler ISA level" ++ range 1 3 ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 && GENERIC_CPU ++ help ++ Specify a specific x86-64 compiler ISA level. ++ ++ There are three x86-64 ISA levels that work on top of ++ the x86-64 baseline, namely: x86-64-v2, x86-64-v3, and x86-64-v4. ++ ++ x86-64-v2 brings support for vector instructions up to Streaming SIMD ++ Extensions 4.2 (SSE4.2) and Supplemental Streaming SIMD Extensions 3 ++ (SSSE3), the POPCNT instruction, and CMPXCHG16B. ++ ++ x86-64-v3 adds vector instructions up to AVX2, MOVBE, and additional ++ bit-manipulation instructions. ++ ++ x86-64-v4 is not included since the kernel does not use AVX512 instructions ++ ++ You can find the best version for your CPU by running one of the following: ++ /lib/ld-linux-x86-64.so.2 --help | grep supported ++ /lib64/ld-linux-x86-64.so.2 --help | grep supported ++ + # + # Define implied options from the CPU selection here + config X86_INTERNODE_CACHE_SHIFT +@@ -318,7 +649,7 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU -+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \ -+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ -+ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ -+ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ -+ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ -+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ -+ || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 \ -+ || GENERIC_CPU3 || GENERIC_CPU4 ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD default "4" if MELAN || M486SX || M486 || MGEODEGX1 -- default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX -+ default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \ -+ || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - - config X86_F00F_BUG - def_bool y -@@ -332,15 +686,27 @@ config X86_INVD_BUG + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || 
MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - config X86_ALIGNMENT_16 - def_bool y -- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \ -+ || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 +@@ -336,11 +667,11 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 -+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \ -+ || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ -+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ -+ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ -+ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ -+ || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \ -+ || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ -+ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ -+ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \ -+ || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \ -+ || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ -+ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD # # P6_NOPs are a relatively minor optimization that require a family >= -@@ -356,11 +722,22 @@ config X86_USE_PPRO_CHECKSUM - config X86_P6_NOP - def_bool y - depends on X86_64 -- depends on (MCORE2 || MPENTIUM4 || MPSC) -+ depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || 
MSILVERMONT || MGOLDMONT \ -+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \ -+ || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \ -+ || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \ -+ || MNATIVE_INTEL) - - config X86_TSC - def_bool y -- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 -+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ -+ || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \ -+ || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ -+ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ -+ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \ -+ || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ -+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \ -+ || MNATIVE_INTEL || MNATIVE_AMD) || X86_64 - - config X86_HAVE_PAE - def_bool y -@@ -368,18 +745,37 @@ config X86_HAVE_PAE - - config X86_CMPXCHG64 - def_bool y -- depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 -+ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ -+ || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \ -+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \ -+ || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ -+ || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \ -+ || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ -+ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD - - # this should be set for all -march=.. options where the compiler - # generates cmov. 
- config X86_CMOV - def_bool y -- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) -+ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ -+ || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \ -+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \ -+ || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ -+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ -+ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ -+ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD) - - config X86_MINIMUM_CPU_FAMILY - int - default "64" if X86_64 -- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8) -+ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ -+ || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8 || MK8SSE3 \ -+ || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ -+ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ -+ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ -+ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ -+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \ -+ || MNATIVE_INTEL || MNATIVE_AMD) - default "5" if X86_32 && X86_CMPXCHG64 - default "4" - diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index da8f3caf27815..c873d10df15d0 100644 +index 3419ffa2a350..c778e8a006e2 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile -@@ -152,8 +152,48 @@ else - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) +@@ -152,15 +152,96 @@ else cflags-$(CONFIG_MK8) += -march=k8 cflags-$(CONFIG_MPSC) += -march=nocona -- cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom +- cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic ++ cflags-$(CONFIG_MATOM) += -march=bonnell ++ ifeq ($(CONFIG_X86_64_VERSION),1) ++ cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic ++ rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic ++ else ++ cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64-v$(CONFIG_X86_64_VERSION) ++ rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION) ++ endif + cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 + cflags-$(CONFIG_MK10) += -march=amdfam10 + cflags-$(CONFIG_MBARCELONA) += -march=barcelona @@ -658,10 +590,10 @@ index da8f3caf27815..c873d10df15d0 100644 + cflags-$(CONFIG_MZEN2) += -march=znver2 + cflags-$(CONFIG_MZEN3) += -march=znver3 + cflags-$(CONFIG_MZEN4) += -march=znver4 ++ cflags-$(CONFIG_MZEN5) += -march=znver5 + cflags-$(CONFIG_MNATIVE_INTEL) += -march=native -+ cflags-$(CONFIG_MNATIVE_AMD) += -march=native ++ cflags-$(CONFIG_MNATIVE_AMD) += -march=native -mno-tbm + cflags-$(CONFIG_MATOM) += -march=bonnell -+ cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MNEHALEM) += -march=nehalem + cflags-$(CONFIG_MWESTMERE) += -march=westmere + cflags-$(CONFIG_MSILVERMONT) += -march=silvermont @@ 
-684,14 +616,56 @@ index da8f3caf27815..c873d10df15d0 100644 + cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake + cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake + cflags-$(CONFIG_MEMERALDRAPIDS) += -march=emeraldrapids -+ cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 -+ cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 -+ cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 - cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic KBUILD_CFLAGS += $(cflags-y) + rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 + rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona + rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 +- rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom +- rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic ++ rustflags-$(CONFIG_MK8SSE3) += -Ctarget-cpu=k8-sse3 ++ rustflags-$(CONFIG_MK10) += -Ctarget-cpu=amdfam10 ++ rustflags-$(CONFIG_MBARCELONA) += -Ctarget-cpu=barcelona ++ rustflags-$(CONFIG_MBOBCAT) += -Ctarget-cpu=btver1 ++ rustflags-$(CONFIG_MJAGUAR) += -Ctarget-cpu=btver2 ++ rustflags-$(CONFIG_MBULLDOZER) += -Ctarget-cpu=bdver1 ++ rustflags-$(CONFIG_MPILEDRIVER) += -Ctarget-cpu=bdver2 ++ rustflags-$(CONFIG_MSTEAMROLLER) += -Ctarget-cpu=bdver3 ++ rustflags-$(CONFIG_MEXCAVATOR) += -Ctarget-cpu=bdver4 ++ rustflags-$(CONFIG_MZEN) += -Ctarget-cpu=znver1 ++ rustflags-$(CONFIG_MZEN2) += -Ctarget-cpu=znver2 ++ rustflags-$(CONFIG_MZEN3) += -Ctarget-cpu=znver3 ++ rustflags-$(CONFIG_MZEN4) += -Ctarget-cpu=znver4 ++ rustflags-$(CONFIG_MZEN5) += -Ctarget-cpu=znver5 ++ rustflags-$(CONFIG_MNATIVE_INTEL) += -Ctarget-cpu=native ++ rustflags-$(CONFIG_MNATIVE_AMD) += -Ctarget-cpu=native -mno-tbm ++ rustflags-$(CONFIG_MNEHALEM) += -Ctarget-cpu=nehalem ++ rustflags-$(CONFIG_MWESTMERE) += -Ctarget-cpu=westmere ++ rustflags-$(CONFIG_MSILVERMONT) += -Ctarget-cpu=silvermont ++ rustflags-$(CONFIG_MGOLDMONT) += -Ctarget-cpu=goldmont ++ rustflags-$(CONFIG_MGOLDMONTPLUS) += -Ctarget-cpu=goldmont-plus ++ rustflags-$(CONFIG_MSANDYBRIDGE) += -Ctarget-cpu=sandybridge ++ rustflags-$(CONFIG_MIVYBRIDGE) += -Ctarget-cpu=ivybridge ++ rustflags-$(CONFIG_MHASWELL) += -Ctarget-cpu=haswell ++ rustflags-$(CONFIG_MBROADWELL) += -Ctarget-cpu=broadwell ++ rustflags-$(CONFIG_MSKYLAKE) += -Ctarget-cpu=skylake ++ rustflags-$(CONFIG_MSKYLAKEX) += -Ctarget-cpu=skylake-avx512 ++ rustflags-$(CONFIG_MCANNONLAKE) += -Ctarget-cpu=cannonlake ++ rustflags-$(CONFIG_MICELAKE) += -Ctarget-cpu=icelake-client ++ rustflags-$(CONFIG_MCASCADELAKE) += -Ctarget-cpu=cascadelake ++ rustflags-$(CONFIG_MCOOPERLAKE) += -Ctarget-cpu=cooperlake ++ rustflags-$(CONFIG_MTIGERLAKE) += -Ctarget-cpu=tigerlake ++ rustflags-$(CONFIG_MSAPPHIRERAPIDS) += -Ctarget-cpu=sapphirerapids ++ rustflags-$(CONFIG_MROCKETLAKE) += -Ctarget-cpu=rocketlake ++ rustflags-$(CONFIG_MALDERLAKE) += -Ctarget-cpu=alderlake ++ rustflags-$(CONFIG_MRAPTORLAKE) += -Ctarget-cpu=raptorlake ++ rustflags-$(CONFIG_MMETEORLAKE) += -Ctarget-cpu=meteorlake ++ rustflags-$(CONFIG_MEMERALDRAPIDS) += -Ctarget-cpu=emeraldrapids + KBUILD_RUSTFLAGS += $(rustflags-y) + + KBUILD_CFLAGS += -mno-red-zone diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h -index 75884d2cdec37..02c1386eb653e 100644 +index 75884d2cdec3..f4e29563473d 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,54 @@ @@ -749,7 +723,7 @@ index 75884d2cdec37..02c1386eb653e 100644 #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 -@@ -35,6 +83,32 @@ +@@ -35,6 +83,28 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define 
MODULE_PROC_FAMILY "K8 " @@ -775,13 +749,9 @@ index 75884d2cdec37..02c1386eb653e 100644 +#define MODULE_PROC_FAMILY "ZEN " +#elif defined CONFIG_MZEN2 +#define MODULE_PROC_FAMILY "ZEN2 " -+#elif defined CONFIG_MZEN3 -+#define MODULE_PROC_FAMILY "ZEN3 " -+#elif defined CONFIG_MZEN4 -+#define MODULE_PROC_FAMILY "ZEN4 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE -- -2.43.2 +2.46.2 diff --git a/sys-kernel/gentoo-sources-6.6/0100_sched-fair_multi-llc_select_idle_sibling.patch.skip b/sys-kernel/gentoo-sources-6.6/0100_sched-fair_multi-llc_select_idle_sibling.patch.skip deleted file mode 100644 index 63038a4..0000000 --- a/sys-kernel/gentoo-sources-6.6/0100_sched-fair_multi-llc_select_idle_sibling.patch.skip +++ /dev/null @@ -1,94 +0,0 @@ -From c5214e13ad60bd0022bab45cbac2c9db6bc1e0d4 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 30 May 2023 13:20:46 +0200 -Subject: sched/fair: Multi-LLC select_idle_sibling() - -Tejun reported that when he targets workqueues towards a specific LLC -on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets -significant idle time. - -This is, of course, because of how select_idle_sibling() will not -consider anything outside of the local LLC, and since all these tasks -are short running the periodic idle load balancer is ineffective. - -And while it is good to keep work cache local, it is better to not -have significant idle time. Therefore, have select_idle_sibling() try -other LLCs inside the same node when the local one comes up empty. - -Reported-by: Tejun Heo -Signed-off-by: Peter Zijlstra (Intel) ---- - kernel/sched/fair.c | 37 +++++++++++++++++++++++++++++++++++++ - kernel/sched/features.h | 1 + - 2 files changed, 38 insertions(+) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 48b6f0ca13acc..cd80e30b9d679 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -7027,6 +7027,37 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool - return idle_cpu; - } - -+/* -+ * For the multiple-LLC per node case, make sure to try the other LLC's if the -+ * local LLC comes up empty. -+ */ -+static int -+select_idle_node(struct task_struct *p, struct sched_domain *sd, int target) -+{ -+ struct sched_domain *parent = sd->parent; -+ struct sched_group *sg; -+ -+ /* Make sure to not cross nodes. */ -+ if (!parent || parent->flags & SD_NUMA) -+ return -1; -+ -+ sg = parent->groups; -+ do { -+ int cpu = cpumask_first(sched_group_span(sg)); -+ -+ if (!cpus_share_cache(cpu, target)) { -+ int i = select_idle_cpu(p, per_cpu(sd_llc, cpu), -+ test_idle_cores(cpu), cpu); -+ if ((unsigned)i < nr_cpumask_bits) -+ return i; -+ } -+ -+ sg = sg->next; -+ } while (sg != parent->groups); -+ -+ return -1; -+} -+ - /* - * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which - * the task fits. 
If no CPU is big enough, but there are idle ones, try to -@@ -7199,6 +7230,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) - if ((unsigned)i < nr_cpumask_bits) - return i; - -+ if (sched_feat(SIS_NODE)) { -+ i = select_idle_node(p, sd, target); -+ if ((unsigned)i < nr_cpumask_bits) -+ return i; -+ } -+ - return target; - } - -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..9e390eb82e384 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true) - */ - SCHED_FEAT(SIS_PROP, false) - SCHED_FEAT(SIS_UTIL, true) -+SCHED_FEAT(SIS_NODE, true) - - /* - * Issue a WARN when we do multiple update_rq_clock() calls --- -cgit - diff --git a/sys-kernel/gentoo-sources-6.6/0202-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch b/sys-kernel/gentoo-sources-6.6/0202-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch deleted file mode 100644 index e0fb4ec..0000000 --- a/sys-kernel/gentoo-sources-6.6/0202-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch +++ /dev/null @@ -1,75 +0,0 @@ -From 76f1df5c1a512d1f459678d17c4b78a74d304cc9 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Mon, 20 Mar 2023 18:39:46 +0100 -Subject: ZEN: INTERACTIVE: Tune ondemand governor for interactivity - -4.10: -During some personal testing with the Dolphin emulator, MuQSS has -serious problems scaling its frequencies causing poor performance where -boosting the CPU frequencies would have fixed them. Reducing the -up_threshold to 45 with MuQSS appears to fix the issue, letting the -introduction to "Star Wars: Rogue Leader" run at 100% speed versus about -80% on my test system. - -Also, lets refactor the definitions and include some indentation to help -the reader discern what the scope of all the macros are. - -5.4: -On the last custom kernel benchmark from Phoronix with Xanmod, Michael -configured all the kernels to run using ondemand instead of the kernel's -[default selection][1]. This reminded me that another option outside of -the kernels control is the user's choice to change the cpufreq governor, -for better or for worse. - -In Liquorix, performance is the default governor whether you're running -acpi-cpufreq or intel-pstate. I expect laptop users to install TLP or -LMT to control the power balance on their system, especially when -they're plugged in or on battery. However, it's pretty clear to me a -lot of people would choose ondemand over performance since it's not -obvious it has huge performance ramifications with MuQSS, and ondemand -otherwise is "good enough" for most people. - -Lets codify lower up thresholds for MuQSS to more closely synergize with -its aggressive thread migration behavior. This way when ondemand is -configured, you get sort of a "performance-lite" type of result but with -the power savings you expect when leaving the running system idle. - -[1]: https://www.phoronix.com/scan.php?page=article&item=xanmod-2020-kernel - -5.14: -Although CFS and similar schedulers (BMQ, PDS, and CacULE), reuse a lot -more of mainline scheduling and do a good job of pinning single threaded -tasks to their respective core, there's still applications that -confusingly run steady near 50% and benefit from going full speed or -turbo when they need to run (emulators for more recent consoles come to -mind). - -Drop the up threshold for all non-MuQSS schedulers from 80/95 to 55/60. - -5.15: -Remove MuQSS cpufreq configuration. 
---- - drivers/cpufreq/cpufreq_ondemand.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index c52d19d67557..38d3d2fc9c4e 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -18,10 +18,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (60) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0205-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch b/sys-kernel/gentoo-sources-6.6/0205-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch deleted file mode 100644 index c78aa89..0000000 --- a/sys-kernel/gentoo-sources-6.6/0205-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 915bbf3cf328160cb27c7b6f98ec4958f0e537e7 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Mon, 20 Mar 2023 18:45:37 +0100 -Subject: ZEN: cpufreq: Remove schedutil dependency on Intel/AMD P-State - drivers - -Although both P-State drivers depend on schedutil in Kconfig, both code -bases do not use any schedutil code. This arbitrarily enables schedutil -when unwanted in some configurations. ---- - drivers/cpufreq/Kconfig.x86 | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 -index 00476e94db90..c3a219218fac 100644 ---- a/drivers/cpufreq/Kconfig.x86 -+++ b/drivers/cpufreq/Kconfig.x86 -@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE - select ACPI_PROCESSOR if ACPI - select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO - select CPU_FREQ_GOV_PERFORMANCE -- select CPU_FREQ_GOV_SCHEDUTIL if SMP - help - This driver provides a P state for Intel core processors. 
- The driver implements an internal governor and will become -@@ -39,7 +38,6 @@ config X86_AMD_PSTATE - depends on X86 && ACPI - select ACPI_PROCESSOR - select ACPI_CPPC_LIB if X86_64 -- select CPU_FREQ_GOV_SCHEDUTIL if SMP - help - This driver adds a CPUFreq driver which utilizes a fine grain - processor performance frequency control range instead of legacy --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0210-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch b/sys-kernel/gentoo-sources-6.6/0210-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch deleted file mode 100644 index 100bbd9..0000000 --- a/sys-kernel/gentoo-sources-6.6/0210-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 015323f4b5e73a7076b5c60bd79c7cc480f65f37 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Tue, 21 Mar 2023 00:19:25 +0100 -Subject: ZEN: Restore CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3 - -This reverts a6036a4 (kbuild: drop -support for CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3), removes the -dependency on CONFIG_ARC and adds RUSTFLAGS ---- - Makefile | 3 +++ - init/Kconfig | 6 ++++++ - 2 files changed, 9 insertions(+) - -diff --git a/Makefile b/Makefile -index 3f6628780eb2..64c2842330db 100644 ---- a/Makefile -+++ b/Makefile -@@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) - ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE - KBUILD_CFLAGS += -O2 - KBUILD_RUSTFLAGS += -Copt-level=2 -+else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 -+KBUILD_CFLAGS += -O3 -+KBUILD_RUSTFLAGS += -Copt-level=3 - else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE - KBUILD_CFLAGS += -Os - KBUILD_RUSTFLAGS += -Copt-level=s -diff --git a/init/Kconfig b/init/Kconfig -index 44e90b28a30f..6731063983ec 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1420,6 +1420,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE - with the "-O2" compiler flag for best performance and most - helpful compile-time warnings. - -+config CC_OPTIMIZE_FOR_PERFORMANCE_O3 -+ bool "Optimize more for performance (-O3)" -+ help -+ Choosing this option will pass "-O3" to your compiler to optimize -+ the kernel yet more for performance. -+ - config CC_OPTIMIZE_FOR_SIZE - bool "Optimize for size (-Os)" - help --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0211-ZEN-arch-x86-Disable-AVX2-and-tree-vectorization.patch b/sys-kernel/gentoo-sources-6.6/0211-ZEN-arch-x86-Disable-AVX2-and-tree-vectorization.patch deleted file mode 100644 index 357ed7b..0000000 --- a/sys-kernel/gentoo-sources-6.6/0211-ZEN-arch-x86-Disable-AVX2-and-tree-vectorization.patch +++ /dev/null @@ -1,29 +0,0 @@ -From ced477387463f385e2a0e01824ae4d512fe5b323 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Tue, 21 Mar 2023 00:36:58 +0100 -Subject: ZEN: arch/x86: Disable AVX2 and tree vectorization - -From ClearLinux's own patches, disable both AVX2 and tree vectorization -when using O3 and higher than generic amd64 architectures. 
- -Source: https://github.com/clearlinux-pkgs/linux/blob/main/0133-novector.patch ---- - arch/x86/Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index 73ed982d4100..5d687a64d710 100644 ---- a/arch/x86/Makefile -+++ b/arch/x86/Makefile -@@ -67,7 +67,7 @@ export BITS - # - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 - # --KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -fno-tree-vectorize - KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 - - ifeq ($(CONFIG_X86_KERNEL_IBT),y) --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0214-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch b/sys-kernel/gentoo-sources-6.6/0214-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch deleted file mode 100644 index ddee93a..0000000 --- a/sys-kernel/gentoo-sources-6.6/0214-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 19847222d00356eb18a22008b1e9c42237bef979 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Tue, 21 Mar 2023 00:35:39 +0100 -Subject: ZEN: INTERACTIVE: dm-crypt: Disable workqueues for crypto ops - -Queueing in dm-crypt for crypto operations reduces performance on modern -systems. As discussed in an article from Cloudflare, they discovered -that queuing was introduced because the crypto subsystem used to be -synchronous. Since it's now asynchronous, we get double queueing when -using the subsystem through dm-crypt. This is obviously undesirable and -reduces throughput and increases latency. - -Disable queueing when using our Zen Interactive configuration. - -Fixes: zen-kernel#282 ---- - drivers/md/dm-crypt.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 2653516bcdef..7fac0f569cef 100644 ---- a/drivers/md/dm-crypt.c -+++ b/drivers/md/dm-crypt.c -@@ -3207,6 +3207,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) - goto bad; - } - -+ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); -+ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); -+ - ret = crypt_ctr_cipher(ti, argv[0], argv[1]); - if (ret < 0) - goto bad; --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0215-ZEN-Add-VHBA-driver.patch b/sys-kernel/gentoo-sources-6.6/0215-ZEN-Add-VHBA-driver.patch deleted file mode 100644 index 0ae59fa..0000000 --- a/sys-kernel/gentoo-sources-6.6/0215-ZEN-Add-VHBA-driver.patch +++ /dev/null @@ -1,1199 +0,0 @@ -From e707ce895085656b53783187aaacbb89867090de Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Tue, 21 Mar 2023 00:38:15 +0100 -Subject: ZEN: Add VHBA driver - -remote https://github.com/cdemu/cdemu -tag vhba-module-20211218 ---- - drivers/scsi/Kconfig | 2 + - drivers/scsi/Makefile | 1 + - drivers/scsi/vhba/Kconfig | 9 + - drivers/scsi/vhba/Makefile | 4 + - drivers/scsi/vhba/vhba.c | 1124 ++++++++++++++++++++++++++++++++++++ - 5 files changed, 1140 insertions(+) - create mode 100644 drivers/scsi/vhba/Kconfig - create mode 100644 drivers/scsi/vhba/Makefile - create mode 100644 drivers/scsi/vhba/vhba.c - -diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig -index 03e71e3d5e5b..d4c6603e238b 100644 ---- a/drivers/scsi/Kconfig -+++ b/drivers/scsi/Kconfig -@@ -1524,4 +1524,6 @@ endif # SCSI_LOWLEVEL - - source "drivers/scsi/device_handler/Kconfig" - -+source "drivers/scsi/vhba/Kconfig" -+ - endmenu -diff --git 
a/drivers/scsi/Makefile b/drivers/scsi/Makefile -index f055bfd54a68..e16e95f2c3de 100644 ---- a/drivers/scsi/Makefile -+++ b/drivers/scsi/Makefile -@@ -151,6 +151,7 @@ obj-$(CONFIG_CHR_DEV_SCH) += ch.o - obj-$(CONFIG_SCSI_ENCLOSURE) += ses.o - - obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ -+obj-$(CONFIG_VHBA) += vhba/ - - # This goes last, so that "real" scsi devices probe earlier - obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o -diff --git a/drivers/scsi/vhba/Kconfig b/drivers/scsi/vhba/Kconfig -new file mode 100644 -index 000000000000..e70a381fe3df ---- /dev/null -+++ b/drivers/scsi/vhba/Kconfig -@@ -0,0 +1,9 @@ -+config VHBA -+ tristate "Virtual (SCSI) Host Bus Adapter" -+ depends on SCSI -+ help -+ This is the in-kernel part of CDEmu, a CD/DVD-ROM device -+ emulator. -+ -+ This driver can also be built as a module. If so, the module -+ will be called vhba. -diff --git a/drivers/scsi/vhba/Makefile b/drivers/scsi/vhba/Makefile -new file mode 100644 -index 000000000000..ad8b7c6442af ---- /dev/null -+++ b/drivers/scsi/vhba/Makefile -@@ -0,0 +1,4 @@ -+VHBA_VERSION := 20211218 -+ -+obj-$(CONFIG_VHBA) += vhba.o -+ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror -diff --git a/drivers/scsi/vhba/vhba.c b/drivers/scsi/vhba/vhba.c -new file mode 100644 -index 000000000000..676af31c33ad ---- /dev/null -+++ b/drivers/scsi/vhba/vhba.c -@@ -0,0 +1,1124 @@ -+/* -+ * vhba.c -+ * -+ * Copyright (C) 2007-2012 Chia-I Wu -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License along -+ * with this program; if not, write to the Free Software Foundation, Inc., -+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ */ -+ -+#define pr_fmt(fmt) "vhba: " fmt -+ -+#include -+ -+#include -+#include -+#include -+#include -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -+#include -+#else -+#include -+#endif -+#include -+#include -+#include -+#include -+#include -+#ifdef CONFIG_COMPAT -+#include -+#endif -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+MODULE_AUTHOR("Chia-I Wu"); -+MODULE_VERSION(VHBA_VERSION); -+MODULE_DESCRIPTION("Virtual SCSI HBA"); -+MODULE_LICENSE("GPL"); -+ -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) -+#define sdev_dbg(sdev, fmt, a...) \ -+ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) -+#define scmd_dbg(scmd, fmt, a...) 
\ -+ dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a) -+#endif -+ -+#define VHBA_MAX_SECTORS_PER_IO 256 -+#define VHBA_MAX_BUS 16 -+#define VHBA_MAX_ID 16 -+#define VHBA_MAX_DEVICES (VHBA_MAX_BUS * (VHBA_MAX_ID-1)) -+#define VHBA_KBUF_SIZE PAGE_SIZE -+ -+#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL) -+#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL) -+ -+ -+static int vhba_can_queue = 32; -+module_param_named(can_queue, vhba_can_queue, int, 0); -+ -+ -+enum vhba_req_state { -+ VHBA_REQ_FREE, -+ VHBA_REQ_PENDING, -+ VHBA_REQ_READING, -+ VHBA_REQ_SENT, -+ VHBA_REQ_WRITING, -+}; -+ -+struct vhba_command { -+ struct scsi_cmnd *cmd; -+ /* metatags are per-host. not to be confused with -+ queue tags that are usually per-lun */ -+ unsigned long metatag; -+ int status; -+ struct list_head entry; -+}; -+ -+struct vhba_device { -+ unsigned int num; -+ spinlock_t cmd_lock; -+ struct list_head cmd_list; -+ wait_queue_head_t cmd_wq; -+ atomic_t refcnt; -+ -+ unsigned char *kbuf; -+ size_t kbuf_size; -+}; -+ -+struct vhba_host { -+ struct Scsi_Host *shost; -+ spinlock_t cmd_lock; -+ int cmd_next; -+ struct vhba_command *commands; -+ spinlock_t dev_lock; -+ struct vhba_device *devices[VHBA_MAX_DEVICES]; -+ int num_devices; -+ DECLARE_BITMAP(chgmap, VHBA_MAX_DEVICES); -+ int chgtype[VHBA_MAX_DEVICES]; -+ struct work_struct scan_devices; -+}; -+ -+#define MAX_COMMAND_SIZE 16 -+ -+struct vhba_request { -+ __u32 metatag; -+ __u32 lun; -+ __u8 cdb[MAX_COMMAND_SIZE]; -+ __u8 cdb_len; -+ __u32 data_len; -+}; -+ -+struct vhba_response { -+ __u32 metatag; -+ __u32 status; -+ __u32 data_len; -+}; -+ -+ -+ -+struct vhba_command *vhba_alloc_command (void); -+void vhba_free_command (struct vhba_command *vcmd); -+ -+static struct platform_device vhba_platform_device; -+ -+ -+ -+/* These functions define a symmetric 1:1 mapping between device numbers and -+ the bus and id. We have reserved the last id per bus for the host itself. 
*/ -+void devnum_to_bus_and_id(unsigned int devnum, unsigned int *bus, unsigned int *id) -+{ -+ *bus = devnum / (VHBA_MAX_ID-1); -+ *id = devnum % (VHBA_MAX_ID-1); -+} -+ -+unsigned int bus_and_id_to_devnum(unsigned int bus, unsigned int id) -+{ -+ return (bus * (VHBA_MAX_ID-1)) + id; -+} -+ -+struct vhba_device *vhba_device_alloc (void) -+{ -+ struct vhba_device *vdev; -+ -+ vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL); -+ if (!vdev) { -+ return NULL; -+ } -+ -+ spin_lock_init(&vdev->cmd_lock); -+ INIT_LIST_HEAD(&vdev->cmd_list); -+ init_waitqueue_head(&vdev->cmd_wq); -+ atomic_set(&vdev->refcnt, 1); -+ -+ vdev->kbuf = NULL; -+ vdev->kbuf_size = 0; -+ -+ return vdev; -+} -+ -+void vhba_device_put (struct vhba_device *vdev) -+{ -+ if (atomic_dec_and_test(&vdev->refcnt)) { -+ kfree(vdev); -+ } -+} -+ -+struct vhba_device *vhba_device_get (struct vhba_device *vdev) -+{ -+ atomic_inc(&vdev->refcnt); -+ -+ return vdev; -+} -+ -+int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd) -+{ -+ struct vhba_host *vhost; -+ struct vhba_command *vcmd; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ vcmd = vhba_alloc_command(); -+ if (!vcmd) { -+ return SCSI_MLQUEUE_HOST_BUSY; -+ } -+ -+ vcmd->cmd = cmd; -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+ vcmd->metatag = scsi_cmd_to_rq(vcmd->cmd)->tag; -+#else -+ vcmd->metatag = vcmd->cmd->request->tag; -+#endif -+ list_add_tail(&vcmd->entry, &vdev->cmd_list); -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ wake_up_interruptible(&vdev->cmd_wq); -+ -+ return 0; -+} -+ -+int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd) -+{ -+ struct vhba_command *vcmd; -+ int retval; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { -+ if (vcmd->cmd == cmd) { -+ list_del_init(&vcmd->entry); -+ break; -+ } -+ } -+ -+ /* command not found */ -+ if (&vcmd->entry == &vdev->cmd_list) { -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ return SUCCESS; -+ } -+ -+ while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) { -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ scmd_dbg(cmd, "wait for I/O before aborting\n"); -+ schedule_timeout(1); -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ } -+ -+ retval = (vcmd->status == VHBA_REQ_SENT) ? 
FAILED : SUCCESS; -+ -+ vhba_free_command(vcmd); -+ -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ return retval; -+} -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) -+int vhba_slave_alloc(struct scsi_device *sdev) -+{ -+ struct Scsi_Host *shost = sdev->host; -+ -+ sdev_dbg(sdev, "enabling tagging (queue depth: %i).\n", sdev->queue_depth); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) -+ if (!shost_use_blk_mq(shost) && shost->bqt) { -+#else -+ if (shost->bqt) { -+#endif -+ blk_queue_init_tags(sdev->request_queue, sdev->queue_depth, shost->bqt); -+ } -+ scsi_adjust_queue_depth(sdev, 0, sdev->queue_depth); -+ -+ return 0; -+} -+#endif -+ -+void vhba_scan_devices_add (struct vhba_host *vhost, int bus, int id) -+{ -+ struct scsi_device *sdev; -+ -+ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); -+ if (!sdev) { -+ scsi_add_device(vhost->shost, bus, id, 0); -+ } else { -+ dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device %d:%d:0!\n", bus, id); -+ scsi_device_put(sdev); -+ } -+} -+ -+void vhba_scan_devices_remove (struct vhba_host *vhost, int bus, int id) -+{ -+ struct scsi_device *sdev; -+ -+ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); -+ if (sdev) { -+ scsi_remove_device(sdev); -+ scsi_device_put(sdev); -+ } else { -+ dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device %d:%d:0!\n", bus, id); -+ } -+} -+ -+void vhba_scan_devices (struct work_struct *work) -+{ -+ struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices); -+ unsigned long flags; -+ int change, exists; -+ unsigned int devnum; -+ unsigned int bus, id; -+ -+ for (;;) { -+ spin_lock_irqsave(&vhost->dev_lock, flags); -+ -+ devnum = find_first_bit(vhost->chgmap, VHBA_MAX_DEVICES); -+ if (devnum >= VHBA_MAX_DEVICES) { -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ break; -+ } -+ change = vhost->chgtype[devnum]; -+ exists = vhost->devices[devnum] != NULL; -+ -+ vhost->chgtype[devnum] = 0; -+ clear_bit(devnum, vhost->chgmap); -+ -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ -+ devnum_to_bus_and_id(devnum, &bus, &id); -+ -+ if (change < 0) { -+ dev_dbg(&vhost->shost->shost_gendev, "trying to remove target %d:%d:0\n", bus, id); -+ vhba_scan_devices_remove(vhost, bus, id); -+ } else if (change > 0) { -+ dev_dbg(&vhost->shost->shost_gendev, "trying to add target %d:%d:0\n", bus, id); -+ vhba_scan_devices_add(vhost, bus, id); -+ } else { -+ /* quick sequence of add/remove or remove/add; we determine -+ which one it was by checking if device structure exists */ -+ if (exists) { -+ /* remove followed by add: remove and (re)add */ -+ dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target %d:%d:0\n", bus, id); -+ vhba_scan_devices_remove(vhost, bus, id); -+ vhba_scan_devices_add(vhost, bus, id); -+ } else { -+ /* add followed by remove: no-op */ -+ dev_dbg(&vhost->shost->shost_gendev, "no-op for target %d:%d:0\n", bus, id); -+ } -+ } -+ } -+} -+ -+int vhba_add_device (struct vhba_device *vdev) -+{ -+ struct vhba_host *vhost; -+ unsigned int devnum; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ vhba_device_get(vdev); -+ -+ spin_lock_irqsave(&vhost->dev_lock, flags); -+ if (vhost->num_devices >= VHBA_MAX_DEVICES) { -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ vhba_device_put(vdev); -+ return -EBUSY; -+ } -+ -+ for (devnum = 0; devnum < VHBA_MAX_DEVICES; devnum++) { -+ if (vhost->devices[devnum] == NULL) { -+ vdev->num = devnum; -+ vhost->devices[devnum] = vdev; -+ 
vhost->num_devices++; -+ set_bit(devnum, vhost->chgmap); -+ vhost->chgtype[devnum]++; -+ break; -+ } -+ } -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ -+ schedule_work(&vhost->scan_devices); -+ -+ return 0; -+} -+ -+int vhba_remove_device (struct vhba_device *vdev) -+{ -+ struct vhba_host *vhost; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ spin_lock_irqsave(&vhost->dev_lock, flags); -+ set_bit(vdev->num, vhost->chgmap); -+ vhost->chgtype[vdev->num]--; -+ vhost->devices[vdev->num] = NULL; -+ vhost->num_devices--; -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ -+ vhba_device_put(vdev); -+ -+ schedule_work(&vhost->scan_devices); -+ -+ return 0; -+} -+ -+struct vhba_device *vhba_lookup_device (int devnum) -+{ -+ struct vhba_host *vhost; -+ struct vhba_device *vdev = NULL; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ if (likely(devnum < VHBA_MAX_DEVICES)) { -+ spin_lock_irqsave(&vhost->dev_lock, flags); -+ vdev = vhost->devices[devnum]; -+ if (vdev) { -+ vdev = vhba_device_get(vdev); -+ } -+ -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ } -+ -+ return vdev; -+} -+ -+struct vhba_command *vhba_alloc_command (void) -+{ -+ struct vhba_host *vhost; -+ struct vhba_command *vcmd; -+ unsigned long flags; -+ int i; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ spin_lock_irqsave(&vhost->cmd_lock, flags); -+ -+ vcmd = vhost->commands + vhost->cmd_next++; -+ if (vcmd->status != VHBA_REQ_FREE) { -+ for (i = 0; i < vhba_can_queue; i++) { -+ vcmd = vhost->commands + i; -+ -+ if (vcmd->status == VHBA_REQ_FREE) { -+ vhost->cmd_next = i + 1; -+ break; -+ } -+ } -+ -+ if (i == vhba_can_queue) { -+ vcmd = NULL; -+ } -+ } -+ -+ if (vcmd) { -+ vcmd->status = VHBA_REQ_PENDING; -+ } -+ -+ vhost->cmd_next %= vhba_can_queue; -+ -+ spin_unlock_irqrestore(&vhost->cmd_lock, flags); -+ -+ return vcmd; -+} -+ -+void vhba_free_command (struct vhba_command *vcmd) -+{ -+ struct vhba_host *vhost; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ spin_lock_irqsave(&vhost->cmd_lock, flags); -+ vcmd->status = VHBA_REQ_FREE; -+ spin_unlock_irqrestore(&vhost->cmd_lock, flags); -+} -+ -+int vhba_queuecommand (struct Scsi_Host *shost, struct scsi_cmnd *cmd) -+{ -+ struct vhba_device *vdev; -+ int retval; -+ unsigned int devnum; -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+ scmd_dbg(cmd, "queue %p tag %i\n", cmd, scsi_cmd_to_rq(cmd)->tag); -+#else -+ scmd_dbg(cmd, "queue %p tag %i\n", cmd, cmd->request->tag); -+#endif -+ -+ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); -+ vdev = vhba_lookup_device(devnum); -+ if (!vdev) { -+ scmd_dbg(cmd, "no such device\n"); -+ -+ cmd->result = DID_NO_CONNECT << 16; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -+ scsi_done(cmd); -+#else -+ cmd->scsi_done(cmd); -+#endif -+ -+ return 0; -+ } -+ -+ retval = vhba_device_queue(vdev, cmd); -+ -+ vhba_device_put(vdev); -+ -+ return retval; -+} -+ -+int vhba_abort (struct scsi_cmnd *cmd) -+{ -+ struct vhba_device *vdev; -+ int retval = SUCCESS; -+ unsigned int devnum; -+ -+ scmd_dbg(cmd, "abort %p\n", cmd); -+ -+ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); -+ vdev = vhba_lookup_device(devnum); -+ if (vdev) { -+ retval = vhba_device_dequeue(vdev, cmd); -+ vhba_device_put(vdev); -+ } else { -+ cmd->result = DID_NO_CONNECT << 16; -+ } -+ -+ return retval; -+} -+ -+static struct scsi_host_template vhba_template = { -+ 
.module = THIS_MODULE, -+ .name = "vhba", -+ .proc_name = "vhba", -+ .queuecommand = vhba_queuecommand, -+ .eh_abort_handler = vhba_abort, -+ .this_id = -1, -+ .max_sectors = VHBA_MAX_SECTORS_PER_IO, -+ .sg_tablesize = 256, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) -+ .slave_alloc = vhba_slave_alloc, -+#endif -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) -+ .tag_alloc_policy = BLK_TAG_ALLOC_RR, -+#endif -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) -+ .use_blk_tags = 1, -+#endif -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) -+ .max_segment_size = VHBA_KBUF_SIZE, -+#endif -+}; -+ -+ssize_t do_request (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, char __user *buf, size_t buf_len) -+{ -+ struct vhba_request vreq; -+ ssize_t ret; -+ -+ scmd_dbg(cmd, "request %lu (%p), cdb 0x%x, bufflen %d, sg count %d\n", -+ metatag, cmd, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd)); -+ -+ ret = sizeof(vreq); -+ if (DATA_TO_DEVICE(cmd->sc_data_direction)) { -+ ret += scsi_bufflen(cmd); -+ } -+ -+ if (ret > buf_len) { -+ scmd_dbg(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret); -+ return -EIO; -+ } -+ -+ vreq.metatag = metatag; -+ vreq.lun = cmd->device->lun; -+ memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE); -+ vreq.cdb_len = cmd->cmd_len; -+ vreq.data_len = scsi_bufflen(cmd); -+ -+ if (copy_to_user(buf, &vreq, sizeof(vreq))) { -+ return -EFAULT; -+ } -+ -+ if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) { -+ buf += sizeof(vreq); -+ -+ if (scsi_sg_count(cmd)) { -+ unsigned char *kaddr, *uaddr; -+ struct scatterlist *sglist = scsi_sglist(cmd); -+ struct scatterlist *sg; -+ int i; -+ -+ uaddr = (unsigned char *) buf; -+ -+ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { -+ size_t len = sg->length; -+ -+ if (len > vdev->kbuf_size) { -+ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); -+ len = vdev->kbuf_size; -+ } -+ -+ kaddr = kmap_atomic(sg_page(sg)); -+ memcpy(vdev->kbuf, kaddr + sg->offset, len); -+ kunmap_atomic(kaddr); -+ -+ if (copy_to_user(uaddr, vdev->kbuf, len)) { -+ return -EFAULT; -+ } -+ uaddr += len; -+ } -+ } else { -+ if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) { -+ return -EFAULT; -+ } -+ } -+ } -+ -+ return ret; -+} -+ -+ssize_t do_response (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res) -+{ -+ ssize_t ret = 0; -+ -+ scmd_dbg(cmd, "response %lu (%p), status %x, data len %d, sg count %d\n", -+ metatag, cmd, res->status, res->data_len, scsi_sg_count(cmd)); -+ -+ if (res->status) { -+ if (res->data_len > SCSI_SENSE_BUFFERSIZE) { -+ scmd_dbg(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len); -+ res->data_len = SCSI_SENSE_BUFFERSIZE; -+ } -+ -+ if (copy_from_user(cmd->sense_buffer, buf, res->data_len)) { -+ return -EFAULT; -+ } -+ -+ cmd->result = res->status; -+ -+ ret += res->data_len; -+ } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) { -+ size_t to_read; -+ -+ if (res->data_len > scsi_bufflen(cmd)) { -+ scmd_dbg(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len); -+ res->data_len = scsi_bufflen(cmd); -+ } -+ -+ to_read = res->data_len; -+ -+ if (scsi_sg_count(cmd)) { -+ unsigned char *kaddr, *uaddr; -+ struct scatterlist *sglist = scsi_sglist(cmd); -+ struct scatterlist *sg; -+ int i; -+ -+ uaddr = (unsigned char *)buf; -+ -+ for_each_sg(sglist, sg, 
scsi_sg_count(cmd), i) { -+ size_t len = (sg->length < to_read) ? sg->length : to_read; -+ -+ if (len > vdev->kbuf_size) { -+ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); -+ len = vdev->kbuf_size; -+ } -+ -+ if (copy_from_user(vdev->kbuf, uaddr, len)) { -+ return -EFAULT; -+ } -+ uaddr += len; -+ -+ kaddr = kmap_atomic(sg_page(sg)); -+ memcpy(kaddr + sg->offset, vdev->kbuf, len); -+ kunmap_atomic(kaddr); -+ -+ to_read -= len; -+ if (to_read == 0) { -+ break; -+ } -+ } -+ } else { -+ if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) { -+ return -EFAULT; -+ } -+ -+ to_read -= res->data_len; -+ } -+ -+ scsi_set_resid(cmd, to_read); -+ -+ ret += res->data_len - to_read; -+ } -+ -+ return ret; -+} -+ -+struct vhba_command *next_command (struct vhba_device *vdev) -+{ -+ struct vhba_command *vcmd; -+ -+ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { -+ if (vcmd->status == VHBA_REQ_PENDING) { -+ break; -+ } -+ } -+ -+ if (&vcmd->entry == &vdev->cmd_list) { -+ vcmd = NULL; -+ } -+ -+ return vcmd; -+} -+ -+struct vhba_command *match_command (struct vhba_device *vdev, __u32 metatag) -+{ -+ struct vhba_command *vcmd; -+ -+ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { -+ if (vcmd->metatag == metatag) { -+ break; -+ } -+ } -+ -+ if (&vcmd->entry == &vdev->cmd_list) { -+ vcmd = NULL; -+ } -+ -+ return vcmd; -+} -+ -+struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags) -+{ -+ struct vhba_command *vcmd; -+ DEFINE_WAIT(wait); -+ -+ while (!(vcmd = next_command(vdev))) { -+ if (signal_pending(current)) { -+ break; -+ } -+ -+ prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE); -+ -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ schedule(); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ } -+ -+ finish_wait(&vdev->cmd_wq, &wait); -+ if (vcmd) { -+ vcmd->status = VHBA_REQ_READING; -+ } -+ -+ return vcmd; -+} -+ -+ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset) -+{ -+ struct vhba_device *vdev; -+ struct vhba_command *vcmd; -+ ssize_t ret; -+ unsigned long flags; -+ -+ vdev = file->private_data; -+ -+ /* Get next command */ -+ if (file->f_flags & O_NONBLOCK) { -+ /* Non-blocking variant */ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ vcmd = next_command(vdev); -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ if (!vcmd) { -+ return -EWOULDBLOCK; -+ } -+ } else { -+ /* Blocking variant */ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ vcmd = wait_command(vdev, flags); -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ if (!vcmd) { -+ return -ERESTARTSYS; -+ } -+ } -+ -+ ret = do_request(vdev, vcmd->metatag, vcmd->cmd, buf, buf_len); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ if (ret >= 0) { -+ vcmd->status = VHBA_REQ_SENT; -+ *offset += ret; -+ } else { -+ vcmd->status = VHBA_REQ_PENDING; -+ } -+ -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ return ret; -+} -+ -+ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset) -+{ -+ struct vhba_device *vdev; -+ struct vhba_command *vcmd; -+ struct vhba_response res; -+ ssize_t ret; -+ unsigned long flags; -+ -+ if (buf_len < sizeof(res)) { -+ return -EIO; -+ } -+ -+ if (copy_from_user(&res, buf, sizeof(res))) { -+ return -EFAULT; -+ } -+ -+ vdev = file->private_data; -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ vcmd = match_command(vdev, res.metatag); -+ if (!vcmd || vcmd->status != VHBA_REQ_SENT) { -+ 
spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ pr_debug("ctl dev #%u not expecting response\n", vdev->num); -+ return -EIO; -+ } -+ vcmd->status = VHBA_REQ_WRITING; -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ ret = do_response(vdev, vcmd->metatag, vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ if (ret >= 0) { -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -+ scsi_done(vcmd->cmd); -+#else -+ vcmd->cmd->scsi_done(vcmd->cmd); -+#endif -+ ret += sizeof(res); -+ -+ /* don't compete with vhba_device_dequeue */ -+ if (!list_empty(&vcmd->entry)) { -+ list_del_init(&vcmd->entry); -+ vhba_free_command(vcmd); -+ } -+ } else { -+ vcmd->status = VHBA_REQ_SENT; -+ } -+ -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ return ret; -+} -+ -+long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg) -+{ -+ struct vhba_device *vdev = file->private_data; -+ struct vhba_host *vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ switch (cmd) { -+ case 0xBEEF001: { -+ unsigned int ident[4]; /* host, channel, id, lun */ -+ -+ ident[0] = vhost->shost->host_no; -+ devnum_to_bus_and_id(vdev->num, &ident[1], &ident[2]); -+ ident[3] = 0; /* lun */ -+ -+ if (copy_to_user((void *) arg, ident, sizeof(ident))) { -+ return -EFAULT; -+ } -+ -+ return 0; -+ } -+ case 0xBEEF002: { -+ unsigned int devnum = vdev->num; -+ -+ if (copy_to_user((void *) arg, &devnum, sizeof(devnum))) { -+ return -EFAULT; -+ } -+ -+ return 0; -+ } -+ } -+ -+ return -ENOTTY; -+} -+ -+#ifdef CONFIG_COMPAT -+long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg) -+{ -+ unsigned long compat_arg = (unsigned long)compat_ptr(arg); -+ return vhba_ctl_ioctl(file, cmd, compat_arg); -+} -+#endif -+ -+unsigned int vhba_ctl_poll (struct file *file, poll_table *wait) -+{ -+ struct vhba_device *vdev = file->private_data; -+ unsigned int mask = 0; -+ unsigned long flags; -+ -+ poll_wait(file, &vdev->cmd_wq, wait); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ if (next_command(vdev)) { -+ mask |= POLLIN | POLLRDNORM; -+ } -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ return mask; -+} -+ -+int vhba_ctl_open (struct inode *inode, struct file *file) -+{ -+ struct vhba_device *vdev; -+ int retval; -+ -+ pr_debug("ctl dev open\n"); -+ -+ /* check if vhba is probed */ -+ if (!platform_get_drvdata(&vhba_platform_device)) { -+ return -ENODEV; -+ } -+ -+ vdev = vhba_device_alloc(); -+ if (!vdev) { -+ return -ENOMEM; -+ } -+ -+ vdev->kbuf_size = VHBA_KBUF_SIZE; -+ vdev->kbuf = kzalloc(vdev->kbuf_size, GFP_KERNEL); -+ if (!vdev->kbuf) { -+ return -ENOMEM; -+ } -+ -+ if (!(retval = vhba_add_device(vdev))) { -+ file->private_data = vdev; -+ } -+ -+ vhba_device_put(vdev); -+ -+ return retval; -+} -+ -+int vhba_ctl_release (struct inode *inode, struct file *file) -+{ -+ struct vhba_device *vdev; -+ struct vhba_command *vcmd; -+ unsigned long flags; -+ -+ vdev = file->private_data; -+ -+ pr_debug("ctl dev release\n"); -+ -+ vhba_device_get(vdev); -+ vhba_remove_device(vdev); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { -+ WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING); -+ -+ scmd_dbg(vcmd->cmd, "device released with command %lu (%p)\n", vcmd->metatag, vcmd->cmd); -+ vcmd->cmd->result = DID_NO_CONNECT << 16; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -+ scsi_done(vcmd->cmd); -+#else -+ vcmd->cmd->scsi_done(vcmd->cmd); 
-+#endif -+ vhba_free_command(vcmd); -+ } -+ INIT_LIST_HEAD(&vdev->cmd_list); -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ kfree(vdev->kbuf); -+ vdev->kbuf = NULL; -+ -+ vhba_device_put(vdev); -+ -+ return 0; -+} -+ -+static struct file_operations vhba_ctl_fops = { -+ .owner = THIS_MODULE, -+ .open = vhba_ctl_open, -+ .release = vhba_ctl_release, -+ .read = vhba_ctl_read, -+ .write = vhba_ctl_write, -+ .poll = vhba_ctl_poll, -+ .unlocked_ioctl = vhba_ctl_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = vhba_ctl_compat_ioctl, -+#endif -+}; -+ -+static struct miscdevice vhba_miscdev = { -+ .minor = MISC_DYNAMIC_MINOR, -+ .name = "vhba_ctl", -+ .fops = &vhba_ctl_fops, -+}; -+ -+int vhba_probe (struct platform_device *pdev) -+{ -+ struct Scsi_Host *shost; -+ struct vhba_host *vhost; -+ int i; -+ -+ vhba_can_queue = clamp(vhba_can_queue, 1, 256); -+ -+ shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host)); -+ if (!shost) { -+ return -ENOMEM; -+ } -+ -+ shost->max_channel = VHBA_MAX_BUS-1; -+ shost->max_id = VHBA_MAX_ID; -+ /* we don't support lun > 0 */ -+ shost->max_lun = 1; -+ shost->max_cmd_len = MAX_COMMAND_SIZE; -+ shost->can_queue = vhba_can_queue; -+ shost->cmd_per_lun = vhba_can_queue; -+ -+ vhost = (struct vhba_host *)shost->hostdata; -+ memset(vhost, 0, sizeof(struct vhba_host)); -+ -+ vhost->shost = shost; -+ vhost->num_devices = 0; -+ spin_lock_init(&vhost->dev_lock); -+ spin_lock_init(&vhost->cmd_lock); -+ INIT_WORK(&vhost->scan_devices, vhba_scan_devices); -+ vhost->cmd_next = 0; -+ vhost->commands = kzalloc(vhba_can_queue * sizeof(struct vhba_command), GFP_KERNEL); -+ if (!vhost->commands) { -+ return -ENOMEM; -+ } -+ -+ for (i = 0; i < vhba_can_queue; i++) { -+ vhost->commands[i].status = VHBA_REQ_FREE; -+ } -+ -+ platform_set_drvdata(pdev, vhost); -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) -+ i = scsi_init_shared_tag_map(shost, vhba_can_queue); -+ if (i) return i; -+#endif -+ -+ if (scsi_add_host(shost, &pdev->dev)) { -+ scsi_host_put(shost); -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+int vhba_remove (struct platform_device *pdev) -+{ -+ struct vhba_host *vhost; -+ struct Scsi_Host *shost; -+ -+ vhost = platform_get_drvdata(pdev); -+ shost = vhost->shost; -+ -+ scsi_remove_host(shost); -+ scsi_host_put(shost); -+ -+ kfree(vhost->commands); -+ -+ return 0; -+} -+ -+void vhba_release (struct device * dev) -+{ -+ return; -+} -+ -+static struct platform_device vhba_platform_device = { -+ .name = "vhba", -+ .id = -1, -+ .dev = { -+ .release = vhba_release, -+ }, -+}; -+ -+static struct platform_driver vhba_platform_driver = { -+ .driver = { -+ .owner = THIS_MODULE, -+ .name = "vhba", -+ }, -+ .probe = vhba_probe, -+ .remove = vhba_remove, -+}; -+ -+int __init vhba_init (void) -+{ -+ int ret; -+ -+ ret = platform_device_register(&vhba_platform_device); -+ if (ret < 0) { -+ return ret; -+ } -+ -+ ret = platform_driver_register(&vhba_platform_driver); -+ if (ret < 0) { -+ platform_device_unregister(&vhba_platform_device); -+ return ret; -+ } -+ -+ ret = misc_register(&vhba_miscdev); -+ if (ret < 0) { -+ platform_driver_unregister(&vhba_platform_driver); -+ platform_device_unregister(&vhba_platform_device); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+void __exit vhba_exit(void) -+{ -+ misc_deregister(&vhba_miscdev); -+ platform_driver_unregister(&vhba_platform_driver); -+ platform_device_unregister(&vhba_platform_device); -+} -+ -+module_init(vhba_init); -+module_exit(vhba_exit); -+ --- -2.39.2 - diff --git 
a/sys-kernel/gentoo-sources-6.6/0301-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0301-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index de7b51d..0000000 --- a/sys-kernel/gentoo-sources-6.6/0301-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,54 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Perry Yuan -Subject: [PATCH V12 1/7] x86: Drop CPU_SUP_INTEL from SCHED_MC_PRIO for the expansion. -Date: Tue, 5 Dec 2023 14:35:31 +0800 [thread overview] -Message-ID: <20231205063537.872834-2-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -amd-pstate driver also uses SCHED_MC_PRIO, so decouple the requirement -of CPU_SUP_INTEL from the dependencies to allow compilation in kernels -without Intel CPU support. - -Tested-by: Oleksandr Natalenko -Reviewed-by: Mario Limonciello -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Signed-off-by: Meng Li ---- - arch/x86/Kconfig | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 3762f41bb092..3e57773f946a 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1054,8 +1054,9 @@ config SCHED_MC - - config SCHED_MC_PRIO - bool "CPU core priorities scheduler support" -- depends on SCHED_MC && CPU_SUP_INTEL -- select X86_INTEL_PSTATE -+ depends on SCHED_MC -+ select X86_INTEL_PSTATE if CPU_SUP_INTEL -+ select X86_AMD_PSTATE if CPU_SUP_AMD && ACPI - select CPU_FREQ - default y - help --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0302-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0302-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index fe399e3..0000000 --- a/sys-kernel/gentoo-sources-6.6/0302-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,92 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny , - Perry Yuan -Subject: [PATCH V12 2/7] acpi: cppc: Add get the highest performance cppc control -Date: Tue, 5 Dec 2023 14:35:32 +0800 [thread overview] -Message-ID: <20231205063537.872834-3-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -Add support for getting the highest performance to the -generic CPPC driver. This enables downstream drivers -such as amd-pstate to discover and use these values. - -Please refer to the ACPI_Spec for details on continuous -performance control of CPPC. 
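As a rough usage sketch, a downstream driver would pull the value through the new helper along these lines; the surrounding function is hypothetical, and only cppc_get_highest_perf() and its 0 / -EIO contract come from this patch.

    #include <linux/types.h>
    #include <acpi/cppc_acpi.h>

    /* hypothetical helper in a downstream cpufreq driver */
    static int example_read_highest_perf(int cpu, u32 *out)
    {
        u64 highest_perf;
        int ret;

        /* 0 on success, -EIO otherwise (-ENOTSUPP when CPPC_LIB is not built in) */
        ret = cppc_get_highest_perf(cpu, &highest_perf);
        if (ret)
            return ret;

        /* the CPPC register is 64-bit, but the ranking itself fits in 0..255 */
        *out = (u32)highest_perf;
        return 0;
    }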
- -Tested-by: Oleksandr Natalenko -Reviewed-by: Mario Limonciello -Reviewed-by: Wyes Karny -Reviewed-by: Perry Yuan -Acked-by: Huang Rui -Signed-off-by: Meng Li -Link: https://uefi.org/specs/ACPI/6.5/08_Processor_Configuration_and_Control.html?highlight=cppc#highest-performance ---- - drivers/acpi/cppc_acpi.c | 13 +++++++++++++ - include/acpi/cppc_acpi.h | 5 +++++ - 2 files changed, 18 insertions(+) - -diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c -index 7ff269a78c20..ad388a0e8484 100644 ---- a/drivers/acpi/cppc_acpi.c -+++ b/drivers/acpi/cppc_acpi.c -@@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); - } - -+/** -+ * cppc_get_highest_perf - Get the highest performance register value. -+ * @cpunum: CPU from which to get highest performance. -+ * @highest_perf: Return address. -+ * -+ * Return: 0 for success, -EIO otherwise. -+ */ -+int cppc_get_highest_perf(int cpunum, u64 *highest_perf) -+{ -+ return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf); -+} -+EXPORT_SYMBOL_GPL(cppc_get_highest_perf); -+ - /** - * cppc_get_epp_perf - Get the epp register value. - * @cpunum: CPU from which to get epp preference value. -diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index 6126c977ece0..c0b69ffe7bdb 100644 ---- a/include/acpi/cppc_acpi.h -+++ b/include/acpi/cppc_acpi.h -@@ -139,6 +139,7 @@ struct cppc_cpudata { - #ifdef CONFIG_ACPI_CPPC_LIB - extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); - extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); -+extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf); - extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); - extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); - extern int cppc_set_enable(int cpu, bool enable); -@@ -165,6 +166,10 @@ static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - { - return -ENOTSUPP; - } -+static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) -+{ -+ return -ENOTSUPP; -+} - static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) - { - return -ENOTSUPP; --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0303-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0303-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index e891fcd..0000000 --- a/sys-kernel/gentoo-sources-6.6/0303-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,322 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny -Subject: [PATCH V12 3/7] cpufreq: amd-pstate: Enable amd-pstate preferred core supporting. -Date: Tue, 5 Dec 2023 14:35:33 +0800 [thread overview] -Message-ID: <20231205063537.872834-4-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -amd-pstate driver utilizes the functions and data structures -provided by the ITMT architecture to enable the scheduler to -favor scheduling on cores which can be get a higher frequency -with lower voltage. We call it amd-pstate preferrred core. - -Here sched_set_itmt_core_prio() is called to set priorities and -sched_set_itmt_support() is called to enable ITMT feature. 
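In driver terms that pairing looks roughly like the sketch below; the example_ names are placeholders, and the deferral to a work item mirrors the driver's own approach, since sched_set_itmt_support() cannot be called directly from the cpufreq callback path because of locking.

    #include <linux/types.h>
    #include <linux/workqueue.h>
    #include <asm/topology.h>

    /* sched_set_itmt_support() takes scheduler locks, so defer it to a
       work item rather than calling it from the cpufreq callbacks */
    static void example_enable_itmt(struct work_struct *work)
    {
        sched_set_itmt_support();
    }
    static DECLARE_WORK(example_itmt_work, example_enable_itmt);

    static void example_rank_core(int cpu, u32 highest_perf)
    {
        /* a higher CPPC highest-performance value means a higher priority */
        sched_set_itmt_core_prio((int)highest_perf, cpu);
        schedule_work(&example_itmt_work);
    }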
-amd-pstate driver uses the highest performance value to indicate -the priority of CPU. The higher value has a higher priority. - -The initial core rankings are set up by amd-pstate when the -system boots. - -Add a variable hw_prefcore in cpudata structure. It will check -if the processor and power firmware support preferred core -feature. - -Add one new early parameter `disable` to allow user to disable -the preferred core. - -Only when hardware supports preferred core and user set `enabled` -in early parameter, amd pstate driver supports preferred core featue. - -Tested-by: Oleksandr Natalenko -Reviewed-by: Huang Rui -Reviewed-by: Wyes Karny -Reviewed-by: Mario Limonciello -Co-developed-by: Perry Yuan -Signed-off-by: Perry Yuan -Signed-off-by: Meng Li ---- - drivers/cpufreq/amd-pstate.c | 131 ++++++++++++++++++++++++++++++++--- - include/linux/amd-pstate.h | 4 ++ - 2 files changed, 127 insertions(+), 8 deletions(-) - -diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 1f6186475715..9c2790753f99 100644 ---- a/drivers/cpufreq/amd-pstate.c -+++ b/drivers/cpufreq/amd-pstate.c -@@ -37,6 +37,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -49,6 +50,7 @@ - - #define AMD_PSTATE_TRANSITION_LATENCY 20000 - #define AMD_PSTATE_TRANSITION_DELAY 1000 -+#define AMD_PSTATE_PREFCORE_THRESHOLD 166 - - /* - * TODO: We need more time to fine tune processors with shared memory solution -@@ -64,6 +66,7 @@ static struct cpufreq_driver amd_pstate_driver; - static struct cpufreq_driver amd_pstate_epp_driver; - static int cppc_state = AMD_PSTATE_UNDEFINED; - static bool cppc_enabled; -+static bool amd_pstate_prefcore = true; - - /* - * AMD Energy Preference Performance (EPP) -@@ -297,13 +300,14 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) - if (ret) - return ret; - -- /* -- * TODO: Introduce AMD specific power feature. -- * -- * CPPC entry doesn't indicate the highest performance in some ASICs. -+ /* For platforms that do not support the preferred core feature, the -+ * highest_pef may be configured with 166 or 255, to avoid max frequency -+ * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as -+ * the default max perf. - */ -- highest_perf = amd_get_highest_perf(); -- if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) -+ if (cpudata->hw_prefcore) -+ highest_perf = AMD_PSTATE_PREFCORE_THRESHOLD; -+ else - highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); - - WRITE_ONCE(cpudata->highest_perf, highest_perf); -@@ -324,8 +328,9 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) - if (ret) - return ret; - -- highest_perf = amd_get_highest_perf(); -- if (highest_perf > cppc_perf.highest_perf) -+ if (cpudata->hw_prefcore) -+ highest_perf = AMD_PSTATE_PREFCORE_THRESHOLD; -+ else - highest_perf = cppc_perf.highest_perf; - - WRITE_ONCE(cpudata->highest_perf, highest_perf); -@@ -706,6 +711,80 @@ static void amd_perf_ctl_reset(unsigned int cpu) - wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); - } - -+/* -+ * Set amd-pstate preferred core enable can't be done directly from cpufreq callbacks -+ * due to locking, so queue the work for later. -+ */ -+static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) -+{ -+ sched_set_itmt_support(); -+} -+static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); -+ -+/* -+ * Get the highest performance register value. -+ * @cpu: CPU from which to get highest performance. -+ * @highest_perf: Return address. -+ * -+ * Return: 0 for success, -EIO otherwise. 
-+ */ -+static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) -+{ -+ int ret; -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ u64 cap1; -+ -+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); -+ if (ret) -+ return ret; -+ WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); -+ } else { -+ u64 cppc_highest_perf; -+ -+ ret = cppc_get_highest_perf(cpu, &cppc_highest_perf); -+ if (ret) -+ return ret; -+ WRITE_ONCE(*highest_perf, cppc_highest_perf); -+ } -+ -+ return (ret); -+} -+ -+#define CPPC_MAX_PERF U8_MAX -+ -+static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) -+{ -+ int ret, prio; -+ u32 highest_perf; -+ -+ ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); -+ if (ret) -+ return; -+ -+ cpudata->hw_prefcore = true; -+ /* check if CPPC preferred core feature is enabled*/ -+ if (highest_perf < CPPC_MAX_PERF) -+ prio = (int)highest_perf; -+ else { -+ pr_debug("AMD CPPC preferred core is unsupported!\n"); -+ cpudata->hw_prefcore = false; -+ return; -+ } -+ -+ if (!amd_pstate_prefcore) -+ return; -+ -+ /* -+ * The priorities can be set regardless of whether or not -+ * sched_set_itmt_support(true) has been called and it is valid to -+ * update them at any time after it has been called. -+ */ -+ sched_set_itmt_core_prio(prio, cpudata->cpu); -+ -+ schedule_work(&sched_prefcore_work); -+} -+ - static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - { - int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; -@@ -727,6 +806,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - - cpudata->cpu = policy->cpu; - -+ amd_pstate_init_prefcore(cpudata); -+ - ret = amd_pstate_init_perf(cpudata); - if (ret) - goto free_cpudata1; -@@ -877,6 +958,17 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, - return sysfs_emit(buf, "%u\n", perf); - } - -+static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, -+ char *buf) -+{ -+ bool hw_prefcore; -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ hw_prefcore = READ_ONCE(cpudata->hw_prefcore); -+ -+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(hw_prefcore)); -+} -+ - static ssize_t show_energy_performance_available_preferences( - struct cpufreq_policy *policy, char *buf) - { -@@ -1074,18 +1166,27 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, - return ret < 0 ? 
ret : count; - } - -+static ssize_t prefcore_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); -+} -+ - cpufreq_freq_attr_ro(amd_pstate_max_freq); - cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); - - cpufreq_freq_attr_ro(amd_pstate_highest_perf); -+cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); - cpufreq_freq_attr_rw(energy_performance_preference); - cpufreq_freq_attr_ro(energy_performance_available_preferences); - static DEVICE_ATTR_RW(status); -+static DEVICE_ATTR_RO(prefcore); - - static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_hw_prefcore, - NULL, - }; - -@@ -1093,6 +1194,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_hw_prefcore, - &energy_performance_preference, - &energy_performance_available_preferences, - NULL, -@@ -1100,6 +1202,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { - - static struct attribute *pstate_global_attributes[] = { - &dev_attr_status.attr, -+ &dev_attr_prefcore.attr, - NULL - }; - -@@ -1151,6 +1254,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) - cpudata->cpu = policy->cpu; - cpudata->epp_policy = 0; - -+ amd_pstate_init_prefcore(cpudata); -+ - ret = amd_pstate_init_perf(cpudata); - if (ret) - goto free_cpudata1; -@@ -1568,7 +1673,17 @@ static int __init amd_pstate_param(char *str) - - return amd_pstate_set_driver(mode_idx); - } -+ -+static int __init amd_prefcore_param(char *str) -+{ -+ if (!strcmp(str, "disable")) -+ amd_pstate_prefcore = false; -+ -+ return 0; -+} -+ - early_param("amd_pstate", amd_pstate_param); -+early_param("amd_prefcore", amd_prefcore_param); - - MODULE_AUTHOR("Huang Rui "); - MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); -diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 6ad02ad9c7b4..68fc1bd8d851 100644 ---- a/include/linux/amd-pstate.h -+++ b/include/linux/amd-pstate.h -@@ -52,6 +52,9 @@ struct amd_aperf_mperf { - * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value - * @boost_supported: check whether the Processor or SBIOS supports boost mode -+ * @hw_prefcore: check whether HW supports preferred core featue. -+ * Only when hw_prefcore and early prefcore param are true, -+ * AMD P-State driver supports preferred core featue. - * @epp_policy: Last saved policy used to set energy-performance preference - * @epp_cached: Cached CPPC energy-performance preference value - * @policy: Cpufreq policy value -@@ -85,6 +88,7 @@ struct amd_cpudata { - - u64 freq; - bool boost_supported; -+ bool hw_prefcore; - - /* EPP feature related attributes*/ - s16 epp_policy; --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0304-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0304-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index 912e49f..0000000 --- a/sys-kernel/gentoo-sources-6.6/0304-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,120 +0,0 @@ -From: Meng Li -To: "Rafael J . 
Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Perry Yuan -Subject: [PATCH V12 4/7] cpufreq: Add a notification message that the highest perf has changed -Date: Tue, 5 Dec 2023 14:35:34 +0800 [thread overview] -Message-ID: <20231205063537.872834-5-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -ACPI 6.5 section 8.4.6.1.1.1 specifies that Notify event 0x85 can be -emmitted to cause the the OSPM to re-evaluate the highest performance -register. Add support for this event. - -Tested-by: Oleksandr Natalenko -Reviewed-by: Mario Limonciello -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Signed-off-by: Meng Li -Link: https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-device-notification-values ---- - drivers/acpi/processor_driver.c | 6 ++++++ - drivers/cpufreq/cpufreq.c | 13 +++++++++++++ - include/linux/cpufreq.h | 5 +++++ - 3 files changed, 24 insertions(+) - -diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c -index 4bd16b3f0781..29b2fb68a35d 100644 ---- a/drivers/acpi/processor_driver.c -+++ b/drivers/acpi/processor_driver.c -@@ -27,6 +27,7 @@ - #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80 - #define ACPI_PROCESSOR_NOTIFY_POWER 0x81 - #define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82 -+#define ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED 0x85 - - MODULE_AUTHOR("Paul Diefenbaugh"); - MODULE_DESCRIPTION("ACPI Processor Driver"); -@@ -83,6 +84,11 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data) - acpi_bus_generate_netlink_event(device->pnp.device_class, - dev_name(&device->dev), event, 0); - break; -+ case ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED: -+ cpufreq_update_highest_perf(pr->id); -+ acpi_bus_generate_netlink_event(device->pnp.device_class, -+ dev_name(&device->dev), event, 0); -+ break; - default: - acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); - break; -diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 934d35f570b7..14a4cbc6dd05 100644 ---- a/drivers/cpufreq/cpufreq.c -+++ b/drivers/cpufreq/cpufreq.c -@@ -2717,6 +2717,19 @@ void cpufreq_update_limits(unsigned int cpu) - } - EXPORT_SYMBOL_GPL(cpufreq_update_limits); - -+/** -+ * cpufreq_update_highest_perf - Update highest performance for a given CPU. -+ * @cpu: CPU to update the highest performance for. 
-+ * -+ * Invoke the driver's ->update_highest_perf callback if present -+ */ -+void cpufreq_update_highest_perf(unsigned int cpu) -+{ -+ if (cpufreq_driver->update_highest_perf) -+ cpufreq_driver->update_highest_perf(cpu); -+} -+EXPORT_SYMBOL_GPL(cpufreq_update_highest_perf); -+ - /********************************************************************* - * BOOST * - *********************************************************************/ -diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h -index 1c5ca92a0555..f62257b2a42f 100644 ---- a/include/linux/cpufreq.h -+++ b/include/linux/cpufreq.h -@@ -235,6 +235,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); - void refresh_frequency_limits(struct cpufreq_policy *policy); - void cpufreq_update_policy(unsigned int cpu); - void cpufreq_update_limits(unsigned int cpu); -+void cpufreq_update_highest_perf(unsigned int cpu); - bool have_governor_per_policy(void); - bool cpufreq_supports_freq_invariance(void); - struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); -@@ -263,6 +264,7 @@ static inline bool cpufreq_supports_freq_invariance(void) - return false; - } - static inline void disable_cpufreq(void) { } -+static inline void cpufreq_update_highest_perf(unsigned int cpu) { } - #endif - - #ifdef CONFIG_CPU_FREQ_STAT -@@ -380,6 +382,9 @@ struct cpufreq_driver { - /* Called to update policy limits on firmware notifications. */ - void (*update_limits)(unsigned int cpu); - -+ /* Called to update highest performance on firmware notifications. */ -+ void (*update_highest_perf)(unsigned int cpu); -+ - /* optional */ - int (*bios_limit)(int cpu, unsigned int *limit); - --- -2.34.1 - diff --git a/sys-kernel/gentoo-sources-6.6/0305-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0305-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index 12e3a68..0000000 --- a/sys-kernel/gentoo-sources-6.6/0305-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,182 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny , - Perry Yuan -Subject: [PATCH V12 5/7] cpufreq: amd-pstate: Update amd-pstate preferred core ranking dynamically -Date: Tue, 5 Dec 2023 14:35:35 +0800 [thread overview] -Message-ID: <20231205063537.872834-6-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -Preferred core rankings can be changed dynamically by the -platform based on the workload and platform conditions and -accounting for thermals and aging. -When this occurs, cpu priority need to be set. 
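In short: firmware raises the notification, the cpufreq core invokes the driver's ->update_highest_perf() hook, and the driver refreshes the ITMT priority only when the cached ranking really changed. A minimal sketch of that last step, with placeholder names (the real driver keeps the cached ranking per CPU in its cpudata):

    #include <linux/types.h>
    #include <asm/topology.h>

    static u32 example_cached_ranking;    /* per-CPU in the real driver */

    static void example_update_ranking(int cpu, u32 new_highest_perf)
    {
        /* only poke the scheduler when firmware actually changed the ranking */
        if (new_highest_perf == example_cached_ranking)
            return;

        example_cached_ranking = new_highest_perf;
        sched_set_itmt_core_prio((int)new_highest_perf, cpu);
    }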
- -Tested-by: Oleksandr Natalenko -Reviewed-by: Mario Limonciello -Reviewed-by: Wyes Karny -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Signed-off-by: Meng Li ---- - drivers/cpufreq/amd-pstate.c | 44 ++++++++++++++++++++++++++++++++++++ - include/linux/amd-pstate.h | 6 +++++ - 2 files changed, 50 insertions(+) - -diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 9c2790753f99..25f0fb53d320 100644 ---- a/drivers/cpufreq/amd-pstate.c -+++ b/drivers/cpufreq/amd-pstate.c -@@ -315,6 +315,7 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) - WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); -+ WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1)); - WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1)); - return 0; - } -@@ -339,6 +340,7 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) - WRITE_ONCE(cpudata->lowest_nonlinear_perf, - cppc_perf.lowest_nonlinear_perf); - WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); -+ WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf); - - if (cppc_state == AMD_PSTATE_ACTIVE) -@@ -785,6 +787,32 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) - schedule_work(&sched_prefcore_work); - } - -+static void amd_pstate_update_highest_perf(unsigned int cpu) -+{ -+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); -+ struct amd_cpudata *cpudata = policy->driver_data; -+ u32 prev_high = 0, cur_high = 0; -+ int ret; -+ -+ if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) -+ goto free_cpufreq_put; -+ -+ ret = amd_pstate_get_highest_perf(cpu, &cur_high); -+ if (ret) -+ goto free_cpufreq_put; -+ -+ prev_high = READ_ONCE(cpudata->prefcore_ranking); -+ if (prev_high != cur_high) { -+ WRITE_ONCE(cpudata->prefcore_ranking, cur_high); -+ -+ if (cur_high < CPPC_MAX_PERF) -+ sched_set_itmt_core_prio((int)cur_high, cpu); -+ } -+ -+free_cpufreq_put: -+ cpufreq_cpu_put(policy); -+} -+ - static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - { - int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; -@@ -958,6 +986,17 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, - return sysfs_emit(buf, "%u\n", perf); - } - -+static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy, -+ char *buf) -+{ -+ u32 perf; -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ perf = READ_ONCE(cpudata->prefcore_ranking); -+ -+ return sysfs_emit(buf, "%u\n", perf); -+} -+ - static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, - char *buf) - { -@@ -1176,6 +1215,7 @@ cpufreq_freq_attr_ro(amd_pstate_max_freq); - cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); - - cpufreq_freq_attr_ro(amd_pstate_highest_perf); -+cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); - cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); - cpufreq_freq_attr_rw(energy_performance_preference); - cpufreq_freq_attr_ro(energy_performance_available_preferences); -@@ -1186,6 +1226,7 @@ static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_prefcore_ranking, - &amd_pstate_hw_prefcore, - NULL, - }; -@@ -1194,6 +1235,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { - &amd_pstate_max_freq, - 
&amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_prefcore_ranking, - &amd_pstate_hw_prefcore, - &energy_performance_preference, - &energy_performance_available_preferences, -@@ -1538,6 +1580,7 @@ static struct cpufreq_driver amd_pstate_driver = { - .suspend = amd_pstate_cpu_suspend, - .resume = amd_pstate_cpu_resume, - .set_boost = amd_pstate_set_boost, -+ .update_highest_perf = amd_pstate_update_highest_perf, - .name = "amd-pstate", - .attr = amd_pstate_attr, - }; -@@ -1552,6 +1595,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { - .online = amd_pstate_epp_cpu_online, - .suspend = amd_pstate_epp_suspend, - .resume = amd_pstate_epp_resume, -+ .update_highest_perf = amd_pstate_update_highest_perf, - .name = "amd-pstate-epp", - .attr = amd_pstate_epp_attr, - }; -diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 68fc1bd8d851..d21838835abd 100644 ---- a/include/linux/amd-pstate.h -+++ b/include/linux/amd-pstate.h -@@ -39,11 +39,16 @@ struct amd_aperf_mperf { - * @cppc_req_cached: cached performance request hints - * @highest_perf: the maximum performance an individual processor may reach, - * assuming ideal conditions -+ * For platforms that do not support the preferred core feature, the -+ * highest_pef may be configured with 166 or 255, to avoid max frequency -+ * calculated wrongly. we take the fixed value as the highest_perf. - * @nominal_perf: the maximum sustained performance level of the processor, - * assuming ideal operating conditions - * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power - * savings are achieved - * @lowest_perf: the absolute lowest performance level of the processor -+ * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher -+ * priority. - * @max_freq: the frequency that mapped to highest_perf - * @min_freq: the frequency that mapped to lowest_perf - * @nominal_freq: the frequency that mapped to nominal_perf -@@ -73,6 +78,7 @@ struct amd_cpudata { - u32 nominal_perf; - u32 lowest_nonlinear_perf; - u32 lowest_perf; -+ u32 prefcore_ranking; - u32 min_limit_perf; - u32 max_limit_perf; - u32 min_limit_freq; --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0306-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0306-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index d5a3807..0000000 --- a/sys-kernel/gentoo-sources-6.6/0306-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,125 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny , - Perry Yuan -Subject: [PATCH V12 6/7] Documentation: amd-pstate: introduce amd-pstate preferred core -Date: Tue, 5 Dec 2023 14:35:36 +0800 [thread overview] -Message-ID: <20231205063537.872834-7-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -Introduce amd-pstate preferred core. 
- -check preferred core state set by the kernel parameter: -$ cat /sys/devices/system/cpu/amd-pstate/prefcore - -Tested-by: Oleksandr Natalenko -Reviewed-by: Wyes Karny -Reviewed-by: Mario Limonciello -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Signed-off-by: Meng Li ---- - Documentation/admin-guide/pm/amd-pstate.rst | 59 ++++++++++++++++++++- - 1 file changed, 57 insertions(+), 2 deletions(-) - -diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index 1cf40f69278c..0b832ff529db 100644 ---- a/Documentation/admin-guide/pm/amd-pstate.rst -+++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -300,8 +300,8 @@ platforms. The AMD P-States mechanism is the more performance and energy - efficiency frequency management method on AMD processors. - - --AMD Pstate Driver Operation Modes --================================= -+``amd-pstate`` Driver Operation Modes -+====================================== - - ``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, - non-autonomous (passive) mode and guided autonomous (guided) mode. -@@ -353,6 +353,48 @@ is activated. In this mode, driver requests minimum and maximum performance - level and the platform autonomously selects a performance level in this range - and appropriate to the current workload. - -+``amd-pstate`` Preferred Core -+================================= -+ -+The core frequency is subjected to the process variation in semiconductors. -+Not all cores are able to reach the maximum frequency respecting the -+infrastructure limits. Consequently, AMD has redefined the concept of -+maximum frequency of a part. This means that a fraction of cores can reach -+maximum frequency. To find the best process scheduling policy for a given -+scenario, OS needs to know the core ordering informed by the platform through -+highest performance capability register of the CPPC interface. -+ -+``amd-pstate`` preferred core enables the scheduler to prefer scheduling on -+cores that can achieve a higher frequency with lower voltage. The preferred -+core rankings can dynamically change based on the workload, platform conditions, -+thermals and ageing. -+ -+The priority metric will be initialized by the ``amd-pstate`` driver. The ``amd-pstate`` -+driver will also determine whether or not ``amd-pstate`` preferred core is -+supported by the platform. -+ -+``amd-pstate`` driver will provide an initial core ordering when the system boots. -+The platform uses the CPPC interfaces to communicate the core ranking to the -+operating system and scheduler to make sure that OS is choosing the cores -+with highest performance firstly for scheduling the process. When ``amd-pstate`` -+driver receives a message with the highest performance change, it will -+update the core ranking and set the cpu's priority. -+ -+``amd-pstate`` Preferred Core Switch -+================================= -+Kernel Parameters -+----------------- -+ -+``amd-pstate`` peferred core`` has two states: enable and disable. -+Enable/disable states can be chosen by different kernel parameters. -+Default enable ``amd-pstate`` preferred core. -+ -+``amd_prefcore=disable`` -+ -+For systems that support ``amd-pstate`` preferred core, the core rankings will -+always be advertised by the platform. But OS can choose to ignore that via the -+kernel parameter ``amd_prefcore=disable``. -+ - User Space Interface in ``sysfs`` - General - =========================================== - -@@ -385,6 +427,19 @@ control its functionality at the system level. 
They are located in the - to the operation mode represented by that string - or to be - unregistered in the "disable" case. - -+``prefcore`` -+ Preferred core state of the driver: "enabled" or "disabled". -+ -+ "enabled" -+ Enable the ``amd-pstate`` preferred core. -+ -+ "disabled" -+ Disable the ``amd-pstate`` preferred core -+ -+ -+ This attribute is read-only to check the state of preferred core set -+ by the kernel parameter. -+ - ``cpupower`` tool support for ``amd-pstate`` - =============================================== - --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0307-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0307-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index 40153f4..0000000 --- a/sys-kernel/gentoo-sources-6.6/0307-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,57 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny , - Perry Yuan -Subject: [PATCH V12 7/7] Documentation: introduce amd-pstate preferrd core mode kernel command line options -Date: Tue, 5 Dec 2023 14:35:37 +0800 [thread overview] -Message-ID: <20231205063537.872834-8-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -amd-pstate driver support enable/disable preferred core. -Default enabled on platforms supporting amd-pstate preferred core. -Disable amd-pstate preferred core with -"amd_prefcore=disable" added to the kernel command line. - -Signed-off-by: Meng Li -Reviewed-by: Mario Limonciello -Reviewed-by: Wyes Karny -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Tested-by: Oleksandr Natalenko ---- - Documentation/admin-guide/kernel-parameters.txt | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 65731b060e3f..cbfa63a87e4a 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -363,6 +363,11 @@ - selects a performance level in this range and appropriate - to the current workload. - -+ amd_prefcore= -+ [X86] -+ disable -+ Disable amd-pstate preferred core. 
-+ - amijoy.map= [HW,JOY] Amiga joystick support - Map of devices attached to JOY0DAT and JOY1DAT - Format: , --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-7.0/0001-bore.patch b/sys-kernel/gentoo-sources-7.0/0001-bore.patch new file mode 100644 index 0000000..51617f0 --- /dev/null +++ b/sys-kernel/gentoo-sources-7.0/0001-bore.patch @@ -0,0 +1,1217 @@ +From 187d3236f77a721f684e3211dc50585973b04ab4 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Fri, 10 Apr 2026 08:27:29 +0200 +Subject: [PATCH] bore + +Signed-off-by: Piotr Gorski +--- + include/linux/sched.h | 34 +++ + include/linux/sched/bore.h | 41 ++++ + init/Kconfig | 17 ++ + kernel/Kconfig.hz | 17 ++ + kernel/exit.c | 4 + + kernel/fork.c | 13 ++ + kernel/futex/waitwake.c | 11 + + kernel/sched/Makefile | 1 + + kernel/sched/bore.c | 434 +++++++++++++++++++++++++++++++++++++ + kernel/sched/core.c | 12 + + kernel/sched/debug.c | 61 ++++++ + kernel/sched/fair.c | 126 ++++++++++- + kernel/sched/sched.h | 9 + + 13 files changed, 769 insertions(+), 11 deletions(-) + create mode 100644 include/linux/sched/bore.h + create mode 100644 kernel/sched/bore.c + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 5a5d3dbc9..b2b2d8c66 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -817,6 +817,37 @@ struct kmap_ctrl { + #endif + }; + ++#ifdef CONFIG_SCHED_BORE ++#define BORE_BC_TIMESTAMP_SHIFT 16 ++ ++struct bore_bc { ++ union { ++ struct { ++ u64 timestamp: 48; ++ u64 penalty: 16; ++ }; ++ u64 value; ++ }; ++}; ++ ++struct bore_ctx { ++ u64 burst_time; ++ u16 prev_penalty; ++ u16 curr_penalty; ++ union { ++ u16 penalty; ++ struct { ++ u8 _; ++ u8 score; ++ }; ++ }; ++ bool stop_update; ++ bool futex_waiting; ++ struct bore_bc subtree; ++ struct bore_bc group; ++}; ++#endif /* CONFIG_SCHED_BORE */ ++ + struct task_struct { + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* +@@ -875,6 +906,9 @@ struct task_struct { + #ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; + #endif ++#ifdef CONFIG_SCHED_BORE ++ struct bore_ctx bore; ++#endif /* CONFIG_SCHED_BORE */ + const struct sched_class *sched_class; + + #ifdef CONFIG_SCHED_CORE +diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h +new file mode 100644 +index 000000000..9215c13a9 +--- /dev/null ++++ b/include/linux/sched/bore.h +@@ -0,0 +1,41 @@ ++#ifndef _KERNEL_SCHED_BORE_H ++#define _KERNEL_SCHED_BORE_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define SCHED_BORE_AUTHOR "Masahito Suzuki" ++#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" ++ ++#define SCHED_BORE_VERSION "6.6.3" ++ ++extern u8 __read_mostly sched_bore; ++DECLARE_STATIC_KEY_TRUE(sched_bore_key); ++extern u8 __read_mostly sched_burst_inherit_type; ++extern u8 __read_mostly sched_burst_smoothness; ++extern u8 __read_mostly sched_burst_penalty_offset; ++extern uint __read_mostly sched_burst_penalty_scale; ++extern uint __read_mostly sched_burst_cache_lifetime; ++ ++extern u8 effective_prio_bore(struct task_struct *p); ++extern void update_curr_bore(struct task_struct *p, u64 delta_exec); ++extern void restart_burst_bore(struct task_struct *p); ++extern void restart_burst_rescale_deadline_bore(struct task_struct *p); ++extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, ++ u64 clone_flags, u64 now); ++extern void sched_init_bore(void); ++extern void reset_task_bore(struct task_struct *p); ++ ++extern int sched_bore_update_handler(const struct ctl_table *table, ++ int write, void __user *buffer, size_t *lenp, 
loff_t *ppos); ++extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, ++ int write, void __user *buffer, size_t *lenp, loff_t *ppos); ++ ++extern void reweight_entity( ++ struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); ++ ++#endif /* _KERNEL_SCHED_BORE_H */ +diff --git a/init/Kconfig b/init/Kconfig +index 7484cd703..4cf628106 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1446,6 +1446,23 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ If unsure, say Y here. ++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index ce1435cb0..9eee2005e 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -57,3 +57,20 @@ config HZ + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS ++ ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = ++ 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. +diff --git a/kernel/exit.c b/kernel/exit.c +index ede3117fa..3f3af470d 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -147,7 +147,11 @@ static void __unhash_process(struct release_task_post *post, struct task_struct + detach_pid(post->pids, p, PIDTYPE_SID); + + list_del_rcu(&p->tasks); ++#ifdef CONFIG_SCHED_BORE ++ list_del_rcu(&p->sibling); ++#else /* !CONFIG_SCHED_BORE */ + list_del_init(&p->sibling); ++#endif /* CONFIG_SCHED_BORE */ + __this_cpu_dec(process_counts); + } + list_del_rcu(&p->thread_node); +diff --git a/kernel/fork.c b/kernel/fork.c +index bc2bf58b9..207276c30 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -117,6 +117,10 @@ + /* For dup_mmap(). */ + #include "../mm/internal.h" + ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif /* CONFIG_SCHED_BORE */ ++ + #include + + #define CREATE_TRACE_POINTS +@@ -2362,6 +2366,11 @@ __latent_entropy struct task_struct *copy_process( + p->start_time = ktime_get_ns(); + p->start_boottime = ktime_get_boottime_ns(); + ++#ifdef CONFIG_SCHED_BORE ++ if (likely(p->pid)) ++ task_fork_bore(p, current, clone_flags, p->start_time); ++#endif /* CONFIG_SCHED_BORE */ ++ + /* + * Make it visible to the rest of the system, but dont wake it up yet. + * Need tasklist lock for parent etc handling! 
+@@ -2435,7 +2444,11 @@ __latent_entropy struct task_struct *copy_process( + */ + p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || + p->real_parent->signal->is_child_subreaper; ++#ifdef CONFIG_SCHED_BORE ++ list_add_tail_rcu(&p->sibling, &p->real_parent->children); ++#else /* !CONFIG_SCHED_BORE */ + list_add_tail(&p->sibling, &p->real_parent->children); ++#endif /* CONFIG_SCHED_BORE */ + list_add_tail_rcu(&p->tasks, &init_task.tasks); + attach_pid(p, PIDTYPE_TGID); + attach_pid(p, PIDTYPE_PGID); +diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c +index 1c2dd03f1..de57e2d54 100644 +--- a/kernel/futex/waitwake.c ++++ b/kernel/futex/waitwake.c +@@ -4,6 +4,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif /* CONFIG_SCHED_BORE */ + + #include "futex.h" + +@@ -355,7 +358,15 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) ++#ifdef CONFIG_SCHED_BORE ++ { ++ current->bore.futex_waiting = true; ++#endif /* CONFIG_SCHED_BORE */ + schedule(); ++#ifdef CONFIG_SCHED_BORE ++ current->bore.futex_waiting = false; ++ } ++#endif /* CONFIG_SCHED_BORE */ + } + __set_current_state(TASK_RUNNING); + } +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b1f1a3670..f95a7b3d5 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -40,3 +40,4 @@ obj-y += core.o + obj-y += fair.o + obj-y += build_policy.o + obj-y += build_utility.o ++obj-$(CONFIG_SCHED_BORE) += bore.o +diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c +new file mode 100644 +index 000000000..c27a22cd6 +--- /dev/null ++++ b/kernel/sched/bore.c +@@ -0,0 +1,434 @@ ++/* ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2025 Masahito Suzuki ++ */ ++#include ++#include ++#include ++#include "sched.h" ++ ++#ifdef CONFIG_SCHED_BORE ++DEFINE_STATIC_KEY_TRUE(sched_bore_key); ++u8 __read_mostly sched_bore = 1; ++u8 __read_mostly sched_burst_inherit_type = 2; ++u8 __read_mostly sched_burst_smoothness = 1; ++u8 __read_mostly sched_burst_penalty_offset = 24; ++uint __read_mostly sched_burst_penalty_scale = 1536; ++uint __read_mostly sched_burst_cache_lifetime = 75000000; ++static int __maybe_unused maxval_prio = 39; ++static int __maybe_unused maxval_6_bits = 63; ++static int __maybe_unused maxval_8_bits = 255; ++static int __maybe_unused maxval_12_bits = 4095; ++ ++#define MAX_BURST_PENALTY ((40U << 8) - 1) ++#define BURST_CACHE_SAMPLE_LIMIT 63 ++#define BURST_CACHE_SCAN_LIMIT (BURST_CACHE_SAMPLE_LIMIT * 2) ++ ++static u32 bore_reciprocal_lut[BURST_CACHE_SAMPLE_LIMIT + 1]; ++ ++DEFINE_STATIC_KEY_TRUE(sched_burst_inherit_key); ++DEFINE_STATIC_KEY_TRUE(sched_burst_ancestor_key); ++ ++static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { ++ if (unlikely(!v)) return 0; ++ int clz = __builtin_clzll(v); ++ int exponent = 64 - clz; ++ u32 mantissa = (u32)((v << clz) << 1 >> (64 - fp)); ++ return exponent << fp | mantissa; ++} ++ ++static inline u32 calc_burst_penalty(u64 burst_time) { ++ u32 greed = log2p1_u64_u32fp(burst_time, 8), ++ tolerance = sched_burst_penalty_offset << 8; ++ s32 diff = (s32)(greed - tolerance); ++ u32 penalty = diff & ~(diff >> 31); ++ u32 scaled_penalty = penalty * sched_burst_penalty_scale >> 10; ++ s32 overflow = scaled_penalty - MAX_BURST_PENALTY; ++ return scaled_penalty - (overflow & ~(overflow >> 31)); ++} ++ ++static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { ++ u64 
unscaled, rescaled; ++ unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); ++ rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); ++ return rescaled; ++} ++ ++static inline u32 binary_smooth(u32 new, u32 old) { ++ u32 is_growing = (new > old); ++ u32 increment = (new - old) * is_growing; ++ u32 shift = sched_burst_smoothness; ++ u32 smoothed = old + ((increment + (1U << shift) - 1) >> shift); ++ return (new & ~(-is_growing)) | (smoothed & (-is_growing)); ++} ++ ++static void reweight_task_by_prio(struct task_struct *p, int prio) { ++ if (task_has_idle_policy(p)) return; ++ ++ struct sched_entity *se = &p->se; ++ unsigned long weight = scale_load(sched_prio_to_weight[prio]); ++ ++ if (se->on_rq) { ++ p->bore.stop_update = true; ++ reweight_entity(cfs_rq_of(se), se, weight); ++ p->bore.stop_update = false; ++ } else ++ se->load.weight = weight; ++ se->load.inv_weight = sched_prio_to_wmult[prio]; ++} ++ ++u8 effective_prio_bore(struct task_struct *p) { ++ int prio = p->static_prio - MAX_RT_PRIO; ++ if (static_branch_likely(&sched_bore_key)) ++ prio += p->bore.score; ++ prio &= ~(prio >> 31); ++ s32 diff = prio - maxval_prio; ++ prio -= (diff & ~(diff >> 31)); ++ return (u8)prio; ++} ++ ++static void update_penalty(struct task_struct *p) { ++ struct bore_ctx *ctx = &p->bore; ++ ++ u8 prev_prio = effective_prio_bore(p); ++ ++ s32 diff = (s32)ctx->curr_penalty - (s32)ctx->prev_penalty; ++ u16 max_val = ctx->curr_penalty - (diff & (diff >> 31)); ++ u32 is_kthread = !!(p->flags & PF_KTHREAD); ++ ctx->penalty = max_val & -(s32)(!is_kthread); ++ ++ u8 new_prio = effective_prio_bore(p); ++ if (new_prio != prev_prio) ++ reweight_task_by_prio(p, new_prio); ++} ++ ++void update_curr_bore(struct task_struct *p, u64 delta_exec) { ++ struct bore_ctx *ctx = &p->bore; ++ if (ctx->stop_update) return; ++ ++ ctx->burst_time += delta_exec; ++ u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); ++ ++ if (curr_penalty <= ctx->prev_penalty) return; ++ update_penalty(p); ++} ++ ++void restart_burst_bore(struct task_struct *p) { ++ struct bore_ctx *ctx = &p->bore; ++ u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); ++ ctx->prev_penalty = new_penalty; ++ ctx->curr_penalty = 0; ++ ctx->burst_time = 0; ++ update_penalty(p); ++} ++ ++void restart_burst_rescale_deadline_bore(struct task_struct *p) { ++ struct sched_entity *se = &p->se; ++ s64 vscaled, vremain = se->deadline - se->vruntime; ++ ++ u8 old_prio = effective_prio_bore(p); ++ restart_burst_bore(p); ++ u8 new_prio = effective_prio_bore(p); ++ ++ if (old_prio > new_prio) { ++ vscaled = rescale_slice(abs(vremain), old_prio, new_prio); ++ if (unlikely(vremain < 0)) ++ vscaled = -vscaled; ++ se->deadline = se->vruntime + vscaled; ++ } ++} ++ ++static inline bool task_is_bore_eligible(struct task_struct *p) ++{return p && p->sched_class == &fair_sched_class && !p->exit_state;} ++ ++#ifndef for_each_child_task ++#define for_each_child_task(p, t) \ ++ list_for_each_entry_rcu(t, &(p)->children, sibling) ++#endif ++ ++static inline u32 count_children_upto2(struct task_struct *p) { ++ struct list_head *head = &p->children; ++ struct list_head *first = READ_ONCE(head->next); ++ struct list_head *second = READ_ONCE(first->next); ++ return (first != head) + (second != head); ++} ++ ++static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { ++ struct bore_bc bc_val = { .value = READ_ONCE(bc->value) }; ++ u64 timestamp = (u64)bc_val.timestamp << BORE_BC_TIMESTAMP_SHIFT; ++ 
return now - timestamp > (u64)sched_burst_cache_lifetime; ++} ++ ++static void update_burst_cache(struct bore_bc *bc, ++ struct task_struct *p, u32 count, u32 total, u64 now) { ++ u32 average = (count == 1) ? total : ++ (u32)(((u64)total * bore_reciprocal_lut[count]) >> 32); ++ ++ struct bore_bc new_bc = { ++ .penalty = max(average, p->bore.penalty), ++ .timestamp = now >> BORE_BC_TIMESTAMP_SHIFT ++ }; ++ WRITE_ONCE(bc->value, new_bc.value); ++} ++ ++static u32 inherit_from_parent(struct task_struct *parent, ++ u64 clone_flags, u64 now) { ++ struct bore_bc bc_val; ++ ++ if (clone_flags & CLONE_PARENT) ++ parent = rcu_dereference(parent->real_parent); ++ ++ struct bore_bc *bc = &parent->bore.subtree; ++ ++ if (burst_cache_expired(bc, now)) { ++ struct task_struct *child; ++ u32 count = 0, total = 0, scan_count = 0; ++ for_each_child_task(parent, child) { ++ if (count >= BURST_CACHE_SAMPLE_LIMIT) break; ++ if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; ++ ++ if (!task_is_bore_eligible(child)) continue; ++ count++; ++ total += child->bore.penalty; ++ } ++ ++ update_burst_cache(bc, parent, count, total, now); ++ } ++ ++ bc_val.value = READ_ONCE(bc->value); ++ return (u32)bc_val.penalty; ++} ++ ++static u32 inherit_from_ancestor_hub(struct task_struct *parent, ++ u64 clone_flags, u64 now) { ++ struct bore_bc bc_val; ++ struct task_struct *ancestor = parent; ++ u32 sole_child_count = 0; ++ ++ if (clone_flags & CLONE_PARENT) { ++ ancestor = rcu_dereference(ancestor->real_parent); ++ sole_child_count = 1; ++ } ++ ++ for (struct task_struct *next; ++ (next = rcu_dereference(ancestor->real_parent)) != ancestor && ++ count_children_upto2(ancestor) <= sole_child_count; ++ ancestor = next, sole_child_count = 1) {} ++ ++ struct bore_bc *bc = &ancestor->bore.subtree; ++ ++ if (burst_cache_expired(bc, now)) { ++ struct task_struct *direct_child; ++ u32 count = 0, total = 0, scan_count = 0; ++ for_each_child_task(ancestor, direct_child) { ++ if (count >= BURST_CACHE_SAMPLE_LIMIT) break; ++ if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; ++ ++ struct task_struct *descendant = direct_child; ++ while (count_children_upto2(descendant) == 1) { ++ struct task_struct *next_descendant = ++ list_first_or_null_rcu(&descendant->children, ++ struct task_struct, sibling); ++ if (!next_descendant) break; ++ descendant = next_descendant; ++ } ++ ++ if (!task_is_bore_eligible(descendant)) continue; ++ count++; ++ total += descendant->bore.penalty; ++ } ++ ++ update_burst_cache(bc, ancestor, count, total, now); ++ } ++ ++ bc_val.value = READ_ONCE(bc->value); ++ return (u32)bc_val.penalty; ++} ++ ++static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { ++ struct bore_bc bc_val; ++ struct task_struct *leader = p->group_leader; ++ struct bore_bc *bc = &leader->bore.group; ++ ++ if (burst_cache_expired(bc, now)) { ++ struct task_struct *sibling; ++ u32 count = 0, total = 0, scan_count = 0; ++ ++ for_each_thread(leader, sibling) { ++ if (count >= BURST_CACHE_SAMPLE_LIMIT) break; ++ if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; ++ ++ if (!task_is_bore_eligible(sibling)) continue; ++ count++; ++ total += sibling->bore.penalty; ++ } ++ ++ update_burst_cache(bc, leader, count, total, now); ++ } ++ ++ bc_val.value = READ_ONCE(bc->value); ++ return (u32)bc_val.penalty; ++} ++ ++void task_fork_bore(struct task_struct *p, ++ struct task_struct *parent, u64 clone_flags, u64 now) { ++ if (!static_branch_likely(&sched_bore_key) || !task_is_bore_eligible(p)) return; ++ ++ rcu_read_lock(); ++ struct bore_ctx *ctx 
= &p->bore; ++ u32 inherited_penalty; ++ if (clone_flags & CLONE_THREAD) ++ inherited_penalty = inherit_from_thread_group(parent, now); ++ else if (static_branch_likely(&sched_burst_inherit_key)) ++ inherited_penalty = static_branch_likely(&sched_burst_ancestor_key)? ++ inherit_from_ancestor_hub(parent, clone_flags, now): ++ inherit_from_parent(parent, clone_flags, now); ++ else ++ inherited_penalty = 0; ++ ++ if (ctx->prev_penalty < inherited_penalty) ++ ctx->prev_penalty = inherited_penalty; ++ ctx->curr_penalty = 0; ++ ctx->burst_time = 0; ++ ctx->stop_update = false; ++ ctx->futex_waiting = false; ++ update_penalty(p); ++ rcu_read_unlock(); ++} ++ ++void reset_task_bore(struct task_struct *p) ++{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } ++ ++static void update_inherit_type(void) { ++ switch(sched_burst_inherit_type) { ++ case 1: ++ static_branch_enable(&sched_burst_inherit_key); ++ static_branch_disable(&sched_burst_ancestor_key); ++ break; ++ case 2: ++ static_branch_enable(&sched_burst_inherit_key); ++ static_branch_enable(&sched_burst_ancestor_key); ++ break; ++ default: ++ static_branch_disable(&sched_burst_inherit_key); ++ break; ++ } ++} ++ ++void __init sched_init_bore(void) { ++ printk(KERN_INFO "%s %s by %s\n", ++ SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); ++ ++ for (int i = 1; i <= BURST_CACHE_SAMPLE_LIMIT; i++) ++ bore_reciprocal_lut[i] = (u32)div64_u64(0xffffffffULL + i, i); ++ ++ reset_task_bore(&init_task); ++ update_inherit_type(); ++} ++ ++static void readjust_all_task_weights(void) { ++ struct task_struct *task; ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ scoped_guard(write_lock_irq, &tasklist_lock) ++ for_each_process(task) { ++ if (!task_is_bore_eligible(task)) continue; ++ rq = task_rq_lock(task, &rf); ++ update_rq_clock(rq); ++ reweight_task_by_prio(task, effective_prio_bore(task)); ++ task_rq_unlock(rq, task, &rf); ++ } ++} ++ ++int sched_bore_update_handler(const struct ctl_table *table, ++ int write, void __user *buffer, size_t *lenp, loff_t *ppos) { ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ if (sched_bore) ++ static_branch_enable(&sched_bore_key); ++ else ++ static_branch_disable(&sched_bore_key); ++ ++ readjust_all_task_weights(); ++ ++ return 0; ++} ++ ++int sched_burst_inherit_type_update_handler(const struct ctl_table *table, ++ int write, void __user *buffer, size_t *lenp, loff_t *ppos) { ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ update_inherit_type(); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SYSCTL ++static struct ctl_table sched_bore_sysctls[] = { ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = sched_bore_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_inherit_type", ++ .data = &sched_burst_inherit_type, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = sched_burst_inherit_type_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_TWO, ++ }, ++ { ++ .procname = "sched_burst_smoothness", ++ .data = &sched_burst_smoothness, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_THREE, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = 
SYSCTL_ZERO, ++ .extra2 = &maxval_6_bits, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++}; ++ ++static int __init sched_bore_sysctl_init(void) { ++ register_sysctl_init("kernel", sched_bore_sysctls); ++ return 0; ++} ++late_initcall(sched_bore_sysctl_init); ++ ++#endif // CONFIG_SYSCTL ++#endif /* CONFIG_SCHED_BORE */ +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 496dff740..2bc2b943a 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -100,6 +100,10 @@ + #include "../smpboot.h" + #include "../locking/mutex.h" + ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif /* CONFIG_SCHED_BORE */ ++ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); + +@@ -1446,7 +1450,11 @@ int tg_nop(struct task_group *tg, void *data) + + void set_load_weight(struct task_struct *p, bool update_load) + { ++#ifdef CONFIG_SCHED_BORE ++ int prio = effective_prio_bore(p); ++#else /* !CONFIG_SCHED_BORE */ + int prio = p->static_prio - MAX_RT_PRIO; ++#endif /* CONFIG_SCHED_BORE */ + struct load_weight lw; + + if (task_has_idle_policy(p)) { +@@ -8611,6 +8619,10 @@ void __init sched_init(void) + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_init_bore(); ++#endif /* CONFIG_SCHED_BORE */ ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 15bf45b6f..282007725 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_BORE ++#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ ++static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int value; \ ++\ ++ if (cnt > 15) \ ++ cnt = 15; \ ++\ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++\ ++ if (kstrtouint(buf, 10, &value)) \ ++ return -EINVAL; \ ++\ ++ sysctl_sched_##name = value; \ ++ sched_update_##update_func(); \ ++\ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++\ ++static int sched_##name##_show(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", sysctl_sched_##name); \ ++ return 0; \ ++} \ ++\ ++static int sched_##name##_open(struct inode *inode, struct file *filp) \ ++{ \ ++ return single_open(filp, sched_##name##_show, NULL); \ ++} \ ++\ ++static const struct file_operations sched_##name##_fops = { \ ++ .open = sched_##name##_open, \ ++ .write = sched_##name##_write, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++}; ++ ++DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) ++ ++#undef DEFINE_SYSCTL_SCHED_FUNC ++#else /* !CONFIG_SCHED_BORE */ + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -208,6 +255,7 @@ static const struct file_operations sched_scaling_fops = { + .llseek = seq_lseek, + .release = single_release, + }; ++#endif /* CONFIG_SCHED_BORE */ + + #ifdef CONFIG_PREEMPT_DYNAMIC + +@@ -602,12 +650,19 @@ static 
__init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); ++ debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); ++#else /* !CONFIG_SCHED_BORE */ + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++#endif /* CONFIG_SCHED_BORE */ + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + ++#if !defined(CONFIG_SCHED_BORE) + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); ++#endif /* CONFIG_SCHED_BORE */ + debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + +@@ -852,6 +907,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->bore.score); ++#endif /* CONFIG_SCHED_BORE */ + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +@@ -1331,6 +1389,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + __PS("nr_involuntary_switches", p->nivcsw); + + P(se.load.weight); ++#ifdef CONFIG_SCHED_BORE ++ P(bore.score); ++#endif /* CONFIG_SCHED_BORE */ + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ab4114712..630896fc0 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -58,6 +58,10 @@ + #include "stats.h" + #include "autogroup.h" + ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif /* CONFIG_SCHED_BORE */ ++ + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -67,17 +71,30 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant ++ * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; ++#else /* !CONFIG_SCHED_BORE */ + unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++#endif /* CONFIG_SCHED_BORE */ + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice ++ * (default min_base_slice = 2000000 constant, units: nanoseconds) ++ * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds + */ ++#ifdef CONFIG_SCHED_BORE ++static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; ++#else /* !CONFIG_SCHED_BORE */ + unsigned int sysctl_sched_base_slice = 700000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; ++#endif /* CONFIG_SCHED_BORE */ + + __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; + +@@ -189,6 
+206,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) + * + * This idea comes from the SD scheduler of Con Kolivas: + */ ++#ifdef CONFIG_SCHED_BORE ++static void update_sysctl(void) { ++ sysctl_sched_base_slice = nsecs_per_tick * ++ max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); ++} ++void sched_update_min_base_slice(void) { update_sysctl(); } ++#else /* !CONFIG_SCHED_BORE */ + static unsigned int get_update_sysctl_factor(void) + { + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); +@@ -219,6 +243,7 @@ static void update_sysctl(void) + SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } ++#endif /* CONFIG_SCHED_BORE */ + + void __init sched_init_granularity(void) + { +@@ -957,7 +982,11 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) + */ + static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) + { ++#ifdef CONFIG_SCHED_BORE ++ u64 slice = sysctl_sched_base_slice; ++#else /* CONFIG_SCHED_BORE */ + u64 slice = normalized_sysctl_sched_base_slice; ++#endif /* CONFIG_SCHED_BORE */ + u64 vprot = se->deadline; + + if (sched_feat(RUN_TO_PARITY)) +@@ -1035,6 +1064,11 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) + curr = NULL; + + if (curr && protect && protect_slice(curr)) ++#ifdef CONFIG_SCHED_BORE ++ if (!static_branch_likely(&sched_bore_key) || ++ !entity_is_task(curr) || ++ !task_of(curr)->bore.futex_waiting) ++#endif /* CONFIG_SCHED_BORE */ + return curr; + + /* Pick the leftmost entity if it's eligible */ +@@ -1096,6 +1130,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + /************************************************************** + * Scheduling class statistics methods: + */ ++#if !defined(CONFIG_SCHED_BORE) + int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); +@@ -1107,6 +1142,7 @@ int sched_update_scaling(void) + + return 0; + } ++#endif /* CONFIG_SCHED_BORE */ + + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); + +@@ -1307,6 +1343,11 @@ static void update_curr(struct cfs_rq *cfs_rq) + resched = update_deadline(cfs_rq, curr); + + if (entity_is_task(curr)) { ++#ifdef CONFIG_SCHED_BORE ++ struct task_struct *p = task_of(curr); ++ update_curr_bore(p, delta_exec); ++#endif /* CONFIG_SCHED_BORE */ ++ + /* + * If the fair_server is active, we need to account for the + * fair_server time whether or not the task is running on +@@ -3843,17 +3884,23 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) + + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); + +-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, ++void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) + { + bool curr = cfs_rq->curr == se; + bool rel_vprot = false; + u64 vprot; ++#ifdef CONFIG_SCHED_BORE ++ s64 vlag_unscaled = 0; ++#endif /* !CONFIG_SCHED_BORE */ + + if (se->on_rq) { + /* commit outstanding execution time */ + update_curr(cfs_rq); + update_entity_lag(cfs_rq, se); ++#ifdef CONFIG_SCHED_BORE ++ vlag_unscaled = se->vlag; ++#endif /* !CONFIG_SCHED_BORE */ + se->deadline -= se->vruntime; + se->rel_deadline = 1; + if (curr && protect_slice(se)) { +@@ -3889,6 +3936,16 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) { ++#ifdef CONFIG_SCHED_BORE ++ if (curr) { ++ se->vruntime += vlag_unscaled - 
se->vlag; ++ if (se->rel_deadline) { ++ se->deadline += se->vruntime; ++ se->rel_deadline = 0; ++ } ++ } ++ else ++#endif /* !CONFIG_SCHED_BORE */ + place_entity(cfs_rq, se, 0); + if (rel_vprot) + se->vprot = se->vruntime + vprot; +@@ -5164,12 +5221,11 @@ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) + static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- u64 vslice, vruntime = avg_vruntime(cfs_rq); ++ u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; +- vslice = calc_delta_fair(se->slice, se); + + /* + * Due to how V is constructed as the weighted average of entities, +@@ -5254,7 +5310,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + se->rel_deadline = 0; + return; + } +- ++#ifdef CONFIG_SCHED_BORE ++ if (static_branch_likely(&sched_bore_key) && ++ entity_is_task(se) && ++ task_of(se)->bore.futex_waiting) ++ goto vslice_found; ++#endif /* !CONFIG_SCHED_BORE */ ++ vslice = calc_delta_fair(se->slice, se); ++#ifdef CONFIG_SCHED_BORE ++ if (static_branch_likely(&sched_bore_key)) ++ vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); ++ else ++#endif /* CONFIG_SCHED_BORE */ + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -5263,6 +5330,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) + vslice /= 2; + ++#ifdef CONFIG_SCHED_BORE ++vslice_found: ++#endif /* CONFIG_SCHED_BORE */ + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ +@@ -5273,7 +5343,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + + static void +-requeue_delayed_entity(struct sched_entity *se); ++requeue_delayed_entity(struct sched_entity *se, int flags); + + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5431,6 +5501,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + if (sched_feat(DELAY_DEQUEUE) && delay && + !entity_eligible(cfs_rq, se)) { + update_load_avg(cfs_rq, se, 0); ++#ifdef CONFIG_SCHED_BORE ++ if (static_branch_likely(&sched_bore_key) && sched_feat(DELAY_ZERO)) ++ update_entity_lag(cfs_rq, se); ++#endif /* CONFIG_SCHED_BORE */ + set_delayed(se); + return false; + } +@@ -6902,7 +6976,7 @@ static int sched_idle_cpu(int cpu) + } + + static void +-requeue_delayed_entity(struct sched_entity *se) ++requeue_delayed_entity(struct sched_entity *se, int flags) + { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +@@ -6915,13 +6989,22 @@ requeue_delayed_entity(struct sched_entity *se) + WARN_ON_ONCE(!se->on_rq); + + if (sched_feat(DELAY_ZERO)) { ++#ifdef CONFIG_SCHED_BORE ++ if (static_branch_likely(&sched_bore_key)) ++ flags |= ENQUEUE_WAKEUP; ++ else { ++#endif /* CONFIG_SCHED_BORE */ ++ flags = 0; + update_entity_lag(cfs_rq, se); ++#ifdef CONFIG_SCHED_BORE ++ } ++#endif /* CONFIG_SCHED_BORE */ + if (se->vlag > 0) { + cfs_rq->nr_queued--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->vlag = 0; +- place_entity(cfs_rq, se, 0); ++ place_entity(cfs_rq, se, flags); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; +@@ -6961,7 +7044,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_enqueue(&rq->cfs, p); + + if (flags & ENQUEUE_DELAYED) { +- 
requeue_delayed_entity(se); ++ requeue_delayed_entity(se, flags); + return; + } + +@@ -6979,7 +7062,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + for_each_sched_entity(se) { + if (se->on_rq) { + if (se->sched_delayed) +- requeue_delayed_entity(se); ++ requeue_delayed_entity(se, flags); + break; + } + cfs_rq = cfs_rq_of(se); +@@ -7186,6 +7269,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); ++#ifdef CONFIG_SCHED_BORE ++ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); ++ struct sched_entity *se = &p->se; ++ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { ++ if (cfs_rq->curr == se) ++ update_curr(cfs_rq); ++ restart_burst_bore(p); ++ } ++#endif /* CONFIG_SCHED_BORE */ + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + +@@ -9097,16 +9189,25 @@ static void yield_task_fair(struct rq *rq) + /* + * Are we the only task in the tree? + */ ++#if !defined(CONFIG_SCHED_BORE) + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); ++#endif /* CONFIG_SCHED_BORE */ + + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ restart_burst_rescale_deadline_bore(curr); ++ if (unlikely(rq->nr_running == 1)) ++ return; ++ ++ clear_buddies(cfs_rq, se); ++#endif /* CONFIG_SCHED_BORE */ + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() +@@ -13586,6 +13687,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + WARN_ON_ONCE(p->se.sched_delayed); + + attach_task_cfs_rq(p); ++#ifdef CONFIG_SCHED_BORE ++ reset_task_bore(p); ++#endif /* CONFIG_SCHED_BORE */ + + set_task_max_allowed_capacity(p); + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1ef9ba480..4b5bbf708 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2219,7 +2219,11 @@ extern int group_balance_cpu(struct sched_group *sg); + extern void update_sched_domain_debugfs(void); + extern void dirty_sched_domain_sysctl(int cpu); + ++#ifdef CONFIG_SCHED_BORE ++extern void sched_update_min_base_slice(void); ++#else /* !CONFIG_SCHED_BORE */ + extern int sched_update_scaling(void); ++#endif /* CONFIG_SCHED_BORE */ + + static inline const struct cpumask *task_user_cpus(struct task_struct *p) + { +@@ -3013,7 +3017,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + extern __read_mostly unsigned int sysctl_sched_nr_migrate; + extern __read_mostly unsigned int sysctl_sched_migration_cost; + ++#ifdef CONFIG_SCHED_BORE ++extern unsigned int sysctl_sched_min_base_slice; ++extern __read_mostly uint sysctl_sched_base_slice; ++#else /* !CONFIG_SCHED_BORE */ + extern unsigned int sysctl_sched_base_slice; ++#endif /* CONFIG_SCHED_BORE */ + + extern int sysctl_resched_latency_warn_ms; + extern int sysctl_resched_latency_warn_once; +-- +2.53.0 + diff --git a/sys-kernel/git-sources/0002-glitched-additional-timer-tick-frequencies.patch b/sys-kernel/gentoo-sources-7.0/0101-glitched-additional-timer-tick-frequencies.patch similarity index 100% rename from sys-kernel/git-sources/0002-glitched-additional-timer-tick-frequencies.patch rename to sys-kernel/gentoo-sources-7.0/0101-glitched-additional-timer-tick-frequencies.patch diff --git a/sys-kernel/git-sources/0001-asus.patch b/sys-kernel/git-sources/0001-asus.patch new file mode 100644 index 0000000..75ef225 --- /dev/null +++ 
b/sys-kernel/git-sources/0001-asus.patch @@ -0,0 +1,6038 @@ +From b5b4f8345dc0d81e7922485af45f5384008db8bf Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 1 Sep 2025 09:38:53 +0800 +Subject: [PATCH 1/4] asus + +Signed-off-by: Eric Naim +--- + .../ABI/testing/sysfs-platform-asus-wmi | 17 + + drivers/hid/Kconfig | 9 + + drivers/hid/Makefile | 1 + + drivers/hid/hid-asus-ally.c | 2197 +++++++++++++++++ + drivers/hid/hid-asus-ally.h | 398 +++ + drivers/hid/hid-asus.c | 29 +- + drivers/hid/hid-asus.h | 13 + + drivers/hid/hid-ids.h | 1 + + drivers/platform/x86/Kconfig | 23 + + drivers/platform/x86/Makefile | 1 + + drivers/platform/x86/asus-armoury.c | 1174 +++++++++ + drivers/platform/x86/asus-armoury.h | 1278 ++++++++++ + drivers/platform/x86/asus-wmi.c | 171 +- + include/linux/platform_data/x86/asus-wmi.h | 22 + + 14 files changed, 5293 insertions(+), 41 deletions(-) + create mode 100644 drivers/hid/hid-asus-ally.c + create mode 100644 drivers/hid/hid-asus-ally.h + create mode 100644 drivers/hid/hid-asus.h + create mode 100644 drivers/platform/x86/asus-armoury.c + create mode 100644 drivers/platform/x86/asus-armoury.h + +diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi +index 28144371a0f1..765d50b0d9df 100644 +--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi ++++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi +@@ -63,6 +63,7 @@ Date: Aug 2022 + KernelVersion: 6.1 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Switch the GPU hardware MUX mode. Laptops with this feature can + can be toggled to boot with only the dGPU (discrete mode) or in + standard Optimus/Hybrid mode. On switch a reboot is required: +@@ -75,6 +76,7 @@ Date: Aug 2022 + KernelVersion: 5.17 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Disable discrete GPU: + * 0 - Enable dGPU, + * 1 - Disable dGPU +@@ -84,6 +86,7 @@ Date: Aug 2022 + KernelVersion: 5.17 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Enable the external GPU paired with ROG X-Flow laptops. + Toggling this setting will also trigger ACPI to disable the dGPU: + +@@ -95,6 +98,7 @@ Date: Aug 2022 + KernelVersion: 5.17 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Enable an LCD response-time boost to reduce or remove ghosting: + * 0 - Disable, + * 1 - Enable +@@ -104,6 +108,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Get the current charging mode being used: + * 1 - Barrel connected charger, + * 2 - USB-C charging +@@ -114,6 +119,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Show if the egpu (XG Mobile) is correctly connected: + * 0 - False, + * 1 - True +@@ -123,6 +129,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Change the mini-LED mode: + * 0 - Single-zone, + * 1 - Multi-zone +@@ -133,6 +140,7 @@ Date: Apr 2024 + KernelVersion: 6.10 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + List the available mini-led modes. + + What: /sys/devices/platform//ppt_pl1_spl +@@ -140,6 +148,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the Package Power Target total of CPU: PL1 on Intel, SPL on AMD. 
+ Shown on Intel+Nvidia or AMD+Nvidia based systems: + +@@ -150,6 +159,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the Slow Package Power Tracking Limit of CPU: PL2 on Intel, SPPT, + on AMD. Shown on Intel+Nvidia or AMD+Nvidia based systems: + +@@ -160,6 +170,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the Fast Package Power Tracking Limit of CPU. AMD+Nvidia only: + * min=5, max=250 + +@@ -168,6 +179,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the APU SPPT limit. Shown on full AMD systems only: + * min=5, max=130 + +@@ -176,6 +188,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the platform SPPT limit. Shown on full AMD systems only: + * min=5, max=130 + +@@ -184,6 +197,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the dynamic boost limit of the Nvidia dGPU: + * min=5, max=25 + +@@ -192,6 +206,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the target temperature limit of the Nvidia dGPU: + * min=75, max=87 + +@@ -200,6 +215,7 @@ Date: Apr 2024 + KernelVersion: 6.10 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set if the BIOS POST sound is played on boot. + * 0 - False, + * 1 - True +@@ -209,6 +225,7 @@ Date: Apr 2024 + KernelVersion: 6.10 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set if the MCU can go in to low-power mode on system sleep + * 0 - False, + * 1 - True +diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig +index 79997553d8f9..d3147e48a8f1 100644 +--- a/drivers/hid/Kconfig ++++ b/drivers/hid/Kconfig +@@ -191,6 +191,15 @@ config HID_ASUS + - GL553V series + - GL753V series + ++config HID_ASUS_ALLY ++ tristate "Asus Ally gamepad configuration support" ++ depends on USB_HID ++ depends on LEDS_CLASS ++ depends on LEDS_CLASS_MULTICOLOR ++ select POWER_SUPPLY ++ help ++ Support for configuring the Asus ROG Ally gamepad using attributes. 
++ + config HID_AUREAL + tristate "Aureal" + help +diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile +index 10ae5dedbd84..958f67193c85 100644 +--- a/drivers/hid/Makefile ++++ b/drivers/hid/Makefile +@@ -33,6 +33,7 @@ obj-$(CONFIG_HID_APPLETB_BL) += hid-appletb-bl.o + obj-$(CONFIG_HID_APPLETB_KBD) += hid-appletb-kbd.o + obj-$(CONFIG_HID_CREATIVE_SB0540) += hid-creative-sb0540.o + obj-$(CONFIG_HID_ASUS) += hid-asus.o ++obj-$(CONFIG_HID_ASUS_ALLY) += hid-asus-ally.o + obj-$(CONFIG_HID_AUREAL) += hid-aureal.o + obj-$(CONFIG_HID_BELKIN) += hid-belkin.o + obj-$(CONFIG_HID_BETOP_FF) += hid-betopff.o +diff --git a/drivers/hid/hid-asus-ally.c b/drivers/hid/hid-asus-ally.c +new file mode 100644 +index 000000000000..e78625f70c44 +--- /dev/null ++++ b/drivers/hid/hid-asus-ally.c +@@ -0,0 +1,2197 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * HID driver for Asus ROG laptops and Ally ++ * ++ * Copyright (c) 2023 Luke Jones ++ */ ++ ++#include "linux/compiler_attributes.h" ++#include "linux/device.h" ++#include ++#include ++#include "linux/pm.h" ++#include "linux/printk.h" ++#include "linux/slab.h" ++#include ++#include ++#include ++#include ++#include ++ ++#include "hid-ids.h" ++#include "hid-asus.h" ++#include "hid-asus-ally.h" ++ ++#define DEBUG ++ ++#define READY_MAX_TRIES 3 ++#define FEATURE_REPORT_ID 0x0d ++#define FEATURE_ROG_ALLY_REPORT_ID 0x5a ++#define FEATURE_ROG_ALLY_CODE_PAGE 0xD1 ++#define FEATURE_ROG_ALLY_REPORT_SIZE 64 ++#define ALLY_X_INPUT_REPORT_USB 0x0B ++#define ALLY_X_INPUT_REPORT_USB_SIZE 16 ++ ++#define ROG_ALLY_REPORT_SIZE 64 ++#define ROG_ALLY_X_MIN_MCU 313 ++#define ROG_ALLY_MIN_MCU 319 ++ ++#define FEATURE_KBD_LED_REPORT_ID1 0x5d ++#define FEATURE_KBD_LED_REPORT_ID2 0x5e ++ ++#define BTN_DATA_LEN 11; ++#define BTN_CODE_BYTES_LEN 8 ++ ++static const u8 EC_INIT_STRING[] = { 0x5A, 'A', 'S', 'U', 'S', ' ', 'T', 'e','c', 'h', '.', 'I', 'n', 'c', '.', '\0' }; ++static const u8 EC_MODE_LED_APPLY[] = { 0x5A, 0xB4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; ++static const u8 EC_MODE_LED_SET[] = { 0x5A, 0xB5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; ++static const u8 FORCE_FEEDBACK_OFF[] = { 0x0D, 0x0F, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xEB }; ++ ++static const struct hid_device_id rog_ally_devices[] = { ++ { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X) }, ++ {} ++}; ++ ++struct btn_code_map { ++ u64 code; ++ const char *name; ++}; ++ ++static const struct btn_code_map ally_btn_codes[] = { ++ { 0, "NONE" }, ++ /* Gamepad button codes */ ++ { BTN_PAD_A, "PAD_A" }, ++ { BTN_PAD_B, "PAD_B" }, ++ { BTN_PAD_X, "PAD_X" }, ++ { BTN_PAD_Y, "PAD_Y" }, ++ { BTN_PAD_LB, "PAD_LB" }, ++ { BTN_PAD_RB, "PAD_RB" }, ++ { BTN_PAD_LS, "PAD_LS" }, ++ { BTN_PAD_RS, "PAD_RS" }, ++ { BTN_PAD_DPAD_UP, "PAD_DPAD_UP" }, ++ { BTN_PAD_DPAD_DOWN, "PAD_DPAD_DOWN" }, ++ { BTN_PAD_DPAD_LEFT, "PAD_DPAD_LEFT" }, ++ { BTN_PAD_DPAD_RIGHT, "PAD_DPAD_RIGHT" }, ++ { BTN_PAD_VIEW, "PAD_VIEW" }, ++ { BTN_PAD_MENU, "PAD_MENU" }, ++ { BTN_PAD_XBOX, "PAD_XBOX" }, ++ ++ /* Triggers mapped to keyboard codes */ ++ { BTN_KB_M2, "KB_M2" }, ++ { BTN_KB_M1, "KB_M1" }, ++ { BTN_KB_ESC, "KB_ESC" }, ++ { BTN_KB_F1, "KB_F1" }, ++ { BTN_KB_F2, "KB_F2" }, ++ { BTN_KB_F3, "KB_F3" }, ++ { BTN_KB_F4, "KB_F4" }, ++ { BTN_KB_F5, "KB_F5" }, ++ { BTN_KB_F6, "KB_F6" }, ++ { BTN_KB_F7, "KB_F7" }, ++ { BTN_KB_F8, "KB_F8" }, ++ { BTN_KB_F9, "KB_F9" }, ++ { BTN_KB_F10, "KB_F10" }, ++ { BTN_KB_F11, "KB_F11" }, 
++ { BTN_KB_F12, "KB_F12" }, ++ { BTN_KB_F14, "KB_F14" }, ++ { BTN_KB_F15, "KB_F15" }, ++ { BTN_KB_BACKTICK, "KB_BACKTICK" }, ++ { BTN_KB_1, "KB_1" }, ++ { BTN_KB_2, "KB_2" }, ++ { BTN_KB_3, "KB_3" }, ++ { BTN_KB_4, "KB_4" }, ++ { BTN_KB_5, "KB_5" }, ++ { BTN_KB_6, "KB_6" }, ++ { BTN_KB_7, "KB_7" }, ++ { BTN_KB_8, "KB_8" }, ++ { BTN_KB_9, "KB_9" }, ++ { BTN_KB_0, "KB_0" }, ++ { BTN_KB_HYPHEN, "KB_HYPHEN" }, ++ { BTN_KB_EQUALS, "KB_EQUALS" }, ++ { BTN_KB_BACKSPACE, "KB_BACKSPACE" }, ++ { BTN_KB_TAB, "KB_TAB" }, ++ { BTN_KB_Q, "KB_Q" }, ++ { BTN_KB_W, "KB_W" }, ++ { BTN_KB_E, "KB_E" }, ++ { BTN_KB_R, "KB_R" }, ++ { BTN_KB_T, "KB_T" }, ++ { BTN_KB_Y, "KB_Y" }, ++ { BTN_KB_U, "KB_U" }, ++ { BTN_KB_O, "KB_O" }, ++ { BTN_KB_P, "KB_P" }, ++ { BTN_KB_LBRACKET, "KB_LBRACKET" }, ++ { BTN_KB_RBRACKET, "KB_RBRACKET" }, ++ { BTN_KB_BACKSLASH, "KB_BACKSLASH" }, ++ { BTN_KB_CAPS, "KB_CAPS" }, ++ { BTN_KB_A, "KB_A" }, ++ { BTN_KB_S, "KB_S" }, ++ { BTN_KB_D, "KB_D" }, ++ { BTN_KB_F, "KB_F" }, ++ { BTN_KB_G, "KB_G" }, ++ { BTN_KB_H, "KB_H" }, ++ { BTN_KB_J, "KB_J" }, ++ { BTN_KB_K, "KB_K" }, ++ { BTN_KB_L, "KB_L" }, ++ { BTN_KB_SEMI, "KB_SEMI" }, ++ { BTN_KB_QUOTE, "KB_QUOTE" }, ++ { BTN_KB_RET, "KB_RET" }, ++ { BTN_KB_LSHIFT, "KB_LSHIFT" }, ++ { BTN_KB_Z, "KB_Z" }, ++ { BTN_KB_X, "KB_X" }, ++ { BTN_KB_C, "KB_C" }, ++ { BTN_KB_V, "KB_V" }, ++ { BTN_KB_B, "KB_B" }, ++ { BTN_KB_N, "KB_N" }, ++ { BTN_KB_M, "KB_M" }, ++ { BTN_KB_COMMA, "KB_COMMA" }, ++ { BTN_KB_PERIOD, "KB_PERIOD" }, ++ { BTN_KB_RSHIFT, "KB_RSHIFT" }, ++ { BTN_KB_LCTL, "KB_LCTL" }, ++ { BTN_KB_META, "KB_META" }, ++ { BTN_KB_LALT, "KB_LALT" }, ++ { BTN_KB_SPACE, "KB_SPACE" }, ++ { BTN_KB_RALT, "KB_RALT" }, ++ { BTN_KB_MENU, "KB_MENU" }, ++ { BTN_KB_RCTL, "KB_RCTL" }, ++ { BTN_KB_PRNTSCN, "KB_PRNTSCN" }, ++ { BTN_KB_SCRLCK, "KB_SCRLCK" }, ++ { BTN_KB_PAUSE, "KB_PAUSE" }, ++ { BTN_KB_INS, "KB_INS" }, ++ { BTN_KB_HOME, "KB_HOME" }, ++ { BTN_KB_PGUP, "KB_PGUP" }, ++ { BTN_KB_DEL, "KB_DEL" }, ++ { BTN_KB_END, "KB_END" }, ++ { BTN_KB_PGDWN, "KB_PGDWN" }, ++ { BTN_KB_UP_ARROW, "KB_UP_ARROW" }, ++ { BTN_KB_DOWN_ARROW, "KB_DOWN_ARROW" }, ++ { BTN_KB_LEFT_ARROW, "KB_LEFT_ARROW" }, ++ { BTN_KB_RIGHT_ARROW, "KB_RIGHT_ARROW" }, ++ ++ /* Numpad mappings */ ++ { BTN_NUMPAD_LOCK, "NUMPAD_LOCK" }, ++ { BTN_NUMPAD_FWDSLASH, "NUMPAD_FWDSLASH" }, ++ { BTN_NUMPAD_ASTERISK, "NUMPAD_ASTERISK" }, ++ { BTN_NUMPAD_HYPHEN, "NUMPAD_HYPHEN" }, ++ { BTN_NUMPAD_0, "NUMPAD_0" }, ++ { BTN_NUMPAD_1, "NUMPAD_1" }, ++ { BTN_NUMPAD_2, "NUMPAD_2" }, ++ { BTN_NUMPAD_3, "NUMPAD_3" }, ++ { BTN_NUMPAD_4, "NUMPAD_4" }, ++ { BTN_NUMPAD_5, "NUMPAD_5" }, ++ { BTN_NUMPAD_6, "NUMPAD_6" }, ++ { BTN_NUMPAD_7, "NUMPAD_7" }, ++ { BTN_NUMPAD_8, "NUMPAD_8" }, ++ { BTN_NUMPAD_9, "NUMPAD_9" }, ++ { BTN_NUMPAD_PLUS, "NUMPAD_PLUS" }, ++ { BTN_NUMPAD_ENTER, "NUMPAD_ENTER" }, ++ { BTN_NUMPAD_PERIOD, "NUMPAD_PERIOD" }, ++ ++ /* Mouse mappings */ ++ { BTN_MOUSE_LCLICK, "MOUSE_LCLICK" }, ++ { BTN_MOUSE_RCLICK, "MOUSE_RCLICK" }, ++ { BTN_MOUSE_MCLICK, "MOUSE_MCLICK" }, ++ { BTN_MOUSE_WHEEL_UP, "MOUSE_WHEEL_UP" }, ++ { BTN_MOUSE_WHEEL_DOWN, "MOUSE_WHEEL_DOWN" }, ++ ++ /* Media mappings */ ++ { BTN_MEDIA_SCREENSHOT, "MEDIA_SCREENSHOT" }, ++ { BTN_MEDIA_SHOW_KEYBOARD, "MEDIA_SHOW_KEYBOARD" }, ++ { BTN_MEDIA_SHOW_DESKTOP, "MEDIA_SHOW_DESKTOP" }, ++ { BTN_MEDIA_START_RECORDING, "MEDIA_START_RECORDING" }, ++ { BTN_MEDIA_MIC_OFF, "MEDIA_MIC_OFF" }, ++ { BTN_MEDIA_VOL_DOWN, "MEDIA_VOL_DOWN" }, ++ { BTN_MEDIA_VOL_UP, "MEDIA_VOL_UP" }, ++}; ++static const size_t keymap_len = ARRAY_SIZE(ally_btn_codes); ++ ++/* 
byte_array must be >= 8 in length */ ++static void btn_code_to_byte_array(u64 keycode, u8 *byte_array) ++{ ++ /* Convert the u64 to bytes[8] */ ++ for (int i = 0; i < 8; ++i) { ++ byte_array[i] = (keycode >> (56 - 8 * i)) & 0xFF; ++ } ++} ++ ++static u64 name_to_btn(const char *name) ++{ ++ int len = strcspn(name, "\n"); ++ for (size_t i = 0; i < keymap_len; ++i) { ++ if (strncmp(ally_btn_codes[i].name, name, len) == 0) { ++ return ally_btn_codes[i].code; ++ } ++ } ++ return -EINVAL; ++} ++ ++static const char* btn_to_name(u64 key) ++{ ++ for (size_t i = 0; i < keymap_len; ++i) { ++ if (ally_btn_codes[i].code == key) { ++ return ally_btn_codes[i].name; ++ } ++ } ++ return NULL; ++} ++ ++struct btn_data { ++ u64 button; ++ u64 macro; ++ bool turbo; ++}; ++ ++struct btn_mapping { ++ struct btn_data btn_a; ++ struct btn_data btn_b; ++ struct btn_data btn_x; ++ struct btn_data btn_y; ++ struct btn_data btn_lb; ++ struct btn_data btn_rb; ++ struct btn_data btn_ls; ++ struct btn_data btn_rs; ++ struct btn_data btn_lt; ++ struct btn_data btn_rt; ++ struct btn_data dpad_up; ++ struct btn_data dpad_down; ++ struct btn_data dpad_left; ++ struct btn_data dpad_right; ++ struct btn_data btn_view; ++ struct btn_data btn_menu; ++ struct btn_data btn_m1; ++ struct btn_data btn_m2; ++}; ++ ++struct deadzone { ++ u8 inner; ++ u8 outer; ++}; ++ ++struct response_curve { ++ uint8_t move_pct_1; ++ uint8_t response_pct_1; ++ uint8_t move_pct_2; ++ uint8_t response_pct_2; ++ uint8_t move_pct_3; ++ uint8_t response_pct_3; ++ uint8_t move_pct_4; ++ uint8_t response_pct_4; ++} __packed; ++ ++struct js_axis_calibrations { ++ uint16_t left_y_stable; ++ uint16_t left_y_min; ++ uint16_t left_y_max; ++ uint16_t left_x_stable; ++ uint16_t left_x_min; ++ uint16_t left_x_max; ++ uint16_t right_y_stable; ++ uint16_t right_y_min; ++ uint16_t right_y_max; ++ uint16_t right_x_stable; ++ uint16_t right_x_min; ++ uint16_t right_x_max; ++} __packed; ++ ++struct tr_axis_calibrations { ++ uint16_t left_stable; ++ uint16_t left_max; ++ uint16_t right_stable; ++ uint16_t right_max; ++} __packed; ++ ++/* ROG Ally has many settings related to the gamepad, all using the same n-key endpoint */ ++struct ally_gamepad_cfg { ++ struct hid_device *hdev; ++ struct input_dev *input; ++ ++ enum xpad_mode mode; ++ /* ++ * index: [mode] ++ */ ++ struct btn_mapping key_mapping[xpad_mode_mouse]; ++ /* ++ * index: left, right ++ * max: 64 ++ */ ++ u8 vibration_intensity[2]; ++ ++ /* deadzones */ ++ struct deadzone ls_dz; // left stick ++ struct deadzone rs_dz; // right stick ++ struct deadzone lt_dz; // left trigger ++ struct deadzone rt_dz; // right trigger ++ /* anti-deadzones */ ++ u8 ls_adz; // left stick ++ u8 rs_adz; // right stick ++ /* joystick response curves */ ++ struct response_curve ls_rc; ++ struct response_curve rs_rc; ++ ++ struct js_axis_calibrations js_cal; ++ struct tr_axis_calibrations tr_cal; ++}; ++ ++/* The hatswitch outputs integers, we use them to index this X|Y pair */ ++static const int hat_values[][2] = { ++ { 0, 0 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, ++ { 0, 1 }, { -1, 1 }, { -1, 0 }, { -1, -1 }, ++}; ++ ++/* rumble packet structure */ ++struct ff_data { ++ u8 enable; ++ u8 magnitude_left; ++ u8 magnitude_right; ++ u8 magnitude_strong; ++ u8 magnitude_weak; ++ u8 pulse_sustain_10ms; ++ u8 pulse_release_10ms; ++ u8 loop_count; ++} __packed; ++ ++struct ff_report { ++ u8 report_id; ++ struct ff_data ff; ++} __packed; ++ ++struct ally_x_input_report { ++ uint16_t x, y; ++ uint16_t rx, ry; ++ uint16_t z, rz; ++ 
uint8_t buttons[4]; ++} __packed; ++ ++struct ally_x_device { ++ struct input_dev *input; ++ struct hid_device *hdev; ++ spinlock_t lock; ++ ++ struct ff_report *ff_packet; ++ struct work_struct output_worker; ++ bool output_worker_initialized; ++ /* Prevent multiple queued event due to the enforced delay in worker */ ++ bool update_qam_btn; ++ /* Set if the QAM and AC buttons emit Xbox and Xbox+A */ ++ bool qam_btns_steam_mode; ++ bool update_ff; ++}; ++ ++struct ally_rgb_dev { ++ struct hid_device *hdev; ++ struct led_classdev_mc led_rgb_dev; ++ struct work_struct work; ++ bool output_worker_initialized; ++ spinlock_t lock; ++ ++ bool removed; ++ bool update_rgb; ++ uint8_t red[4]; ++ uint8_t green[4]; ++ uint8_t blue[4]; ++}; ++ ++struct ally_rgb_data { ++ uint8_t brightness; ++ uint8_t red[4]; ++ uint8_t green[4]; ++ uint8_t blue[4]; ++ bool initialized; ++}; ++ ++static struct ally_drvdata { ++ struct hid_device *hdev; ++ struct ally_x_device *ally_x; ++ struct ally_gamepad_cfg *gamepad_cfg; ++ struct ally_rgb_dev *led_rgb_dev; ++ struct ally_rgb_data led_rgb_data; ++ uint mcu_version; ++} drvdata; ++ ++static void reverse_bytes_in_pairs(u8 *buf, size_t size) { ++ uint16_t *word_ptr; ++ size_t i; ++ ++ for (i = 0; i < size; i += 2) { ++ if (i + 1 < size) { ++ word_ptr = (uint16_t *)&buf[i]; ++ *word_ptr = cpu_to_be16(*word_ptr); ++ } ++ } ++} ++ ++/** ++ * asus_dev_set_report - send set report request to device. ++ * ++ * @hdev: hid device ++ * @buf: in/out data to transfer ++ * @len: length of buf ++ * ++ * Return: count of data transferred, negative if error ++ * ++ * Same behavior as hid_hw_raw_request. Note that the input buffer is duplicated. ++ */ ++static int asus_dev_set_report(struct hid_device *hdev, const u8 *buf, size_t len) ++{ ++ unsigned char *dmabuf; ++ int ret; ++ ++ dmabuf = kmemdup(buf, len, GFP_KERNEL); ++ if (!dmabuf) ++ return -ENOMEM; ++ ++ ret = hid_hw_raw_request(hdev, buf[0], dmabuf, len, HID_FEATURE_REPORT, ++ HID_REQ_SET_REPORT); ++ kfree(dmabuf); ++ ++ return ret; ++} ++ ++/** ++ * asus_dev_get_report - send get report request to device. ++ * ++ * @hdev: hid device ++ * @out: buffer to write output data in to ++ * @len: length the output buffer provided ++ * ++ * Return: count of data transferred, negative if error ++ * ++ * Same behavior as hid_hw_raw_request. 
++ */ ++static int asus_dev_get_report(struct hid_device *hdev, u8 *out, size_t len) ++{ ++ return hid_hw_raw_request(hdev, FEATURE_REPORT_ID, out, len, ++ HID_FEATURE_REPORT, HID_REQ_GET_REPORT); ++} ++ ++static u8 get_endpoint_address(struct hid_device *hdev) ++{ ++ struct usb_interface *intf; ++ struct usb_host_endpoint *ep; ++ ++ intf = to_usb_interface(hdev->dev.parent); ++ ++ if (intf) { ++ ep = intf->cur_altsetting->endpoint; ++ if (ep) { ++ return ep->desc.bEndpointAddress; ++ } ++ } ++ ++ return -ENODEV; ++} ++ ++/**************************************************************************************************/ ++/* ROG Ally gamepad configuration */ ++/**************************************************************************************************/ ++ ++/* This should be called before any attempts to set device functions */ ++static int ally_gamepad_check_ready(struct hid_device *hdev) ++{ ++ int ret, count; ++ u8 *hidbuf; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ ret = 0; ++ for (count = 0; count < READY_MAX_TRIES; count++) { ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_check_ready; ++ hidbuf[3] = 01; ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ hid_dbg(hdev, "ROG Ally check failed set report: %d\n", ret); ++ ++ hidbuf[0] = hidbuf[1] = hidbuf[2] = hidbuf[3] = 0; ++ ret = asus_dev_get_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ hid_dbg(hdev, "ROG Ally check failed get report: %d\n", ret); ++ ++ ret = hidbuf[2] == xpad_cmd_check_ready; ++ if (ret) ++ break; ++ usleep_range( ++ 1000, ++ 2000); /* don't spam the entire loop in less than USB response time */ ++ } ++ ++ if (count == READY_MAX_TRIES) ++ hid_warn(hdev, "ROG Ally never responded with a ready\n"); ++ ++ kfree(hidbuf); ++ return ret; ++} ++ ++/* VIBRATION INTENSITY ****************************************************************************/ ++static ssize_t gamepad_vibration_intensity_index_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "left right\n"); ++} ++ ++ALLY_DEVICE_ATTR_RO(gamepad_vibration_intensity_index, vibration_intensity_index); ++ ++static ssize_t _gamepad_apply_intensity(struct hid_device *hdev, ++ struct ally_gamepad_cfg *ally_cfg) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_vibe_intensity; ++ hidbuf[3] = xpad_cmd_len_vibe_intensity; ++ hidbuf[4] = ally_cfg->vibration_intensity[0]; ++ hidbuf[5] = ally_cfg->vibration_intensity[1]; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto report_fail; ++ ++report_fail: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static ssize_t gamepad_vibration_intensity_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ return sysfs_emit( ++ buf, "%d %d\n", ++ ally_cfg->vibration_intensity[0], ++ ally_cfg->vibration_intensity[1]); ++} ++ ++static ssize_t gamepad_vibration_intensity_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, 
++ size_t count) ++{ ++ struct hid_device *hdev = to_hid_device(dev); ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ u32 left, right; ++ int ret; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ if (sscanf(buf, "%d %d", &left, &right) != 2) ++ return -EINVAL; ++ ++ if (left > 64 || right > 64) ++ return -EINVAL; ++ ++ ally_cfg->vibration_intensity[0] = left; ++ ally_cfg->vibration_intensity[1] = right; ++ ++ ret = _gamepad_apply_intensity(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ ++ALLY_DEVICE_ATTR_RW(gamepad_vibration_intensity, vibration_intensity); ++ ++/* ANALOGUE DEADZONES *****************************************************************************/ ++static ssize_t _gamepad_apply_deadzones(struct hid_device *hdev, ++ struct ally_gamepad_cfg *ally_cfg) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ return ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_js_dz; ++ hidbuf[3] = xpad_cmd_len_deadzone; ++ hidbuf[4] = ally_cfg->ls_dz.inner; ++ hidbuf[5] = ally_cfg->ls_dz.outer; ++ hidbuf[6] = ally_cfg->rs_dz.inner; ++ hidbuf[7] = ally_cfg->rs_dz.outer; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto end; ++ ++ hidbuf[2] = xpad_cmd_set_tr_dz; ++ hidbuf[4] = ally_cfg->lt_dz.inner; ++ hidbuf[5] = ally_cfg->lt_dz.outer; ++ hidbuf[6] = ally_cfg->rt_dz.inner; ++ hidbuf[7] = ally_cfg->rt_dz.outer; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto end; ++ ++end: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static void _gamepad_set_deadzones_default(struct ally_gamepad_cfg *ally_cfg) ++{ ++ ally_cfg->ls_dz.inner = 0x00; ++ ally_cfg->ls_dz.outer = 0x64; ++ ally_cfg->rs_dz.inner = 0x00; ++ ally_cfg->rs_dz.outer = 0x64; ++ ally_cfg->lt_dz.inner = 0x00; ++ ally_cfg->lt_dz.outer = 0x64; ++ ally_cfg->rt_dz.inner = 0x00; ++ ally_cfg->rt_dz.outer = 0x64; ++} ++ ++static ssize_t axis_xyz_deadzone_index_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return sysfs_emit(buf, "inner outer\n"); ++} ++ ++ALLY_DEVICE_ATTR_RO(axis_xyz_deadzone_index, deadzone_index); ++ ++ALLY_DEADZONES(axis_xy_left, ls_dz); ++ALLY_DEADZONES(axis_xy_right, rs_dz); ++ALLY_DEADZONES(axis_z_left, lt_dz); ++ALLY_DEADZONES(axis_z_right, rt_dz); ++ ++/* ANTI-DEADZONES *********************************************************************************/ ++static ssize_t _gamepad_apply_js_ADZ(struct hid_device *hdev, ++ struct ally_gamepad_cfg *ally_cfg) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_adz; ++ hidbuf[3] = xpad_cmd_len_adz; ++ hidbuf[4] = ally_cfg->ls_adz; ++ hidbuf[5] = ally_cfg->rs_adz; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto report_fail; ++ ++report_fail: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static void _gamepad_set_anti_deadzones_default(struct ally_gamepad_cfg *ally_cfg) ++{ ++ ally_cfg->ls_adz = 0x00; ++ ally_cfg->rs_adz = 0x00; ++} ++ ++static ssize_t _gamepad_js_ADZ_store(struct 
device *dev, const char *buf, u8 *adz) ++{ ++ int ret, val; ++ ++ ret = kstrtoint(buf, 0, &val); ++ if (ret) ++ return ret; ++ ++ if (val < 0 || val > 32) ++ return -EINVAL; ++ ++ *adz = val; ++ ++ return ret; ++} ++ ++static ssize_t axis_xy_left_anti_deadzone_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ return sysfs_emit(buf, "%d\n", ally_cfg->ls_adz); ++} ++ ++static ssize_t axis_xy_left_anti_deadzone_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ int ret; ++ ++ ret = _gamepad_js_ADZ_store(dev, buf, &ally_cfg->ls_adz); ++ if (ret) ++ return ret; ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_RW(axis_xy_left_anti_deadzone, anti_deadzone); ++ ++static ssize_t axis_xy_right_anti_deadzone_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ return sysfs_emit(buf, "%d\n", ally_cfg->rs_adz); ++} ++ ++static ssize_t axis_xy_right_anti_deadzone_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ int ret; ++ ++ ret = _gamepad_js_ADZ_store(dev, buf, &ally_cfg->rs_adz); ++ if (ret) ++ return ret; ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_RW(axis_xy_right_anti_deadzone, anti_deadzone); ++ ++/* JS RESPONSE CURVES *****************************************************************************/ ++static void _gamepad_set_js_response_curves_default(struct ally_gamepad_cfg *ally_cfg) ++{ ++ struct response_curve *js1_rc = &ally_cfg->ls_rc; ++ struct response_curve *js2_rc = &ally_cfg->rs_rc; ++ js1_rc->move_pct_1 = js2_rc->move_pct_1 = 0x16; // 25% ++ js1_rc->move_pct_2 = js2_rc->move_pct_2 = 0x32; // 50% ++ js1_rc->move_pct_3 = js2_rc->move_pct_3 = 0x48; // 75% ++ js1_rc->move_pct_4 = js2_rc->move_pct_4 = 0x64; // 100% ++ js1_rc->response_pct_1 = js2_rc->response_pct_1 = 0x16; ++ js1_rc->response_pct_2 = js2_rc->response_pct_2 = 0x32; ++ js1_rc->response_pct_3 = js2_rc->response_pct_3 = 0x48; ++ js1_rc->response_pct_4 = js2_rc->response_pct_4 = 0x64; ++} ++ ++static ssize_t _gamepad_apply_response_curves(struct hid_device *hdev, ++ struct ally_gamepad_cfg *ally_cfg) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ memcpy(&hidbuf[2], &ally_cfg->ls_rc, sizeof(ally_cfg->ls_rc)); ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ hidbuf[4] = 0x02; ++ memcpy(&hidbuf[5], &ally_cfg->rs_rc, sizeof(ally_cfg->rs_rc)); ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto report_fail; ++ ++report_fail: ++ kfree(hidbuf); ++ return ret; ++} ++ ++ALLY_JS_RC_POINT(axis_xy_left, move, 1); ++ALLY_JS_RC_POINT(axis_xy_left, move, 2); ++ALLY_JS_RC_POINT(axis_xy_left, move, 3); ++ALLY_JS_RC_POINT(axis_xy_left, move, 4); ++ALLY_JS_RC_POINT(axis_xy_left, response, 1); ++ALLY_JS_RC_POINT(axis_xy_left, response, 2); ++ALLY_JS_RC_POINT(axis_xy_left, response, 3); ++ALLY_JS_RC_POINT(axis_xy_left, response, 4); ++ ++ALLY_JS_RC_POINT(axis_xy_right, move, 1); ++ALLY_JS_RC_POINT(axis_xy_right, move, 2); 
++ALLY_JS_RC_POINT(axis_xy_right, move, 3); ++ALLY_JS_RC_POINT(axis_xy_right, move, 4); ++ALLY_JS_RC_POINT(axis_xy_right, response, 1); ++ALLY_JS_RC_POINT(axis_xy_right, response, 2); ++ALLY_JS_RC_POINT(axis_xy_right, response, 3); ++ALLY_JS_RC_POINT(axis_xy_right, response, 4); ++ ++/* CALIBRATIONS ***********************************************************************************/ ++static int gamepad_get_calibration(struct hid_device *hdev) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ u8 *hidbuf; ++ int ret, i; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ for (i = 0; i < 2; i++) { ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = 0xD0; ++ hidbuf[2] = 0x03; ++ hidbuf[3] = i + 1; // 0x01 JS, 0x02 TR ++ hidbuf[4] = 0x20; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) { ++ hid_warn(hdev, "ROG Ally check failed set report: %d\n", ret); ++ goto cleanup; ++ } ++ ++ memset(hidbuf, 0, FEATURE_ROG_ALLY_REPORT_SIZE); ++ ret = asus_dev_get_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0 || hidbuf[5] != 1) { ++ hid_warn(hdev, "ROG Ally check failed get report: %d\n", ret); ++ goto cleanup; ++ } ++ ++ if (i == 0) { ++ /* Joystick calibration */ ++ reverse_bytes_in_pairs(&hidbuf[6], sizeof(struct js_axis_calibrations)); ++ ally_cfg->js_cal = *(struct js_axis_calibrations *)&hidbuf[6]; ++ print_hex_dump(KERN_INFO, "HID Buffer JS: ", DUMP_PREFIX_OFFSET, 16, 1, hidbuf, 32, true); ++ struct js_axis_calibrations *cal = &drvdata.gamepad_cfg->js_cal; ++ pr_err("LS_CAL: X: %d, Min: %d, Max: %d", cal->left_x_stable, cal->left_x_min, cal->left_x_max); ++ pr_err("LS_CAL: Y: %d, Min: %d, Max: %d", cal->left_y_stable, cal->left_y_min, cal->left_y_max); ++ pr_err("RS_CAL: X: %d, Min: %d, Max: %d", cal->right_x_stable, cal->right_x_min, cal->right_x_max); ++ pr_err("RS_CAL: Y: %d, Min: %d, Max: %d", cal->right_y_stable, cal->right_y_min, cal->right_y_max); ++ } else { ++ /* Trigger calibration */ ++ reverse_bytes_in_pairs(&hidbuf[6], sizeof(struct tr_axis_calibrations)); ++ ally_cfg->tr_cal = *(struct tr_axis_calibrations *)&hidbuf[6]; ++ print_hex_dump(KERN_INFO, "HID Buffer TR: ", DUMP_PREFIX_OFFSET, 16, 1, hidbuf, 32, true); ++ } ++ } ++ ++cleanup: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static struct attribute *axis_xy_left_attrs[] = { ++ &dev_attr_axis_xy_left_anti_deadzone.attr, ++ &dev_attr_axis_xy_left_deadzone.attr, ++ &dev_attr_axis_xyz_deadzone_index.attr, ++ &dev_attr_axis_xy_left_move_1.attr, ++ &dev_attr_axis_xy_left_move_2.attr, ++ &dev_attr_axis_xy_left_move_3.attr, ++ &dev_attr_axis_xy_left_move_4.attr, ++ &dev_attr_axis_xy_left_response_1.attr, ++ &dev_attr_axis_xy_left_response_2.attr, ++ &dev_attr_axis_xy_left_response_3.attr, ++ &dev_attr_axis_xy_left_response_4.attr, ++ NULL ++}; ++static const struct attribute_group axis_xy_left_attr_group = { ++ .name = "axis_xy_left", ++ .attrs = axis_xy_left_attrs, ++}; ++ ++static struct attribute *axis_xy_right_attrs[] = { ++ &dev_attr_axis_xy_right_anti_deadzone.attr, ++ &dev_attr_axis_xy_right_deadzone.attr, ++ &dev_attr_axis_xyz_deadzone_index.attr, ++ &dev_attr_axis_xy_right_move_1.attr, ++ &dev_attr_axis_xy_right_move_2.attr, ++ &dev_attr_axis_xy_right_move_3.attr, ++ &dev_attr_axis_xy_right_move_4.attr, ++ &dev_attr_axis_xy_right_response_1.attr, ++ &dev_attr_axis_xy_right_response_2.attr, ++ &dev_attr_axis_xy_right_response_3.attr, ++ 
&dev_attr_axis_xy_right_response_4.attr, ++ NULL ++}; ++static const struct attribute_group axis_xy_right_attr_group = { ++ .name = "axis_xy_right", ++ .attrs = axis_xy_right_attrs, ++}; ++ ++static struct attribute *axis_z_left_attrs[] = { ++ &dev_attr_axis_z_left_deadzone.attr, ++ &dev_attr_axis_xyz_deadzone_index.attr, ++ NULL, ++}; ++static const struct attribute_group axis_z_left_attr_group = { ++ .name = "axis_z_left", ++ .attrs = axis_z_left_attrs, ++}; ++ ++static struct attribute *axis_z_right_attrs[] = { ++ &dev_attr_axis_z_right_deadzone.attr, ++ &dev_attr_axis_xyz_deadzone_index.attr, ++ NULL, ++}; ++static const struct attribute_group axis_z_right_attr_group = { ++ .name = "axis_z_right", ++ .attrs = axis_z_right_attrs, ++}; ++ ++/* A HID packet conatins mappings for two buttons: btn1, btn1_macro, btn2, btn2_macro */ ++static void _btn_pair_to_hid_pkt(struct ally_gamepad_cfg *ally_cfg, ++ enum btn_pair_index pair, ++ struct btn_data *btn1, struct btn_data *btn2, ++ u8 *out, int out_len) ++{ ++ int start = 5; ++ ++ out[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ out[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ out[2] = xpad_cmd_set_mapping; ++ out[3] = pair; ++ out[4] = xpad_cmd_len_mapping; ++ ++ btn_code_to_byte_array(btn1->button, &out[start]); ++ start += BTN_DATA_LEN; ++ btn_code_to_byte_array(btn1->macro, &out[start]); ++ start += BTN_DATA_LEN; ++ btn_code_to_byte_array(btn2->button, &out[start]); ++ start += BTN_DATA_LEN; ++ btn_code_to_byte_array(btn2->macro, &out[start]); ++ //print_hex_dump(KERN_DEBUG, "byte_array: ", DUMP_PREFIX_OFFSET, 64, 1, out, 64, false); ++} ++ ++/* Apply the mapping pair to the device */ ++static int _gamepad_apply_btn_pair(struct hid_device *hdev, struct ally_gamepad_cfg *ally_cfg, ++ enum btn_pair_index btn_pair) ++{ ++ u8 mode = ally_cfg->mode - 1; ++ struct btn_data *btn1, *btn2; ++ u8 *hidbuf; ++ int ret; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ return ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ switch (btn_pair) { ++ case btn_pair_dpad_u_d: ++ btn1 = &ally_cfg->key_mapping[mode].dpad_up; ++ btn2 = &ally_cfg->key_mapping[mode].dpad_down; ++ break; ++ case btn_pair_dpad_l_r: ++ btn1 = &ally_cfg->key_mapping[mode].dpad_left; ++ btn2 = &ally_cfg->key_mapping[mode].dpad_right; ++ break; ++ case btn_pair_ls_rs: ++ btn1 = &ally_cfg->key_mapping[mode].btn_ls; ++ btn2 = &ally_cfg->key_mapping[mode].btn_rs; ++ break; ++ case btn_pair_lb_rb: ++ btn1 = &ally_cfg->key_mapping[mode].btn_lb; ++ btn2 = &ally_cfg->key_mapping[mode].btn_rb; ++ break; ++ case btn_pair_lt_rt: ++ btn1 = &ally_cfg->key_mapping[mode].btn_lt; ++ btn2 = &ally_cfg->key_mapping[mode].btn_rt; ++ break; ++ case btn_pair_a_b: ++ btn1 = &ally_cfg->key_mapping[mode].btn_a; ++ btn2 = &ally_cfg->key_mapping[mode].btn_b; ++ break; ++ case btn_pair_x_y: ++ btn1 = &ally_cfg->key_mapping[mode].btn_x; ++ btn2 = &ally_cfg->key_mapping[mode].btn_y; ++ break; ++ case btn_pair_view_menu: ++ btn1 = &ally_cfg->key_mapping[mode].btn_view; ++ btn2 = &ally_cfg->key_mapping[mode].btn_menu; ++ break; ++ case btn_pair_m1_m2: ++ btn1 = &ally_cfg->key_mapping[mode].btn_m1; ++ btn2 = &ally_cfg->key_mapping[mode].btn_m2; ++ break; ++ default: ++ break; ++ } ++ ++ _btn_pair_to_hid_pkt(ally_cfg, btn_pair, btn1, btn2, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ ++ kfree(hidbuf); ++ ++ return ret; ++} ++ ++static int _gamepad_apply_turbo(struct hid_device *hdev, struct 
ally_gamepad_cfg *ally_cfg) ++{ ++ struct btn_mapping *map = &ally_cfg->key_mapping[ally_cfg->mode - 1]; ++ u8 *hidbuf; ++ int ret; ++ ++ /* set turbo */ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_turbo; ++ hidbuf[3] = xpad_cmd_len_turbo; ++ ++ hidbuf[4] = map->dpad_up.turbo; ++ hidbuf[6] = map->dpad_down.turbo; ++ hidbuf[8] = map->dpad_left.turbo; ++ hidbuf[10] = map->dpad_right.turbo; ++ ++ hidbuf[12] = map->btn_ls.turbo; ++ hidbuf[14] = map->btn_rs.turbo; ++ hidbuf[16] = map->btn_lb.turbo; ++ hidbuf[18] = map->btn_rb.turbo; ++ ++ hidbuf[20] = map->btn_a.turbo; ++ hidbuf[22] = map->btn_b.turbo; ++ hidbuf[24] = map->btn_x.turbo; ++ hidbuf[26] = map->btn_y.turbo; ++ ++ hidbuf[28] = map->btn_lt.turbo; ++ hidbuf[30] = map->btn_rt.turbo; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ ++ kfree(hidbuf); ++ ++ return ret; ++} ++ ++static ssize_t _gamepad_apply_all(struct hid_device *hdev, struct ally_gamepad_cfg *ally_cfg) ++{ ++ int ret; ++ ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_dpad_u_d); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_dpad_l_r); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_ls_rs); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_lb_rb); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_a_b); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_x_y); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_view_menu); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_m1_m2); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_lt_rt); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_turbo(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_deadzones(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_js_ADZ(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ret =_gamepad_apply_response_curves(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t gamepad_apply_all_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ struct hid_device *hdev = to_hid_device(dev); ++ int ret; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ ret = _gamepad_apply_all(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_WO(gamepad_apply_all, apply_all); ++ ++/* button map attributes, regular and macro*/ ++ALLY_BTN_MAPPING(m1, btn_m1); ++ALLY_BTN_MAPPING(m2, btn_m2); ++ALLY_BTN_MAPPING(view, btn_view); ++ALLY_BTN_MAPPING(menu, btn_menu); ++ALLY_TURBO_BTN_MAPPING(a, btn_a); ++ALLY_TURBO_BTN_MAPPING(b, btn_b); ++ALLY_TURBO_BTN_MAPPING(x, btn_x); ++ALLY_TURBO_BTN_MAPPING(y, btn_y); ++ALLY_TURBO_BTN_MAPPING(lb, btn_lb); ++ALLY_TURBO_BTN_MAPPING(rb, btn_rb); ++ALLY_TURBO_BTN_MAPPING(ls, btn_ls); ++ALLY_TURBO_BTN_MAPPING(rs, btn_rs); ++ALLY_TURBO_BTN_MAPPING(lt, btn_lt); ++ALLY_TURBO_BTN_MAPPING(rt, btn_rt); ++ALLY_TURBO_BTN_MAPPING(dpad_u, dpad_up); ++ALLY_TURBO_BTN_MAPPING(dpad_d, dpad_down); ++ALLY_TURBO_BTN_MAPPING(dpad_l, dpad_left); 
++ALLY_TURBO_BTN_MAPPING(dpad_r, dpad_right); ++ ++static void _gamepad_set_xpad_default(struct ally_gamepad_cfg *ally_cfg) ++{ ++ struct btn_mapping *map = &ally_cfg->key_mapping[ally_cfg->mode - 1]; ++ map->btn_m1.button = BTN_KB_M1; ++ map->btn_m2.button = BTN_KB_M2; ++ map->btn_a.button = BTN_PAD_A; ++ map->btn_b.button = BTN_PAD_B; ++ map->btn_x.button = BTN_PAD_X; ++ map->btn_y.button = BTN_PAD_Y; ++ map->btn_lb.button = BTN_PAD_LB; ++ map->btn_rb.button = BTN_PAD_RB; ++ map->btn_lt.button = BTN_PAD_LT; ++ map->btn_rt.button = BTN_PAD_RT; ++ map->btn_ls.button = BTN_PAD_LS; ++ map->btn_rs.button = BTN_PAD_RS; ++ map->dpad_up.button = BTN_PAD_DPAD_UP; ++ map->dpad_down.button = BTN_PAD_DPAD_DOWN; ++ map->dpad_left.button = BTN_PAD_DPAD_LEFT; ++ map->dpad_right.button = BTN_PAD_DPAD_RIGHT; ++ map->btn_view.button = BTN_PAD_VIEW; ++ map->btn_menu.button = BTN_PAD_MENU; ++} ++ ++static ssize_t btn_mapping_reset_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ switch (ally_cfg->mode) { ++ case xpad_mode_game: ++ _gamepad_set_xpad_default(ally_cfg); ++ break; ++ default: ++ _gamepad_set_xpad_default(ally_cfg); ++ break; ++ } ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_WO(btn_mapping_reset, reset_btn_mapping); ++ ++/* GAMEPAD MODE */ ++static ssize_t _gamepad_set_mode(struct hid_device *hdev, struct ally_gamepad_cfg *ally_cfg, ++ int val) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_mode; ++ hidbuf[3] = xpad_cmd_len_mode; ++ hidbuf[4] = val; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = _gamepad_apply_all(hdev, ally_cfg); ++ if (ret < 0) ++ goto report_fail; ++ ++report_fail: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static ssize_t gamepad_mode_show(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ return sysfs_emit(buf, "%d\n", ally_cfg->mode); ++} ++ ++static ssize_t gamepad_mode_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct hid_device *hdev = to_hid_device(dev); ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ int ret, val; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ ret = kstrtoint(buf, 0, &val); ++ if (ret) ++ return ret; ++ ++ if (val < xpad_mode_game || val > xpad_mode_mouse) ++ return -EINVAL; ++ ++ ally_cfg->mode = val; ++ ++ ret = _gamepad_set_mode(hdev, ally_cfg, val); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ ++DEVICE_ATTR_RW(gamepad_mode); ++ ++static ssize_t mcu_version_show(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%d\n", drvdata.mcu_version); ++} ++ ++DEVICE_ATTR_RO(mcu_version); ++ ++/* ROOT LEVEL ATTRS *******************************************************************************/ ++static struct attribute *gamepad_device_attrs[] = { ++ &dev_attr_btn_mapping_reset.attr, ++ &dev_attr_gamepad_mode.attr, ++ &dev_attr_gamepad_apply_all.attr, ++ 
&dev_attr_gamepad_vibration_intensity.attr, ++ &dev_attr_gamepad_vibration_intensity_index.attr, ++ &dev_attr_mcu_version.attr, ++ NULL ++}; ++ ++static const struct attribute_group ally_controller_attr_group = { ++ .attrs = gamepad_device_attrs, ++}; ++ ++static const struct attribute_group *gamepad_device_attr_groups[] = { ++ &ally_controller_attr_group, ++ &axis_xy_left_attr_group, ++ &axis_xy_right_attr_group, ++ &axis_z_left_attr_group, ++ &axis_z_right_attr_group, ++ &btn_mapping_m1_attr_group, ++ &btn_mapping_m2_attr_group, ++ &btn_mapping_a_attr_group, ++ &btn_mapping_b_attr_group, ++ &btn_mapping_x_attr_group, ++ &btn_mapping_y_attr_group, ++ &btn_mapping_lb_attr_group, ++ &btn_mapping_rb_attr_group, ++ &btn_mapping_ls_attr_group, ++ &btn_mapping_rs_attr_group, ++ &btn_mapping_lt_attr_group, ++ &btn_mapping_rt_attr_group, ++ &btn_mapping_dpad_u_attr_group, ++ &btn_mapping_dpad_d_attr_group, ++ &btn_mapping_dpad_l_attr_group, ++ &btn_mapping_dpad_r_attr_group, ++ &btn_mapping_view_attr_group, ++ &btn_mapping_menu_attr_group, ++ NULL, ++}; ++ ++static struct ally_gamepad_cfg *ally_gamepad_cfg_create(struct hid_device *hdev) ++{ ++ struct ally_gamepad_cfg *ally_cfg; ++ struct input_dev *input_dev; ++ int err; ++ ++ ally_cfg = devm_kzalloc(&hdev->dev, sizeof(*ally_cfg), GFP_KERNEL); ++ if (!ally_cfg) ++ return ERR_PTR(-ENOMEM); ++ ally_cfg->hdev = hdev; ++ // Allocate memory for each mode's `btn_mapping` ++ ally_cfg->mode = xpad_mode_game; ++ ++ input_dev = devm_input_allocate_device(&hdev->dev); ++ if (!input_dev) { ++ err = -ENOMEM; ++ goto free_ally_cfg; ++ } ++ ++ input_dev->id.bustype = hdev->bus; ++ input_dev->id.vendor = hdev->vendor; ++ input_dev->id.product = hdev->product; ++ input_dev->id.version = hdev->version; ++ input_dev->uniq = hdev->uniq; ++ input_dev->name = "ASUS ROG Ally Config"; ++ input_set_capability(input_dev, EV_KEY, KEY_PROG1); ++ input_set_capability(input_dev, EV_KEY, KEY_F16); ++ input_set_capability(input_dev, EV_KEY, KEY_F17); ++ input_set_capability(input_dev, EV_KEY, KEY_F18); ++ input_set_drvdata(input_dev, hdev); ++ ++ err = input_register_device(input_dev); ++ if (err) ++ goto free_input_dev; ++ ally_cfg->input = input_dev; ++ ++ /* ignore all errors for this as they are related to USB HID I/O */ ++ _gamepad_set_xpad_default(ally_cfg); ++ ally_cfg->key_mapping[ally_cfg->mode - 1].btn_m1.button = BTN_KB_M1; ++ ally_cfg->key_mapping[ally_cfg->mode - 1].btn_m2.button = BTN_KB_M2; ++ _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_m1_m2); ++ gamepad_get_calibration(hdev); ++ ++ ally_cfg->vibration_intensity[0] = 0x64; ++ ally_cfg->vibration_intensity[1] = 0x64; ++ _gamepad_set_deadzones_default(ally_cfg); ++ _gamepad_set_anti_deadzones_default(ally_cfg); ++ _gamepad_set_js_response_curves_default(ally_cfg); ++ ++ drvdata.gamepad_cfg = ally_cfg; // Must asign before attr group setup ++ if (sysfs_create_groups(&hdev->dev.kobj, gamepad_device_attr_groups)) { ++ err = -ENODEV; ++ goto unregister_input_dev; ++ } ++ ++ return ally_cfg; ++ ++unregister_input_dev: ++ input_unregister_device(input_dev); ++ ally_cfg->input = NULL; // Prevent double free when kfree(ally_cfg) happens ++ ++free_input_dev: ++ devm_kfree(&hdev->dev, input_dev); ++ ++free_ally_cfg: ++ devm_kfree(&hdev->dev, ally_cfg); ++ return ERR_PTR(err); ++} ++ ++static void ally_cfg_remove(struct hid_device *hdev) ++{ ++ // __gamepad_set_mode(hdev, drvdata.gamepad_cfg, xpad_mode_mouse); ++ sysfs_remove_groups(&hdev->dev.kobj, gamepad_device_attr_groups); ++} ++ 
++/**************************************************************************************************/ ++/* ROG Ally gamepad i/o and force-feedback */ ++/**************************************************************************************************/ ++static int ally_x_raw_event(struct ally_x_device *ally_x, struct hid_report *report, u8 *data, ++ int size) ++{ ++ struct ally_x_input_report *in_report; ++ unsigned long flags; ++ u8 byte; ++ ++ if (data[0] == 0x0B) { ++ in_report = (struct ally_x_input_report *)&data[1]; ++ ++ input_report_abs(ally_x->input, ABS_X, in_report->x); ++ input_report_abs(ally_x->input, ABS_Y, in_report->y); ++ input_report_abs(ally_x->input, ABS_RX, in_report->rx); ++ input_report_abs(ally_x->input, ABS_RY, in_report->ry); ++ input_report_abs(ally_x->input, ABS_Z, in_report->z); ++ input_report_abs(ally_x->input, ABS_RZ, in_report->rz); ++ ++ byte = in_report->buttons[0]; ++ input_report_key(ally_x->input, BTN_A, byte & BIT(0)); ++ input_report_key(ally_x->input, BTN_B, byte & BIT(1)); ++ input_report_key(ally_x->input, BTN_X, byte & BIT(2)); ++ input_report_key(ally_x->input, BTN_Y, byte & BIT(3)); ++ input_report_key(ally_x->input, BTN_TL, byte & BIT(4)); ++ input_report_key(ally_x->input, BTN_TR, byte & BIT(5)); ++ input_report_key(ally_x->input, BTN_SELECT, byte & BIT(6)); ++ input_report_key(ally_x->input, BTN_START, byte & BIT(7)); ++ ++ byte = in_report->buttons[1]; ++ input_report_key(ally_x->input, BTN_THUMBL, byte & BIT(0)); ++ input_report_key(ally_x->input, BTN_THUMBR, byte & BIT(1)); ++ input_report_key(ally_x->input, BTN_MODE, byte & BIT(2)); ++ ++ byte = in_report->buttons[2]; ++ input_report_abs(ally_x->input, ABS_HAT0X, hat_values[byte][0]); ++ input_report_abs(ally_x->input, ABS_HAT0Y, hat_values[byte][1]); ++ } ++ /* ++ * The MCU used on Ally provides many devices: gamepad, keyboord, mouse, other. ++ * The AC and QAM buttons route through another interface making it difficult to ++ * use the events unless we grab those and use them here. Only works for Ally X. ++ */ ++ else if (data[0] == 0x5A) { ++ if (ally_x->qam_btns_steam_mode) { ++ spin_lock_irqsave(&ally_x->lock, flags); ++ if (data[1] == 0x38 && !ally_x->update_qam_btn) { ++ ally_x->update_qam_btn = true; ++ if (ally_x->output_worker_initialized) ++ schedule_work(&ally_x->output_worker); ++ } ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ /* Left/XBox button. 
Long press does ctrl+alt+del which we can't catch */ ++ input_report_key(ally_x->input, BTN_MODE, data[1] == 0xA6); ++ } else { ++ input_report_key(ally_x->input, KEY_F16, data[1] == 0xA6); ++ input_report_key(ally_x->input, KEY_PROG1, data[1] == 0x38); ++ } ++ /* QAM long press */ ++ input_report_key(ally_x->input, KEY_F17, data[1] == 0xA7); ++ /* QAM long press released */ ++ input_report_key(ally_x->input, KEY_F18, data[1] == 0xA8); ++ } ++ ++ input_sync(ally_x->input); ++ ++ return 0; ++} ++ ++static struct input_dev *ally_x_alloc_input_dev(struct hid_device *hdev, ++ const char *name_suffix) ++{ ++ struct input_dev *input_dev; ++ ++ input_dev = devm_input_allocate_device(&hdev->dev); ++ if (!input_dev) ++ return ERR_PTR(-ENOMEM); ++ ++ input_dev->id.bustype = hdev->bus; ++ input_dev->id.vendor = hdev->vendor; ++ input_dev->id.product = hdev->product; ++ input_dev->id.version = hdev->version; ++ input_dev->uniq = hdev->uniq; ++ input_dev->name = "ASUS ROG Ally X Gamepad"; ++ ++ input_set_drvdata(input_dev, hdev); ++ ++ return input_dev; ++} ++ ++static int ally_x_play_effect(struct input_dev *idev, void *data, struct ff_effect *effect) ++{ ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ unsigned long flags; ++ ++ if (effect->type != FF_RUMBLE) ++ return 0; ++ ++ spin_lock_irqsave(&ally_x->lock, flags); ++ ally_x->ff_packet->ff.magnitude_strong = effect->u.rumble.strong_magnitude / 512; ++ ally_x->ff_packet->ff.magnitude_weak = effect->u.rumble.weak_magnitude / 512; ++ ally_x->update_ff = true; ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ ++ if (ally_x->output_worker_initialized) ++ schedule_work(&ally_x->output_worker); ++ ++ return 0; ++} ++ ++static void ally_x_work(struct work_struct *work) ++{ ++ struct ally_x_device *ally_x = container_of(work, struct ally_x_device, output_worker); ++ struct ff_report *ff_report = NULL; ++ bool update_qam = false; ++ bool update_ff = false; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ally_x->lock, flags); ++ update_ff = ally_x->update_ff; ++ if (ally_x->update_ff) { ++ ff_report = kmemdup(ally_x->ff_packet, sizeof(*ally_x->ff_packet), GFP_KERNEL); ++ ally_x->update_ff = false; ++ } ++ update_qam = ally_x->update_qam_btn; ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ ++ if (update_ff && ff_report) { ++ ff_report->ff.magnitude_left = ff_report->ff.magnitude_strong; ++ ff_report->ff.magnitude_right = ff_report->ff.magnitude_weak; ++ asus_dev_set_report(ally_x->hdev, (u8 *)ff_report, sizeof(*ff_report)); ++ } ++ kfree(ff_report); ++ ++ if (update_qam) { ++ /* ++ * The sleeps here are required to allow steam to register the button combo. 
++ */ ++ usleep_range(1000, 2000); ++ input_report_key(ally_x->input, BTN_MODE, 1); ++ input_sync(ally_x->input); ++ ++ msleep(80); ++ input_report_key(ally_x->input, BTN_A, 1); ++ input_sync(ally_x->input); ++ ++ msleep(80); ++ input_report_key(ally_x->input, BTN_A, 0); ++ input_sync(ally_x->input); ++ ++ msleep(80); ++ input_report_key(ally_x->input, BTN_MODE, 0); ++ input_sync(ally_x->input); ++ ++ spin_lock_irqsave(&ally_x->lock, flags); ++ ally_x->update_qam_btn = false; ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ } ++} ++ ++static struct input_dev *ally_x_setup_input(struct hid_device *hdev) ++{ ++ int ret, abs_min = 0, js_abs_max = 65535, tr_abs_max = 1023; ++ struct input_dev *input; ++ ++ input = ally_x_alloc_input_dev(hdev, NULL); ++ if (IS_ERR(input)) ++ return ERR_CAST(input); ++ ++ input_set_abs_params(input, ABS_X, abs_min, js_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_Y, abs_min, js_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_RX, abs_min, js_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_RY, abs_min, js_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_Z, abs_min, tr_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_RZ, abs_min, tr_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_HAT0X, -1, 1, 0, 0); ++ input_set_abs_params(input, ABS_HAT0Y, -1, 1, 0, 0); ++ input_set_capability(input, EV_KEY, BTN_A); ++ input_set_capability(input, EV_KEY, BTN_B); ++ input_set_capability(input, EV_KEY, BTN_X); ++ input_set_capability(input, EV_KEY, BTN_Y); ++ input_set_capability(input, EV_KEY, BTN_TL); ++ input_set_capability(input, EV_KEY, BTN_TR); ++ input_set_capability(input, EV_KEY, BTN_SELECT); ++ input_set_capability(input, EV_KEY, BTN_START); ++ input_set_capability(input, EV_KEY, BTN_MODE); ++ input_set_capability(input, EV_KEY, BTN_THUMBL); ++ input_set_capability(input, EV_KEY, BTN_THUMBR); ++ ++ input_set_capability(input, EV_KEY, KEY_PROG1); ++ input_set_capability(input, EV_KEY, KEY_F16); ++ input_set_capability(input, EV_KEY, KEY_F17); ++ input_set_capability(input, EV_KEY, KEY_F18); ++ ++ input_set_capability(input, EV_FF, FF_RUMBLE); ++ input_ff_create_memless(input, NULL, ally_x_play_effect); ++ ++ ret = input_register_device(input); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ return input; ++} ++ ++static ssize_t ally_x_qam_mode_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ ++ return sysfs_emit(buf, "%d\n", ally_x->qam_btns_steam_mode); ++} ++ ++static ssize_t ally_x_qam_mode_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ bool val; ++ int ret; ++ ++ ret = kstrtobool(buf, &val); ++ if (ret < 0) ++ return ret; ++ ++ ally_x->qam_btns_steam_mode = val; ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_RW(ally_x_qam_mode, qam_mode); ++ ++static struct ally_x_device *ally_x_create(struct hid_device *hdev) ++{ ++ uint8_t max_output_report_size; ++ struct ally_x_device *ally_x; ++ struct ff_report *report; ++ int ret; ++ ++ ally_x = devm_kzalloc(&hdev->dev, sizeof(*ally_x), GFP_KERNEL); ++ if (!ally_x) ++ return ERR_PTR(-ENOMEM); ++ ++ ally_x->hdev = hdev; ++ INIT_WORK(&ally_x->output_worker, ally_x_work); ++ spin_lock_init(&ally_x->lock); ++ ally_x->output_worker_initialized = true; ++ ally_x->qam_btns_steam_mode = ++ true; /* Always default to steam mode, it can be changed by userspace attr */ ++ ++ max_output_report_size = sizeof(struct ally_x_input_report); ++ report = 
devm_kzalloc(&hdev->dev, sizeof(*report), GFP_KERNEL); ++ if (!report) { ++ ret = -ENOMEM; ++ goto free_ally_x; ++ } ++ ++ /* None of these bytes will change for the FF command for now */ ++ report->report_id = 0x0D; ++ report->ff.enable = 0x0F; /* Enable all by default */ ++ report->ff.pulse_sustain_10ms = 0xFF; /* Duration */ ++ report->ff.pulse_release_10ms = 0x00; /* Start Delay */ ++ report->ff.loop_count = 0xEB; /* Loop Count */ ++ ally_x->ff_packet = report; ++ ++ ally_x->input = ally_x_setup_input(hdev); ++ if (IS_ERR(ally_x->input)) { ++ ret = PTR_ERR(ally_x->input); ++ goto free_ff_packet; ++ } ++ ++ if (sysfs_create_file(&hdev->dev.kobj, &dev_attr_ally_x_qam_mode.attr)) { ++ ret = -ENODEV; ++ goto unregister_input; ++ } ++ ++ ally_x->update_ff = true; ++ if (ally_x->output_worker_initialized) ++ schedule_work(&ally_x->output_worker); ++ ++ hid_info(hdev, "Registered Ally X controller using %s\n", ++ dev_name(&ally_x->input->dev)); ++ return ally_x; ++ ++unregister_input: ++ input_unregister_device(ally_x->input); ++free_ff_packet: ++ kfree(ally_x->ff_packet); ++free_ally_x: ++ kfree(ally_x); ++ return ERR_PTR(ret); ++} ++ ++static void ally_x_remove(struct hid_device *hdev) ++{ ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ally_x->lock, flags); ++ ally_x->output_worker_initialized = false; ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ cancel_work_sync(&ally_x->output_worker); ++ sysfs_remove_file(&hdev->dev.kobj, &dev_attr_ally_x_qam_mode.attr); ++} ++ ++/**************************************************************************************************/ ++/* ROG Ally LED control */ ++/**************************************************************************************************/ ++static void ally_rgb_schedule_work(struct ally_rgb_dev *led) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&led->lock, flags); ++ if (!led->removed) ++ schedule_work(&led->work); ++ spin_unlock_irqrestore(&led->lock, flags); ++} ++ ++/* ++ * The RGB still has the basic 0-3 level brightness. 
Since the multicolour ++ * brightness is being used in place, set this to max ++ */ ++static int ally_rgb_set_bright_base_max(struct hid_device *hdev) ++{ ++ u8 buf[] = { FEATURE_KBD_LED_REPORT_ID1, 0xba, 0xc5, 0xc4, 0x02 }; ++ ++ return asus_dev_set_report(hdev, buf, sizeof(buf)); ++} ++ ++static void ally_rgb_do_work(struct work_struct *work) ++{ ++ struct ally_rgb_dev *led = container_of(work, struct ally_rgb_dev, work); ++ int ret; ++ unsigned long flags; ++ ++ u8 buf[16] = { [0] = FEATURE_ROG_ALLY_REPORT_ID, ++ [1] = FEATURE_ROG_ALLY_CODE_PAGE, ++ [2] = xpad_cmd_set_leds, ++ [3] = xpad_cmd_len_leds }; ++ ++ spin_lock_irqsave(&led->lock, flags); ++ if (!led->update_rgb) { ++ spin_unlock_irqrestore(&led->lock, flags); ++ return; ++ } ++ ++ for (int i = 0; i < 4; i++) { ++ buf[5 + i * 3] = drvdata.led_rgb_dev->green[i]; ++ buf[6 + i * 3] = drvdata.led_rgb_dev->blue[i]; ++ buf[4 + i * 3] = drvdata.led_rgb_dev->red[i]; ++ } ++ led->update_rgb = false; ++ ++ spin_unlock_irqrestore(&led->lock, flags); ++ ++ ret = asus_dev_set_report(led->hdev, buf, sizeof(buf)); ++ if (ret < 0) ++ hid_err(led->hdev, "Ally failed to set gamepad backlight: %d\n", ret); ++} ++ ++static void ally_rgb_set(struct led_classdev *cdev, enum led_brightness brightness) ++{ ++ struct led_classdev_mc *mc_cdev = lcdev_to_mccdev(cdev); ++ struct ally_rgb_dev *led = container_of(mc_cdev, struct ally_rgb_dev, led_rgb_dev); ++ int intensity, bright; ++ unsigned long flags; ++ ++ led_mc_calc_color_components(mc_cdev, brightness); ++ spin_lock_irqsave(&led->lock, flags); ++ led->update_rgb = true; ++ bright = mc_cdev->led_cdev.brightness; ++ for (int i = 0; i < 4; i++) { ++ intensity = mc_cdev->subled_info[i].intensity; ++ drvdata.led_rgb_dev->red[i] = (((intensity >> 16) & 0xFF) * bright) / 255; ++ drvdata.led_rgb_dev->green[i] = (((intensity >> 8) & 0xFF) * bright) / 255; ++ drvdata.led_rgb_dev->blue[i] = ((intensity & 0xFF) * bright) / 255; ++ } ++ spin_unlock_irqrestore(&led->lock, flags); ++ drvdata.led_rgb_data.initialized = true; ++ ++ ally_rgb_schedule_work(led); ++} ++ ++static int ally_rgb_set_static_from_multi(struct hid_device *hdev) ++{ ++ u8 buf[17] = {FEATURE_KBD_LED_REPORT_ID1, 0xb3}; ++ int ret; ++ ++ /* ++ * Set single zone single colour based on the first LED of EC software mode. 
++ * buf[2] = zone, buf[3] = mode ++ */ ++ buf[4] = drvdata.led_rgb_data.red[0]; ++ buf[5] = drvdata.led_rgb_data.green[0]; ++ buf[6] = drvdata.led_rgb_data.blue[0]; ++ ++ ret = asus_dev_set_report(hdev, buf, sizeof(buf)); ++ if (ret < 0) ++ return ret; ++ ++ ret = asus_dev_set_report(hdev, EC_MODE_LED_APPLY, sizeof(EC_MODE_LED_APPLY)); ++ if (ret < 0) ++ return ret; ++ ++ return asus_dev_set_report(hdev, EC_MODE_LED_SET, sizeof(EC_MODE_LED_SET)); ++} ++ ++/* ++ * Store the RGB values for restoring on resume, and set the static mode to the first LED colour ++*/ ++static void ally_rgb_store_settings(void) ++{ ++ int arr_size = sizeof(drvdata.led_rgb_data.red); ++ ++ struct ally_rgb_dev *led_rgb = drvdata.led_rgb_dev; ++ ++ drvdata.led_rgb_data.brightness = led_rgb->led_rgb_dev.led_cdev.brightness; ++ ++ memcpy(drvdata.led_rgb_data.red, led_rgb->red, arr_size); ++ memcpy(drvdata.led_rgb_data.green, led_rgb->green, arr_size); ++ memcpy(drvdata.led_rgb_data.blue, led_rgb->blue, arr_size); ++ ++ ally_rgb_set_static_from_multi(led_rgb->hdev); ++} ++ ++static void ally_rgb_restore_settings(struct ally_rgb_dev *led_rgb, struct led_classdev *led_cdev, ++ struct mc_subled *mc_led_info) ++{ ++ int arr_size = sizeof(drvdata.led_rgb_data.red); ++ ++ memcpy(led_rgb->red, drvdata.led_rgb_data.red, arr_size); ++ memcpy(led_rgb->green, drvdata.led_rgb_data.green, arr_size); ++ memcpy(led_rgb->blue, drvdata.led_rgb_data.blue, arr_size); ++ for (int i = 0; i < 4; i++) { ++ mc_led_info[i].intensity = (drvdata.led_rgb_data.red[i] << 16) | ++ (drvdata.led_rgb_data.green[i] << 8) | ++ drvdata.led_rgb_data.blue[i]; ++ } ++ led_cdev->brightness = drvdata.led_rgb_data.brightness; ++} ++ ++/* Set LEDs. Call after any setup. */ ++static void ally_rgb_resume(void) ++{ ++ struct ally_rgb_dev *led_rgb = drvdata.led_rgb_dev; ++ struct led_classdev *led_cdev; ++ struct mc_subled *mc_led_info; ++ ++ if (!led_rgb) ++ return; ++ ++ led_cdev = &led_rgb->led_rgb_dev.led_cdev; ++ mc_led_info = led_rgb->led_rgb_dev.subled_info; ++ ++ if (drvdata.led_rgb_data.initialized) { ++ ally_rgb_restore_settings(led_rgb, led_cdev, mc_led_info); ++ led_rgb->update_rgb = true; ++ ally_rgb_schedule_work(led_rgb); ++ ally_rgb_set_bright_base_max(led_rgb->hdev); ++ } ++} ++ ++static int ally_rgb_register(struct hid_device *hdev, struct ally_rgb_dev *led_rgb) ++{ ++ struct mc_subled *mc_led_info; ++ struct led_classdev *led_cdev; ++ ++ mc_led_info = ++ devm_kmalloc_array(&hdev->dev, 12, sizeof(*mc_led_info), GFP_KERNEL | __GFP_ZERO); ++ if (!mc_led_info) ++ return -ENOMEM; ++ ++ mc_led_info[0].color_index = LED_COLOR_ID_RGB; ++ mc_led_info[1].color_index = LED_COLOR_ID_RGB; ++ mc_led_info[2].color_index = LED_COLOR_ID_RGB; ++ mc_led_info[3].color_index = LED_COLOR_ID_RGB; ++ ++ led_rgb->led_rgb_dev.subled_info = mc_led_info; ++ led_rgb->led_rgb_dev.num_colors = 4; ++ ++ led_cdev = &led_rgb->led_rgb_dev.led_cdev; ++ led_cdev->brightness = 128; ++ led_cdev->name = "ally:rgb:joystick_rings"; ++ led_cdev->max_brightness = 255; ++ led_cdev->brightness_set = ally_rgb_set; ++ ++ if (drvdata.led_rgb_data.initialized) { ++ ally_rgb_restore_settings(led_rgb, led_cdev, mc_led_info); ++ } ++ ++ return devm_led_classdev_multicolor_register(&hdev->dev, &led_rgb->led_rgb_dev); ++} ++ ++static struct ally_rgb_dev *ally_rgb_create(struct hid_device *hdev) ++{ ++ struct ally_rgb_dev *led_rgb; ++ int ret; ++ ++ led_rgb = devm_kzalloc(&hdev->dev, sizeof(struct ally_rgb_dev), GFP_KERNEL); ++ if (!led_rgb) ++ return ERR_PTR(-ENOMEM); ++ ++ ret = 
ally_rgb_register(hdev, led_rgb); ++ if (ret < 0) { ++ cancel_work_sync(&led_rgb->work); ++ devm_kfree(&hdev->dev, led_rgb); ++ return ERR_PTR(ret); ++ } ++ ++ led_rgb->hdev = hdev; ++ led_rgb->removed = false; ++ ++ INIT_WORK(&led_rgb->work, ally_rgb_do_work); ++ led_rgb->output_worker_initialized = true; ++ spin_lock_init(&led_rgb->lock); ++ ++ ally_rgb_set_bright_base_max(hdev); ++ ++ /* Not marked as initialized unless ally_rgb_set() is called */ ++ if (drvdata.led_rgb_data.initialized) { ++ msleep(1500); ++ led_rgb->update_rgb = true; ++ ally_rgb_schedule_work(led_rgb); ++ } ++ ++ return led_rgb; ++} ++ ++static void ally_rgb_remove(struct hid_device *hdev) ++{ ++ struct ally_rgb_dev *led_rgb = drvdata.led_rgb_dev; ++ unsigned long flags; ++ int ep; ++ ++ ep = get_endpoint_address(hdev); ++ if (ep != ROG_ALLY_CFG_INTF_IN) ++ return; ++ ++ if (!drvdata.led_rgb_dev || led_rgb->removed) ++ return; ++ ++ spin_lock_irqsave(&led_rgb->lock, flags); ++ led_rgb->removed = true; ++ led_rgb->output_worker_initialized = false; ++ spin_unlock_irqrestore(&led_rgb->lock, flags); ++ cancel_work_sync(&led_rgb->work); ++ devm_led_classdev_multicolor_unregister(&hdev->dev, &led_rgb->led_rgb_dev); ++ ++ hid_info(hdev, "Removed Ally RGB interface"); ++} ++ ++/**************************************************************************************************/ ++/* ROG Ally driver init */ ++/**************************************************************************************************/ ++ ++static int ally_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, ++ int size) ++{ ++ struct ally_gamepad_cfg *cfg = drvdata.gamepad_cfg; ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ ++ if (ally_x) { ++ if ((hdev->bus == BUS_USB && report->id == ALLY_X_INPUT_REPORT_USB && ++ size == ALLY_X_INPUT_REPORT_USB_SIZE) || ++ (data[0] == 0x5A)) { ++ ally_x_raw_event(ally_x, report, data, size); ++ } else { ++ return -1; ++ } ++ } ++ ++ if (cfg && !ally_x) { ++ input_report_key(cfg->input, KEY_PROG1, data[1] == 0x38); ++ input_report_key(cfg->input, KEY_F16, data[1] == 0xA6); ++ input_report_key(cfg->input, KEY_F17, data[1] == 0xA7); ++ input_report_key(cfg->input, KEY_F18, data[1] == 0xA8); ++ input_sync(cfg->input); ++ } ++ ++ return 0; ++} ++ ++static int ally_hid_init(struct hid_device *hdev) ++{ ++ int ret; ++ ++ ret = asus_dev_set_report(hdev, EC_INIT_STRING, sizeof(EC_INIT_STRING)); ++ if (ret < 0) { ++ hid_err(hdev, "Ally failed to send init command: %d\n", ret); ++ return ret; ++ } ++ ++ ret = asus_dev_set_report(hdev, FORCE_FEEDBACK_OFF, sizeof(FORCE_FEEDBACK_OFF)); ++ if (ret < 0) ++ hid_err(hdev, "Ally failed to send init command: %d\n", ret); ++ ++ return ret; ++} ++ ++static int ally_hid_probe(struct hid_device *hdev, const struct hid_device_id *_id) ++{ ++ struct usb_interface *intf = to_usb_interface(hdev->dev.parent); ++ struct usb_device *udev = interface_to_usbdev(intf); ++ u16 idProduct = le16_to_cpu(udev->descriptor.idProduct); ++ int ret, ep; ++ ++ ep = get_endpoint_address(hdev); ++ if (ep < 0) ++ return ep; ++ ++ if (ep != ROG_ALLY_CFG_INTF_IN && ++ ep != ROG_ALLY_X_INTF_IN) ++ return -ENODEV; ++ ++ ret = hid_parse(hdev); ++ if (ret) { ++ hid_err(hdev, "Parse failed\n"); ++ return ret; ++ } ++ ++ ret = hid_hw_start(hdev, HID_CONNECT_HIDRAW); ++ if (ret) { ++ hid_err(hdev, "Failed to start HID device\n"); ++ return ret; ++ } ++ ++ ret = hid_hw_open(hdev); ++ if (ret) { ++ hid_err(hdev, "Failed to open HID device\n"); ++ goto err_stop; ++ } ++ ++ /* Initialize MCU even before 
alloc */ ++ ret = ally_hid_init(hdev); ++ if (ret < 0) ++ return ret; ++ ++ drvdata.hdev = hdev; ++ hid_set_drvdata(hdev, &drvdata); ++ ++ /* This should almost always exist */ ++ if (ep == ROG_ALLY_CFG_INTF_IN) { ++ validate_mcu_fw_version(hdev, idProduct); ++ ++ drvdata.led_rgb_dev = ally_rgb_create(hdev); ++ if (IS_ERR(drvdata.led_rgb_dev)) ++ hid_err(hdev, "Failed to create Ally gamepad LEDs.\n"); ++ else ++ hid_info(hdev, "Created Ally RGB LED controls.\n"); ++ ++ drvdata.gamepad_cfg = ally_gamepad_cfg_create(hdev); ++ if (IS_ERR(drvdata.gamepad_cfg)) ++ hid_err(hdev, "Failed to create Ally gamepad attributes.\n"); ++ else ++ hid_info(hdev, "Created Ally gamepad attributes.\n"); ++ ++ if (IS_ERR(drvdata.led_rgb_dev) && IS_ERR(drvdata.gamepad_cfg)) ++ goto err_close; ++ } ++ ++ /* May or may not exist */ ++ if (ep == ROG_ALLY_X_INTF_IN) { ++ drvdata.ally_x = ally_x_create(hdev); ++ if (IS_ERR(drvdata.ally_x)) { ++ hid_err(hdev, "Failed to create Ally X gamepad.\n"); ++ drvdata.ally_x = NULL; ++ goto err_close; ++ } ++ hid_info(hdev, "Created Ally X controller.\n"); ++ ++ // Not required since this endpoint's input is sent through the gamepad input dev ++ if (drvdata.gamepad_cfg && drvdata.gamepad_cfg->input) { ++ input_unregister_device(drvdata.gamepad_cfg->input); ++ hid_info(hdev, "Ally X removed unneeded input dev.\n"); ++ } ++ } ++ ++ return 0; ++ ++err_close: ++ hid_hw_close(hdev); ++err_stop: ++ hid_hw_stop(hdev); ++ return ret; ++} ++ ++static void ally_hid_remove(struct hid_device *hdev) ++{ ++ if (drvdata.led_rgb_dev) ++ ally_rgb_remove(hdev); ++ ++ if (drvdata.ally_x) ++ ally_x_remove(hdev); ++ ++ if (drvdata.gamepad_cfg) ++ ally_cfg_remove(hdev); ++ ++ hid_hw_close(hdev); ++ hid_hw_stop(hdev); ++} ++ ++static int ally_hid_resume(struct hid_device *hdev) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ int err; ++ ++ if (!ally_cfg) ++ return 0; ++ ++ err = _gamepad_apply_all(hdev, ally_cfg); ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++static int ally_hid_reset_resume(struct hid_device *hdev) ++{ ++ int ep = get_endpoint_address(hdev); ++ if (ep != ROG_ALLY_CFG_INTF_IN) ++ return 0; ++ ++ ally_hid_init(hdev); ++ ally_rgb_resume(); ++ ++ return ally_hid_resume(hdev); ++} ++ ++static int ally_pm_thaw(struct device *dev) ++{ ++ struct hid_device *hdev = to_hid_device(dev); ++ ++ return ally_hid_reset_resume(hdev); ++} ++ ++static int ally_pm_suspend(struct device *dev) ++{ ++ if (drvdata.led_rgb_dev) { ++ ally_rgb_store_settings(); ++ } ++ ++ return 0; ++} ++ ++static const struct dev_pm_ops ally_pm_ops = { ++ .thaw = ally_pm_thaw, ++ .suspend = ally_pm_suspend, ++ .poweroff = ally_pm_suspend, ++}; ++ ++MODULE_DEVICE_TABLE(hid, rog_ally_devices); ++ ++static struct hid_driver rog_ally_cfg = { .name = "asus_rog_ally", ++ .id_table = rog_ally_devices, ++ .probe = ally_hid_probe, ++ .remove = ally_hid_remove, ++ .raw_event = ally_raw_event, ++ /* HID is the better place for resume functions, not pm_ops */ ++ .resume = ally_hid_resume, ++ /* Ally 1 requires this to reset device state correctly */ ++ .reset_resume = ally_hid_reset_resume, ++ .driver = { ++ .pm = &ally_pm_ops, ++ } ++}; ++ ++static int __init rog_ally_init(void) ++{ ++ return hid_register_driver(&rog_ally_cfg); ++} ++ ++static void __exit rog_ally_exit(void) ++{ ++ hid_unregister_driver(&rog_ally_cfg); ++} ++ ++module_init(rog_ally_init); ++module_exit(rog_ally_exit); ++ ++MODULE_IMPORT_NS("ASUS_WMI"); ++MODULE_IMPORT_NS("HID_ASUS"); ++MODULE_AUTHOR("Luke D.
Jones"); ++MODULE_DESCRIPTION("HID Driver for ASUS ROG Ally gamepad configuration."); ++MODULE_LICENSE("GPL"); +diff --git a/drivers/hid/hid-asus-ally.h b/drivers/hid/hid-asus-ally.h +new file mode 100644 +index 000000000000..c83817589082 +--- /dev/null ++++ b/drivers/hid/hid-asus-ally.h +@@ -0,0 +1,398 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later ++ * ++ * HID driver for Asus ROG laptops and Ally ++ * ++ * Copyright (c) 2023 Luke Jones ++ */ ++ ++#include ++#include ++ ++/* ++ * the xpad_mode is used inside the mode setting packet and is used ++ * for indexing (xpad_mode - 1) ++ */ ++enum xpad_mode { ++ xpad_mode_game = 0x01, ++ xpad_mode_wasd = 0x02, ++ xpad_mode_mouse = 0x03, ++}; ++ ++/* the xpad_cmd determines which feature is set or queried */ ++enum xpad_cmd { ++ xpad_cmd_set_mode = 0x01, ++ xpad_cmd_set_mapping = 0x02, ++ xpad_cmd_set_js_dz = 0x04, /* deadzones */ ++ xpad_cmd_set_tr_dz = 0x05, /* deadzones */ ++ xpad_cmd_set_vibe_intensity = 0x06, ++ xpad_cmd_set_leds = 0x08, ++ xpad_cmd_check_ready = 0x0A, ++ xpad_cmd_set_turbo = 0x0F, ++ xpad_cmd_set_response_curve = 0x13, ++ xpad_cmd_set_adz = 0x18, ++}; ++ ++/* the xpad_cmd determines which feature is set or queried */ ++enum xpad_cmd_len { ++ xpad_cmd_len_mode = 0x01, ++ xpad_cmd_len_mapping = 0x2c, ++ xpad_cmd_len_deadzone = 0x04, ++ xpad_cmd_len_vibe_intensity = 0x02, ++ xpad_cmd_len_leds = 0x0C, ++ xpad_cmd_len_turbo = 0x20, ++ xpad_cmd_len_response_curve = 0x09, ++ xpad_cmd_len_adz = 0x02, ++}; ++ ++/* Values correspond to the actual HID byte value required */ ++enum btn_pair_index { ++ btn_pair_dpad_u_d = 0x01, ++ btn_pair_dpad_l_r = 0x02, ++ btn_pair_ls_rs = 0x03, ++ btn_pair_lb_rb = 0x04, ++ btn_pair_a_b = 0x05, ++ btn_pair_x_y = 0x06, ++ btn_pair_view_menu = 0x07, ++ btn_pair_m1_m2 = 0x08, ++ btn_pair_lt_rt = 0x09, ++}; ++ ++#define BTN_PAD_A 0x0101000000000000 ++#define BTN_PAD_B 0x0102000000000000 ++#define BTN_PAD_X 0x0103000000000000 ++#define BTN_PAD_Y 0x0104000000000000 ++#define BTN_PAD_LB 0x0105000000000000 ++#define BTN_PAD_RB 0x0106000000000000 ++#define BTN_PAD_LS 0x0107000000000000 ++#define BTN_PAD_RS 0x0108000000000000 ++#define BTN_PAD_DPAD_UP 0x0109000000000000 ++#define BTN_PAD_DPAD_DOWN 0x010A000000000000 ++#define BTN_PAD_DPAD_LEFT 0x010B000000000000 ++#define BTN_PAD_DPAD_RIGHT 0x010C000000000000 ++#define BTN_PAD_LT 0x010D000000000000 ++#define BTN_PAD_RT 0x010E000000000000 ++#define BTN_PAD_VIEW 0x0111000000000000 ++#define BTN_PAD_MENU 0x0112000000000000 ++#define BTN_PAD_XBOX 0x0113000000000000 ++ ++#define BTN_KB_M2 0x02008E0000000000 ++#define BTN_KB_M1 0x02008F0000000000 ++#define BTN_KB_ESC 0x0200760000000000 ++#define BTN_KB_F1 0x0200500000000000 ++#define BTN_KB_F2 0x0200600000000000 ++#define BTN_KB_F3 0x0200400000000000 ++#define BTN_KB_F4 0x02000C0000000000 ++#define BTN_KB_F5 0x0200030000000000 ++#define BTN_KB_F6 0x02000B0000000000 ++#define BTN_KB_F7 0x0200800000000000 ++#define BTN_KB_F8 0x02000A0000000000 ++#define BTN_KB_F9 0x0200010000000000 ++#define BTN_KB_F10 0x0200090000000000 ++#define BTN_KB_F11 0x0200780000000000 ++#define BTN_KB_F12 0x0200070000000000 ++#define BTN_KB_F14 0x0200180000000000 ++#define BTN_KB_F15 0x0200100000000000 ++#define BTN_KB_BACKTICK 0x02000E0000000000 ++#define BTN_KB_1 0x0200160000000000 ++#define BTN_KB_2 0x02001E0000000000 ++#define BTN_KB_3 0x0200260000000000 ++#define BTN_KB_4 0x0200250000000000 ++#define BTN_KB_5 0x02002E0000000000 ++#define BTN_KB_6 0x0200360000000000 ++#define BTN_KB_7 0x02003D0000000000 ++#define BTN_KB_8 
0x02003E0000000000 ++#define BTN_KB_9 0x0200460000000000 ++#define BTN_KB_0 0x0200450000000000 ++#define BTN_KB_HYPHEN 0x02004E0000000000 ++#define BTN_KB_EQUALS 0x0200550000000000 ++#define BTN_KB_BACKSPACE 0x0200660000000000 ++#define BTN_KB_TAB 0x02000D0000000000 ++#define BTN_KB_Q 0x0200150000000000 ++#define BTN_KB_W 0x02001D0000000000 ++#define BTN_KB_E 0x0200240000000000 ++#define BTN_KB_R 0x02002D0000000000 ++#define BTN_KB_T 0x02002C0000000000 ++#define BTN_KB_Y 0x0200350000000000 ++#define BTN_KB_U 0x02003C0000000000 ++#define BTN_KB_O 0x0200440000000000 ++#define BTN_KB_P 0x02004D0000000000 ++#define BTN_KB_LBRACKET 0x0200540000000000 ++#define BTN_KB_RBRACKET 0x02005B0000000000 ++#define BTN_KB_BACKSLASH 0x02005D0000000000 ++#define BTN_KB_CAPS 0x0200580000000000 ++#define BTN_KB_A 0x02001C0000000000 ++#define BTN_KB_S 0x02001B0000000000 ++#define BTN_KB_D 0x0200230000000000 ++#define BTN_KB_F 0x02002B0000000000 ++#define BTN_KB_G 0x0200340000000000 ++#define BTN_KB_H 0x0200330000000000 ++#define BTN_KB_J 0x02003B0000000000 ++#define BTN_KB_K 0x0200420000000000 ++#define BTN_KB_L 0x02004B0000000000 ++#define BTN_KB_SEMI 0x02004C0000000000 ++#define BTN_KB_QUOTE 0x0200520000000000 ++#define BTN_KB_RET 0x02005A0000000000 ++#define BTN_KB_LSHIFT 0x0200880000000000 ++#define BTN_KB_Z 0x02001A0000000000 ++#define BTN_KB_X 0x0200220000000000 ++#define BTN_KB_C 0x0200210000000000 ++#define BTN_KB_V 0x02002A0000000000 ++#define BTN_KB_B 0x0200320000000000 ++#define BTN_KB_N 0x0200310000000000 ++#define BTN_KB_M 0x02003A0000000000 ++#define BTN_KB_COMMA 0x0200410000000000 ++#define BTN_KB_PERIOD 0x0200490000000000 ++#define BTN_KB_RSHIFT 0x0200890000000000 ++#define BTN_KB_LCTL 0x02008C0000000000 ++#define BTN_KB_META 0x0200820000000000 ++#define BTN_KB_LALT 0x02008A0000000000 ++#define BTN_KB_SPACE 0x0200290000000000 ++#define BTN_KB_RALT 0x02008B0000000000 ++#define BTN_KB_MENU 0x0200840000000000 ++#define BTN_KB_RCTL 0x02008D0000000000 ++#define BTN_KB_PRNTSCN 0x0200C30000000000 ++#define BTN_KB_SCRLCK 0x02007E0000000000 ++#define BTN_KB_PAUSE 0x0200910000000000 ++#define BTN_KB_INS 0x0200C20000000000 ++#define BTN_KB_HOME 0x0200940000000000 ++#define BTN_KB_PGUP 0x0200960000000000 ++#define BTN_KB_DEL 0x0200C00000000000 ++#define BTN_KB_END 0x0200950000000000 ++#define BTN_KB_PGDWN 0x0200970000000000 ++#define BTN_KB_UP_ARROW 0x0200980000000000 ++#define BTN_KB_DOWN_ARROW 0x0200990000000000 ++#define BTN_KB_LEFT_ARROW 0x0200910000000000 ++#define BTN_KB_RIGHT_ARROW 0x02009B0000000000 ++ ++#define BTN_NUMPAD_LOCK 0x0200770000000000 ++#define BTN_NUMPAD_FWDSLASH 0x0200900000000000 ++#define BTN_NUMPAD_ASTERISK 0x02007C0000000000 ++#define BTN_NUMPAD_HYPHEN 0x02007B0000000000 ++#define BTN_NUMPAD_0 0x0200700000000000 ++#define BTN_NUMPAD_1 0x0200690000000000 ++#define BTN_NUMPAD_2 0x0200720000000000 ++#define BTN_NUMPAD_3 0x02007A0000000000 ++#define BTN_NUMPAD_4 0x02006B0000000000 ++#define BTN_NUMPAD_5 0x0200730000000000 ++#define BTN_NUMPAD_6 0x0200740000000000 ++#define BTN_NUMPAD_7 0x02006C0000000000 ++#define BTN_NUMPAD_8 0x0200750000000000 ++#define BTN_NUMPAD_9 0x02007D0000000000 ++#define BTN_NUMPAD_PLUS 0x0200790000000000 ++#define BTN_NUMPAD_ENTER 0x0200810000000000 ++#define BTN_NUMPAD_PERIOD 0x0200710000000000 ++ ++#define BTN_MOUSE_LCLICK 0x0300000001000000 ++#define BTN_MOUSE_RCLICK 0x0300000002000000 ++#define BTN_MOUSE_MCLICK 0x0300000003000000 ++#define BTN_MOUSE_WHEEL_UP 0x0300000004000000 ++#define BTN_MOUSE_WHEEL_DOWN 0x0300000005000000 ++ ++#define 
BTN_MEDIA_SCREENSHOT 0x0500001600000000 ++#define BTN_MEDIA_SHOW_KEYBOARD 0x0500001900000000 ++#define BTN_MEDIA_SHOW_DESKTOP 0x0500001C00000000 ++#define BTN_MEDIA_START_RECORDING 0x0500001E00000000 ++#define BTN_MEDIA_MIC_OFF 0x0500000100000000 ++#define BTN_MEDIA_VOL_DOWN 0x0500000200000000 ++#define BTN_MEDIA_VOL_UP 0x0500000300000000 ++ ++#define ALLY_DEVICE_ATTR_WO(_name, _sysfs_name) \ ++ struct device_attribute dev_attr_##_name = \ ++ __ATTR(_sysfs_name, 0200, NULL, _name##_store) ++ ++/* required so we can have nested attributes with same name but different functions */ ++#define ALLY_DEVICE_ATTR_RW(_name, _sysfs_name) \ ++ struct device_attribute dev_attr_##_name = \ ++ __ATTR(_sysfs_name, 0644, _name##_show, _name##_store) ++ ++#define ALLY_DEVICE_ATTR_RO(_name, _sysfs_name) \ ++ struct device_attribute dev_attr_##_name = \ ++ __ATTR(_sysfs_name, 0444, _name##_show, NULL) ++ ++/* button specific macros */ ++#define ALLY_BTN_SHOW(_fname, _btn_name, _secondary) \ ++ static ssize_t _fname##_show(struct device *dev, \ ++ struct device_attribute *attr, char *buf) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct btn_data *btn; \ ++ const char* name; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ btn = &ally_cfg->key_mapping[ally_cfg->mode - 1]._btn_name; \ ++ name = btn_to_name(_secondary ? btn->macro : btn->button); \ ++ return sysfs_emit(buf, "%s\n", name); \ ++ } ++ ++#define ALLY_BTN_STORE(_fname, _btn_name, _secondary) \ ++ static ssize_t _fname##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct btn_data *btn; \ ++ u64 code; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ btn = &ally_cfg->key_mapping[ally_cfg->mode - 1]._btn_name; \ ++ code = name_to_btn(buf); \ ++ if (_secondary) \ ++ btn->macro = code; \ ++ else \ ++ btn->button = code; \ ++ return count; \ ++ } ++ ++#define ALLY_TURBO_SHOW(_fname, _btn_name) \ ++ static ssize_t _fname##_show(struct device *dev, \ ++ struct device_attribute *attr, char *buf) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct btn_data *btn; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ btn = &ally_cfg->key_mapping[ally_cfg->mode - 1]._btn_name; \ ++ return sysfs_emit(buf, "%d\n", btn->turbo); \ ++ } ++ ++#define ALLY_TURBO_STORE(_fname, _btn_name) \ ++ static ssize_t _fname##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct btn_data *btn; \ ++ bool turbo; \ ++ int ret; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ btn = &ally_cfg->key_mapping[ally_cfg->mode - 1]._btn_name; \ ++ ret = kstrtobool(buf, &turbo); \ ++ if (ret) \ ++ return ret; \ ++ btn->turbo = turbo; \ ++ return count; \ ++ } ++ ++#define ALLY_DEADZONE_SHOW(_fname, _axis_name) \ ++ static ssize_t _fname##_show(struct device *dev, \ ++ struct device_attribute *attr, char *buf) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct deadzone *dz; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ dz = &ally_cfg->_axis_name; \ ++ return sysfs_emit(buf, "%d %d\n", dz->inner, dz->outer); \ ++ } ++ ++#define ALLY_DEADZONE_STORE(_fname, _axis_name) \ ++ static ssize_t _fname##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct 
ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct hid_device *hdev = to_hid_device(dev); \ ++ u32 inner, outer; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ if (sscanf(buf, "%d %d", &inner, &outer) != 2) \ ++ return -EINVAL; \ ++ if (inner > 64 || outer > 64 || inner > outer) \ ++ return -EINVAL; \ ++ ally_cfg->_axis_name.inner = inner; \ ++ ally_cfg->_axis_name.outer = outer; \ ++ _gamepad_apply_deadzones(hdev, ally_cfg); \ ++ return count; \ ++ } ++ ++#define ALLY_DEADZONES(_fname, _mname) \ ++ ALLY_DEADZONE_SHOW(_fname##_deadzone, _mname); \ ++ ALLY_DEADZONE_STORE(_fname##_deadzone, _mname); \ ++ ALLY_DEVICE_ATTR_RW(_fname##_deadzone, deadzone) ++ ++/* response curve macros */ ++#define ALLY_RESP_CURVE_SHOW(_fname, _mname) \ ++static ssize_t _fname##_show(struct device *dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ return sysfs_emit(buf, "%d\n", ally_cfg->ls_rc._mname); \ ++ } ++ ++#define ALLY_RESP_CURVE_STORE(_fname, _mname) \ ++static ssize_t _fname##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ int ret, val; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ ret = kstrtoint(buf, 0, &val); \ ++ if (ret) \ ++ return ret; \ ++ if (val < 0 || val > 100) \ ++ return -EINVAL; \ ++ ally_cfg->ls_rc._mname = val; \ ++ return count; \ ++ } ++ ++/* _point_n must start at 1 */ ++#define ALLY_JS_RC_POINT(_fname, _mname, _num) \ ++ ALLY_RESP_CURVE_SHOW(_fname##_##_mname##_##_num, _mname##_pct_##_num); \ ++ ALLY_RESP_CURVE_STORE(_fname##_##_mname##_##_num, _mname##_pct_##_num); \ ++ ALLY_DEVICE_ATTR_RW(_fname##_##_mname##_##_num, curve_##_mname##_pct_##_num) ++ ++#define ALLY_BTN_ATTRS_GROUP(_name, _fname) \ ++ static struct attribute *_fname##_attrs[] = { \ ++ &dev_attr_##_fname.attr, \ ++ &dev_attr_##_fname##_macro.attr, \ ++ }; \ ++ static const struct attribute_group _fname##_attr_group = { \ ++ .name = __stringify(_name), \ ++ .attrs = _fname##_attrs, \ ++ } ++ ++#define _ALLY_BTN_REMAP(_fname, _btn_name) \ ++ ALLY_BTN_SHOW(btn_mapping_##_fname##_remap, _btn_name, false); \ ++ ALLY_BTN_STORE(btn_mapping_##_fname##_remap, _btn_name, false); \ ++ ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname##_remap, remap); ++ ++#define _ALLY_BTN_MACRO(_fname, _btn_name) \ ++ ALLY_BTN_SHOW(btn_mapping_##_fname##_macro, _btn_name, true); \ ++ ALLY_BTN_STORE(btn_mapping_##_fname##_macro, _btn_name, true); \ ++ ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname##_macro, macro_remap); ++ ++#define ALLY_BTN_MAPPING(_fname, _btn_name) \ ++ _ALLY_BTN_REMAP(_fname, _btn_name) \ ++ _ALLY_BTN_MACRO(_fname, _btn_name) \ ++ static struct attribute *_fname##_attrs[] = { \ ++ &dev_attr_btn_mapping_##_fname##_remap.attr, \ ++ &dev_attr_btn_mapping_##_fname##_macro.attr, \ ++ NULL, \ ++ }; \ ++ static const struct attribute_group btn_mapping_##_fname##_attr_group = { \ ++ .name = __stringify(btn_##_fname), \ ++ .attrs = _fname##_attrs, \ ++ } ++ ++#define ALLY_TURBO_BTN_MAPPING(_fname, _btn_name) \ ++ _ALLY_BTN_REMAP(_fname, _btn_name) \ ++ _ALLY_BTN_MACRO(_fname, _btn_name) \ ++ ALLY_TURBO_SHOW(btn_mapping_##_fname##_turbo, _btn_name); \ ++ ALLY_TURBO_STORE(btn_mapping_##_fname##_turbo, _btn_name); \ ++ ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname##_turbo, turbo); \ ++ static struct attribute *_fname##_turbo_attrs[] = { \ ++ 
&dev_attr_btn_mapping_##_fname##_remap.attr, \ ++ &dev_attr_btn_mapping_##_fname##_macro.attr, \ ++ &dev_attr_btn_mapping_##_fname##_turbo.attr, \ ++ NULL, \ ++ }; \ ++ static const struct attribute_group btn_mapping_##_fname##_attr_group = { \ ++ .name = __stringify(btn_##_fname), \ ++ .attrs = _fname##_turbo_attrs, \ ++ } +diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c +index d27dcfb2b9e4..188eff9d3573 100644 +--- a/drivers/hid/hid-asus.c ++++ b/drivers/hid/hid-asus.c +@@ -23,6 +23,7 @@ + /* + */ + ++#include "linux/export.h" + #include + #include + #include +@@ -33,6 +34,7 @@ + #include + + #include "hid-ids.h" ++#include "hid-asus.h" + + MODULE_AUTHOR("Yusuke Fujimaki "); + MODULE_AUTHOR("Brendan McGrath "); +@@ -601,7 +603,7 @@ static int mcu_request_version(struct hid_device *hdev) + return ret; + } + +-static void validate_mcu_fw_version(struct hid_device *hdev, int idProduct) ++void validate_mcu_fw_version(struct hid_device *hdev, int idProduct) + { + int min_version, version; + +@@ -629,12 +631,11 @@ static void validate_mcu_fw_version(struct hid_device *hdev, int idProduct) + set_ally_mcu_powersave(true); + } + } ++EXPORT_SYMBOL_NS(validate_mcu_fw_version, "HID_ASUS"); + + static int asus_kbd_register_leds(struct hid_device *hdev) + { + struct asus_drvdata *drvdata = hid_get_drvdata(hdev); +- struct usb_interface *intf; +- struct usb_device *udev; + unsigned char kbd_func; + int ret; + +@@ -659,12 +660,14 @@ static int asus_kbd_register_leds(struct hid_device *hdev) + return ret; + } + ++ #if !IS_REACHABLE(CONFIG_HID_ASUS_ALLY) + if (drvdata->quirks & QUIRK_ROG_ALLY_XPAD) { +- intf = to_usb_interface(hdev->dev.parent); +- udev = interface_to_usbdev(intf); ++ struct usb_interface *intf = to_usb_interface(hdev->dev.parent); ++ struct usb_device *udev = interface_to_usbdev(intf); + validate_mcu_fw_version(hdev, + le16_to_cpu(udev->descriptor.idProduct)); + } ++ #endif /* !IS_REACHABLE(CONFIG_HID_ASUS_ALLY) */ + + } else { + /* Initialize keyboard */ +@@ -1122,8 +1125,10 @@ static int __maybe_unused asus_reset_resume(struct hid_device *hdev) + + static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id) + { +- int ret; + struct asus_drvdata *drvdata; ++ struct usb_host_endpoint *ep; ++ struct usb_interface *intf; ++ int ret; + + drvdata = devm_kzalloc(&hdev->dev, sizeof(*drvdata), GFP_KERNEL); + if (drvdata == NULL) { +@@ -1135,6 +1140,18 @@ static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id) + + drvdata->quirks = id->driver_data; + ++ /* Ignore these endpoints as they are used by hid-asus-ally */ ++ #if IS_REACHABLE(CONFIG_HID_ASUS_ALLY) ++ if (drvdata->quirks & QUIRK_ROG_ALLY_XPAD) { ++ intf = to_usb_interface(hdev->dev.parent); ++ ep = intf->cur_altsetting->endpoint; ++ if (ep->desc.bEndpointAddress == ROG_ALLY_X_INTF_IN || ++ ep->desc.bEndpointAddress == ROG_ALLY_CFG_INTF_IN || ++ ep->desc.bEndpointAddress == ROG_ALLY_CFG_INTF_OUT) ++ return -ENODEV; ++ } ++ #endif /* IS_REACHABLE(CONFIG_HID_ASUS_ALLY) */ ++ + /* + * T90CHI's keyboard dock returns same ID values as T100CHI's dock. + * Thus, identify T90CHI dock with product name string. 
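The hid-asus-ally.h remap constants above are opaque 64-bit values; from the defines themselves, the top byte selects the source group (0x01 gamepad, 0x02 keyboard/numpad, 0x03 mouse, 0x05 media) and the key or button code sits in a group-specific byte. The stand-alone sketch below is illustrative only and not part of the patch; the three constants are copied from the header above.

#include <stdint.h>
#include <stdio.h>

/* Copied from hid-asus-ally.h above */
#define BTN_PAD_A		0x0101000000000000ULL
#define BTN_KB_M1		0x02008F0000000000ULL
#define BTN_MOUSE_LCLICK	0x0300000001000000ULL

/* Byte 0 of a mapping code selects the device group */
static unsigned int btn_group(uint64_t code)
{
	return (unsigned int)(code >> 56);
}

int main(void)
{
	const uint64_t samples[] = { BTN_PAD_A, BTN_KB_M1, BTN_MOUSE_LCLICK };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("0x%016llx -> group 0x%02x\n",
		       (unsigned long long)samples[i], btn_group(samples[i]));

	return 0;
}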
+diff --git a/drivers/hid/hid-asus.h b/drivers/hid/hid-asus.h +new file mode 100644 +index 000000000000..f67dd5a3a1bc +--- /dev/null ++++ b/drivers/hid/hid-asus.h +@@ -0,0 +1,13 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef __HID_ASUS_H ++#define __HID_ASUS_H ++ ++#include ++ ++#define ROG_ALLY_CFG_INTF_IN 0x83 ++#define ROG_ALLY_CFG_INTF_OUT 0x04 ++#define ROG_ALLY_X_INTF_IN 0x87 ++ ++void validate_mcu_fw_version(struct hid_device *hdev, int idProduct); ++ ++#endif /* __HID_ASUS_H */ +diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h +index 149798754570..a94b734266be 100644 +--- a/drivers/hid/hid-ids.h ++++ b/drivers/hid/hid-ids.h +@@ -225,6 +225,7 @@ + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD2 0x19b6 + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD3 0x1a30 + #define USB_DEVICE_ID_ASUSTEK_ROG_Z13_LIGHTBAR 0x18c6 ++#define USB_DEVICE_ID_ASUSTEK_ROG_RAIKIRI_PAD 0x1abb + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY 0x1abe + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X 0x1b4c + #define USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD 0x196b +diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig +index 6d238e120dce..fc45a7c8c201 100644 +--- a/drivers/platform/x86/Kconfig ++++ b/drivers/platform/x86/Kconfig +@@ -250,6 +250,18 @@ config ASUS_WIRELESS + If you choose to compile this driver as a module the module will be + called asus-wireless. + ++config ASUS_ARMOURY ++ tristate "ASUS Armoury driver" ++ depends on ASUS_WMI ++ select FW_ATTR_CLASS ++ help ++ Say Y here if you have a WMI aware Asus machine and would like to use the ++ firmware_attributes API to control various settings typically exposed in ++ the ASUS Armoury Crate application available on Windows. ++ ++ To compile this driver as a module, choose M here: the module will ++ be called asus-armoury. ++ + config ASUS_WMI + tristate "ASUS WMI Driver" + depends on ACPI_WMI +@@ -272,6 +284,17 @@ config ASUS_WMI + To compile this driver as a module, choose M here: the module will + be called asus-wmi. + ++config ASUS_WMI_DEPRECATED_ATTRS ++ bool "BIOS option support in WMI platform (DEPRECATED)" ++ depends on ASUS_WMI ++ default y ++ help ++ Say Y to expose the configurable BIOS options through the asus-wmi ++ driver. ++ ++ This can be used with or without the asus-armoury driver which ++ has the same attributes, but more, and better features. ++ + config ASUS_NB_WMI + tristate "Asus Notebook WMI Driver" + depends on ASUS_WMI +diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile +index a0c5848513e3..4279f5443f30 100644 +--- a/drivers/platform/x86/Makefile ++++ b/drivers/platform/x86/Makefile +@@ -32,6 +32,7 @@ obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o + # ASUS + obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o + obj-$(CONFIG_ASUS_WIRELESS) += asus-wireless.o ++obj-$(CONFIG_ASUS_ARMOURY) += asus-armoury.o + obj-$(CONFIG_ASUS_WMI) += asus-wmi.o + obj-$(CONFIG_ASUS_NB_WMI) += asus-nb-wmi.o + obj-$(CONFIG_ASUS_TF103C_DOCK) += asus-tf103c-dock.o +diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c +new file mode 100644 +index 000000000000..a461be936294 +--- /dev/null ++++ b/drivers/platform/x86/asus-armoury.c +@@ -0,0 +1,1174 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Asus Armoury (WMI) attributes driver. ++ * ++ * This driver uses the fw_attributes class to expose various WMI functions ++ * that are present in many gaming and some non-gaming ASUS laptops. 
++ * ++ * These typically don't fit anywhere else in the sysfs such as under LED class, ++ * hwmon or others, and are set in Windows using the ASUS Armoury Crate tool. ++ * ++ * Copyright(C) 2024 Luke Jones ++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "asus-armoury.h" ++#include "firmware_attributes_class.h" ++ ++#define ASUS_NB_WMI_EVENT_GUID "0B3CBB35-E3C2-45ED-91C2-4C5A6D195D1C" ++ ++#define ASUS_MINI_LED_MODE_MASK 0x03 ++/* Standard modes for devices with only on/off */ ++#define ASUS_MINI_LED_OFF 0x00 ++#define ASUS_MINI_LED_ON 0x01 ++/* Like "on" but the effect is more vibrant or brighter */ ++#define ASUS_MINI_LED_STRONG_MODE 0x02 ++/* New modes for devices with 3 mini-led mode types */ ++#define ASUS_MINI_LED_2024_WEAK 0x00 ++#define ASUS_MINI_LED_2024_STRONG 0x01 ++#define ASUS_MINI_LED_2024_OFF 0x02 ++ ++/* Power tunable attribute name defines */ ++#define ATTR_PPT_PL1_SPL "ppt_pl1_spl" ++#define ATTR_PPT_PL2_SPPT "ppt_pl2_sppt" ++#define ATTR_PPT_PL3_FPPT "ppt_pl3_fppt" ++#define ATTR_PPT_APU_SPPT "ppt_apu_sppt" ++#define ATTR_PPT_PLATFORM_SPPT "ppt_platform_sppt" ++#define ATTR_NV_DYNAMIC_BOOST "nv_dynamic_boost" ++#define ATTR_NV_TEMP_TARGET "nv_temp_target" ++#define ATTR_NV_BASE_TGP "nv_base_tgp" ++#define ATTR_NV_TGP "nv_tgp" ++ ++#define ASUS_POWER_CORE_MASK GENMASK(15, 8) ++#define ASUS_PERF_CORE_MASK GENMASK(7, 0) ++ ++enum cpu_core_type { ++ CPU_CORE_PERF = 0, ++ CPU_CORE_POWER, ++}; ++ ++enum cpu_core_value { ++ CPU_CORE_DEFAULT = 0, ++ CPU_CORE_MIN, ++ CPU_CORE_MAX, ++ CPU_CORE_CURRENT, ++}; ++ ++#define CPU_PERF_CORE_COUNT_MIN 4 ++#define CPU_POWR_CORE_COUNT_MIN 0 ++ ++/* Tunables provided by ASUS for gaming laptops */ ++struct cpu_cores { ++ u32 cur_perf_cores; ++ u32 min_perf_cores; ++ u32 max_perf_cores; ++ u32 cur_power_cores; ++ u32 min_power_cores; ++ u32 max_power_cores; ++}; ++ ++struct rog_tunables { ++ const struct power_limits *power_limits; ++ u32 ppt_pl1_spl; // cpu ++ u32 ppt_pl2_sppt; // cpu ++ u32 ppt_pl3_fppt; // cpu ++ u32 ppt_apu_sppt; // plat ++ u32 ppt_platform_sppt; // plat ++ ++ u32 nv_dynamic_boost; ++ u32 nv_temp_target; ++ u32 nv_tgp; ++}; ++ ++static struct asus_armoury_priv { ++ struct device *fw_attr_dev; ++ struct kset *fw_attr_kset; ++ ++ struct cpu_cores *cpu_cores; ++ /* Index 0 for DC, 1 for AC */ ++ struct rog_tunables *rog_tunables[2]; ++ u32 mini_led_dev_id; ++ u32 gpu_mux_dev_id; ++ /* ++ * Mutex to prevent big/little core count changes writing to same ++ * endpoint at the same time. Must lock during attr store. 
++ */ ++ struct mutex cpu_core_mutex; ++} asus_armoury = { ++ .cpu_core_mutex = __MUTEX_INITIALIZER(asus_armoury.cpu_core_mutex) ++}; ++ ++struct fw_attrs_group { ++ bool pending_reboot; ++}; ++ ++static struct fw_attrs_group fw_attrs = { ++ .pending_reboot = false, ++}; ++ ++struct asus_attr_group { ++ const struct attribute_group *attr_group; ++ u32 wmi_devid; ++}; ++ ++static bool asus_wmi_is_present(u32 dev_id) ++{ ++ u32 retval; ++ int status; ++ ++ status = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, &retval); ++ pr_debug("%s called (0x%08x), retval: 0x%08x\n", __func__, dev_id, retval); ++ ++ return status == 0 && (retval & ASUS_WMI_DSTS_PRESENCE_BIT); ++} ++ ++static void asus_set_reboot_and_signal_event(void) ++{ ++ fw_attrs.pending_reboot = true; ++ kobject_uevent(&asus_armoury.fw_attr_dev->kobj, KOBJ_CHANGE); ++} ++ ++static ssize_t pending_reboot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%d\n", fw_attrs.pending_reboot); ++} ++ ++static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot); ++ ++static bool asus_bios_requires_reboot(struct kobj_attribute *attr) ++{ ++ return !strcmp(attr->attr.name, "gpu_mux_mode") || ++ !strcmp(attr->attr.name, "cores_performance") || ++ !strcmp(attr->attr.name, "cores_efficiency") || ++ !strcmp(attr->attr.name, "panel_hd_mode"); ++} ++ ++static int armoury_wmi_set_devstate(struct kobj_attribute *attr, u32 value, u32 wmi_dev) ++{ ++ u32 result; ++ int err; ++ ++ err = asus_wmi_set_devstate(wmi_dev, value, &result); ++ if (err) { ++ pr_err("Failed to set %s: %d\n", attr->attr.name, err); ++ return err; ++ } ++ /* ++ * !1 is usually considered a fail by ASUS, but some WMI methods do use > 1 ++ * to return a status code or similar. ++ */ ++ if (result < 1) { ++ pr_err("Failed to set %s: (result): 0x%x\n", attr->attr.name, result); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++/** ++ * attr_uint_store() - Send a uint to a WMI method, checking it is within the min/max range (inclusive). ++ * @kobj: Pointer to the kobject the attribute belongs to. ++ * @attr: Pointer to the attribute calling this function. ++ * @buf: The buffer to read from; parsed as an unsigned integer. ++ * @count: Required by sysfs attribute macros, passed through from the calling store attribute. ++ * @min: Minimum accepted value. Below this returns -EINVAL. ++ * @max: Maximum accepted value. Above this returns -EINVAL. ++ * @store_value: Pointer to where the parsed value should be stored. ++ * @wmi_dev: The WMI function ID to use. ++ * ++ * This function is intended to be generic so it can be called from any "_store" ++ * attribute which works only with integers. The integer to be sent to the WMI method ++ * is range checked and an error is returned if out of range. ++ * ++ * If the value is valid and the WMI call succeeds, the sysfs attribute is notified, ++ * and if asus_bios_requires_reboot() is true the pending_reboot attribute is also signalled. ++ * ++ * Returns: Either count, or an error.
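++ * ++ * For example, the __WMI_STORE_INT() macro in asus-armoury.h generates thin store ++ * callbacks that simply forward to this function with the attribute's min/max ++ * limits and WMI device ID filled in.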
++ */ ++static ssize_t attr_uint_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, ++ size_t count, u32 min, u32 max, u32 *store_value, u32 wmi_dev) ++{ ++ u32 value; ++ int err; ++ ++ err = kstrtouint(buf, 10, &value); ++ if (err) ++ return err; ++ ++ if (value < min || value > max) ++ return -EINVAL; ++ ++ err = armoury_wmi_set_devstate(attr, value, wmi_dev); ++ if (err) ++ return err; ++ ++ if (store_value != NULL) ++ *store_value = value; ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ if (asus_bios_requires_reboot(attr)) ++ asus_set_reboot_and_signal_event(); ++ ++ return count; ++} ++ ++static ssize_t enum_type_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sysfs_emit(buf, "enumeration\n"); ++} ++ ++static ssize_t int_type_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sysfs_emit(buf, "integer\n"); ++} ++ ++/* Mini-LED mode **************************************************************/ ++static ssize_t mini_led_mode_current_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ u32 value; ++ int err; ++ ++ err = asus_wmi_get_devstate_dsts(asus_armoury.mini_led_dev_id, &value); ++ if (err) ++ return err; ++ ++ value &= ASUS_MINI_LED_MODE_MASK; ++ ++ /* ++ * Remap the mode values to match previous generation mini-LED. The last gen ++ * WMI 0 == off, while on this version WMI 2 == off (flipped). ++ */ ++ if (asus_armoury.mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2) { ++ switch (value) { ++ case ASUS_MINI_LED_2024_WEAK: ++ value = ASUS_MINI_LED_ON; ++ break; ++ case ASUS_MINI_LED_2024_STRONG: ++ value = ASUS_MINI_LED_STRONG_MODE; ++ break; ++ case ASUS_MINI_LED_2024_OFF: ++ value = ASUS_MINI_LED_OFF; ++ break; ++ } ++ } ++ ++ return sysfs_emit(buf, "%u\n", value); ++} ++ ++static ssize_t mini_led_mode_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ u32 mode; ++ int err; ++ ++ err = kstrtou32(buf, 10, &mode); ++ if (err) ++ return err; ++ ++ if (asus_armoury.mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE && ++ mode > ASUS_MINI_LED_ON) ++ return -EINVAL; ++ if (asus_armoury.mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2 && ++ mode > ASUS_MINI_LED_STRONG_MODE) ++ return -EINVAL; ++ ++ /* ++ * Remap the mode values so expected behaviour is the same as the last ++ * generation of mini-LED with 0 == off, 1 == on. 
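++ * Mapping applied below: 0 (ASUS_MINI_LED_OFF) -> ASUS_MINI_LED_2024_OFF (2), ++ * 1 (ASUS_MINI_LED_ON) -> ASUS_MINI_LED_2024_WEAK (0), and 2 ++ * (ASUS_MINI_LED_STRONG_MODE) -> ASUS_MINI_LED_2024_STRONG (1).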
++ */ ++ if (asus_armoury.mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2) { ++ switch (mode) { ++ case ASUS_MINI_LED_OFF: ++ mode = ASUS_MINI_LED_2024_OFF; ++ break; ++ case ASUS_MINI_LED_ON: ++ mode = ASUS_MINI_LED_2024_WEAK; ++ break; ++ case ASUS_MINI_LED_STRONG_MODE: ++ mode = ASUS_MINI_LED_2024_STRONG; ++ break; ++ } ++ } ++ ++ err = armoury_wmi_set_devstate(attr, mode, asus_armoury.mini_led_dev_id); ++ if (err) ++ return err; ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ return count; ++} ++ ++static ssize_t mini_led_mode_possible_values_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ switch (asus_armoury.mini_led_dev_id) { ++ case ASUS_WMI_DEVID_MINI_LED_MODE: ++ return sysfs_emit(buf, "0;1\n"); ++ case ASUS_WMI_DEVID_MINI_LED_MODE2: ++ return sysfs_emit(buf, "0;1;2\n"); ++ default: ++ return -ENODEV; ++ } ++} ++ ++ATTR_GROUP_ENUM_CUSTOM(mini_led_mode, "mini_led_mode", "Set the mini-LED backlight mode"); ++ ++static ssize_t gpu_mux_mode_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, const char *buf, ++ size_t count) ++{ ++ int result, err; ++ u32 optimus; ++ ++ err = kstrtou32(buf, 10, &optimus); ++ if (err) ++ return err; ++ ++ if (optimus > 1) ++ return -EINVAL; ++ ++ if (asus_wmi_is_present(ASUS_WMI_DEVID_DGPU)) { ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_DGPU, &result); ++ if (err) ++ return err; ++ if (result && !optimus) { ++ pr_warn("Can not switch MUX to dGPU mode when dGPU is disabled: %02X %02X\n", ++ result, optimus); ++ return -ENODEV; ++ } ++ } ++ ++ if (asus_wmi_is_present(ASUS_WMI_DEVID_EGPU)) { ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_EGPU, &result); ++ if (err) ++ return err; ++ if (result && !optimus) { ++ pr_warn("Can not switch MUX to dGPU mode when eGPU is enabled\n"); ++ return -EBUSY; ++ } ++ } ++ ++ err = armoury_wmi_set_devstate(attr, optimus, asus_armoury.gpu_mux_dev_id); ++ if (err) ++ return err; ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ asus_set_reboot_and_signal_event(); ++ ++ return count; ++} ++WMI_SHOW_INT(gpu_mux_mode_current_value, "%u\n", asus_armoury.gpu_mux_dev_id); ++ATTR_GROUP_BOOL_CUSTOM(gpu_mux_mode, "gpu_mux_mode", "Set the GPU display MUX mode"); ++ ++/* ++ * A user may be required to store the value twice, typical store first, then ++ * rescan PCI bus to activate power, then store a second time to save correctly. 
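++ * Illustrative sequence only (exact sysfs paths are an assumption and depend on ++ * how the firmware-attributes class is exposed): write current_value, trigger a ++ * rescan via /sys/bus/pci/rescan, then write current_value again.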
++ */ ++static ssize_t dgpu_disable_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, const char *buf, ++ size_t count) ++{ ++ int result, err; ++ u32 disable; ++ ++ err = kstrtou32(buf, 10, &disable); ++ if (err) ++ return err; ++ ++ if (disable > 1) ++ return -EINVAL; ++ ++ if (asus_armoury.gpu_mux_dev_id) { ++ err = asus_wmi_get_devstate_dsts(asus_armoury.gpu_mux_dev_id, &result); ++ if (err) ++ return err; ++ if (!result && disable) { ++ pr_warn("Cannot disable dGPU when the MUX is in dGPU mode\n"); ++ return -EBUSY; ++ } ++ } ++ ++ err = armoury_wmi_set_devstate(attr, disable, ASUS_WMI_DEVID_DGPU); ++ if (err) ++ return err; ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ return count; ++} ++WMI_SHOW_INT(dgpu_disable_current_value, "%d\n", ASUS_WMI_DEVID_DGPU); ++ATTR_GROUP_BOOL_CUSTOM(dgpu_disable, "dgpu_disable", "Disable the dGPU"); ++ ++/* The ACPI call to enable the eGPU also disables the internal dGPU */ ++static ssize_t egpu_enable_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int result, err; ++ u32 enable; ++ ++ err = kstrtou32(buf, 10, &enable); ++ if (err) ++ return err; ++ ++ if (enable > 1) ++ return -EINVAL; ++ ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_EGPU_CONNECTED, &result); ++ if (err) { ++ pr_warn("Failed to get eGPU connection status: %d\n", err); ++ return err; ++ } ++ ++ if (asus_armoury.gpu_mux_dev_id) { ++ err = asus_wmi_get_devstate_dsts(asus_armoury.gpu_mux_dev_id, &result); ++ if (err) { ++ pr_warn("Failed to get GPU MUX status: %d\n", err); ++ return err; ++ } ++ if (!result && enable) { ++ pr_warn("Cannot enable eGPU when the MUX is in dGPU mode\n"); ++ return -ENODEV; ++ } ++ } ++ ++ err = armoury_wmi_set_devstate(attr, enable, ASUS_WMI_DEVID_EGPU); ++ if (err) ++ return err; ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ return count; ++} ++WMI_SHOW_INT(egpu_enable_current_value, "%d\n", ASUS_WMI_DEVID_EGPU); ++ATTR_GROUP_BOOL_CUSTOM(egpu_enable, "egpu_enable", "Enable the eGPU (also disables dGPU)"); ++ ++/* Device memory available to APU */ ++ ++/* Value map for APU memory: some entries look out of order but are correct */ ++static u32 apu_mem_map[] = { ++ [0] = 0x000, /* called "AUTO" in the BIOS; this is the minimum available */ ++ [1] = 0x102, ++ [2] = 0x103, ++ [3] = 0x104, ++ [4] = 0x105, ++ [5] = 0x107, ++ [6] = 0x108, ++ [7] = 0x109, ++ [8] = 0x106, ++}; ++ ++static ssize_t apu_mem_current_value_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ int err; ++ u32 mem; ++ ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_APU_MEM, &mem); ++ if (err) ++ return err; ++ ++ if ((mem & ASUS_WMI_DSTS_PRESENCE_BIT) == 0) ++ return -ENODEV; ++ ++ mem &= ~ASUS_WMI_DSTS_PRESENCE_BIT; ++ ++ /* After 0x000 is set, a read will return 0x100 */ ++ if (mem == 0x100) ++ return sysfs_emit(buf, "0\n"); ++ ++ for (unsigned int i = 0; i < ARRAY_SIZE(apu_mem_map); i++) { ++ if (apu_mem_map[i] == mem) ++ return sysfs_emit(buf, "%u\n", i); ++ } ++ ++ pr_warn("Unrecognised value for APU mem 0x%08x\n", mem); ++ return sysfs_emit(buf, "%u\n", mem); ++} ++ ++static ssize_t apu_mem_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int result, err; ++ u32 requested, mem; ++ ++ result = kstrtou32(buf, 10, &requested); ++ if (result) ++ return result; ++ ++ if (requested >= ARRAY_SIZE(apu_mem_map)) ++ return -EINVAL; ++ ++ mem = apu_mem_map[requested]; ++ ++ err =
asus_wmi_set_devstate(ASUS_WMI_DEVID_APU_MEM, mem, &result); ++ if (err) { ++ pr_warn("Failed to set apu_mem: %d\n", err); ++ return err; ++ } ++ ++ pr_info("APU memory changed to %uGB, reboot required\n", requested+1); ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ asus_set_reboot_and_signal_event(); ++ ++ return count; ++} ++ ++static ssize_t apu_mem_possible_values_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ BUILD_BUG_ON(ARRAY_SIZE(apu_mem_map) != 9); ++ return sysfs_emit(buf, "0;1;2;3;4;5;6;7;8\n"); ++} ++ATTR_GROUP_ENUM_CUSTOM(apu_mem, "apu_mem", "Set available system RAM (in GB) for the APU to use"); ++ ++static int init_max_cpu_cores(void) ++{ ++ u32 cores; ++ int err; ++ ++ asus_armoury.cpu_cores = kzalloc(sizeof(struct cpu_cores), GFP_KERNEL); ++ if (!asus_armoury.cpu_cores) ++ return -ENOMEM; ++ ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_CORES_MAX, &cores); ++ if (err) ++ return err; ++ ++ if ((cores & ASUS_WMI_DSTS_PRESENCE_BIT) == 0) { ++ pr_err("ACPI does not support CPU core count control\n"); ++ err = -ENODEV; ++ goto init_max_cpu_cores_err; ++ } ++ ++ asus_armoury.cpu_cores->max_power_cores = FIELD_GET(ASUS_POWER_CORE_MASK, cores); ++ asus_armoury.cpu_cores->max_perf_cores = FIELD_GET(ASUS_PERF_CORE_MASK, cores); ++ ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_CORES, &cores); ++ if (err) { ++ pr_err("Could not get CPU core count: error %d\n", err); ++ goto init_max_cpu_cores_err; ++ } ++ ++ asus_armoury.cpu_cores->cur_perf_cores = FIELD_GET(ASUS_PERF_CORE_MASK, cores); ++ asus_armoury.cpu_cores->cur_power_cores = FIELD_GET(ASUS_POWER_CORE_MASK, cores); ++ ++ asus_armoury.cpu_cores->min_perf_cores = CPU_PERF_CORE_COUNT_MIN; ++ asus_armoury.cpu_cores->min_power_cores = CPU_POWR_CORE_COUNT_MIN; ++ ++ return 0; ++ ++init_max_cpu_cores_err: ++ kfree(asus_armoury.cpu_cores); ++ return err; ++} ++ ++static ssize_t cores_value_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, ++ enum cpu_core_type core_type, enum cpu_core_value core_value) ++{ ++ u32 cores; ++ ++ switch (core_value) { ++ case CPU_CORE_DEFAULT: ++ case CPU_CORE_MAX: ++ if (core_type == CPU_CORE_PERF) ++ return sysfs_emit(buf, "%u\n", ++ asus_armoury.cpu_cores->max_perf_cores); ++ else ++ return sysfs_emit(buf, "%u\n", ++ asus_armoury.cpu_cores->max_power_cores); ++ case CPU_CORE_MIN: ++ if (core_type == CPU_CORE_PERF) ++ return sysfs_emit(buf, "%u\n", ++ asus_armoury.cpu_cores->min_perf_cores); ++ else ++ return sysfs_emit(buf, "%u\n", ++ asus_armoury.cpu_cores->min_power_cores); ++ default: ++ break; ++ } ++ ++ if (core_type == CPU_CORE_PERF) ++ cores = asus_armoury.cpu_cores->cur_perf_cores; ++ else ++ cores = asus_armoury.cpu_cores->cur_power_cores; ++ ++ return sysfs_emit(buf, "%u\n", cores); ++} ++ ++static ssize_t cores_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, enum cpu_core_type core_type) ++{ ++ u32 new_cores, perf_cores, power_cores, out_val, min, max; ++ int result, err; ++ ++ result = kstrtou32(buf, 10, &new_cores); ++ if (result) ++ return result; ++ ++ scoped_guard(mutex, &asus_armoury.cpu_core_mutex) { ++ if (core_type == CPU_CORE_PERF) { ++ perf_cores = new_cores; ++ power_cores = asus_armoury.cpu_cores->cur_power_cores; ++ min = asus_armoury.cpu_cores->min_perf_cores; ++ max = asus_armoury.cpu_cores->max_perf_cores; ++ } else { ++ perf_cores = asus_armoury.cpu_cores->cur_perf_cores; ++ power_cores = new_cores; ++ min = asus_armoury.cpu_cores->min_power_cores; ++ max = 
asus_armoury.cpu_cores->max_power_cores; ++ } ++ ++ if (new_cores < min || new_cores > max) ++ return -EINVAL; ++ ++ out_val = FIELD_PREP(ASUS_PERF_CORE_MASK, perf_cores) | ++ FIELD_PREP(ASUS_POWER_CORE_MASK, power_cores); ++ ++ err = asus_wmi_set_devstate(ASUS_WMI_DEVID_CORES, out_val, &result); ++ if (err) { ++ pr_warn("Failed to set CPU core count: %d\n", err); ++ return err; ++ } ++ ++ if (result > 1) { ++ pr_warn("Failed to set CPU core count (result): 0x%x\n", result); ++ return -EIO; ++ } ++ } ++ ++ pr_info("CPU core count changed, reboot required\n"); ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ asus_set_reboot_and_signal_event(); ++ ++ return 0; ++} ++ ++static ssize_t cores_performance_min_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_PERF, CPU_CORE_MIN); ++} ++ ++static ssize_t cores_performance_max_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_PERF, CPU_CORE_MAX); ++} ++ ++static ssize_t cores_performance_default_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_PERF, CPU_CORE_DEFAULT); ++} ++ ++static ssize_t cores_performance_current_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_PERF, CPU_CORE_CURRENT); ++} ++ ++static ssize_t cores_performance_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ ++ err = cores_current_value_store(kobj, attr, buf, CPU_CORE_PERF); ++ if (err) ++ return err; ++ ++ return count; ++} ++ATTR_GROUP_CORES_RW(cores_performance, "cores_performance", ++ "Set the max available performance cores"); ++ ++static ssize_t cores_efficiency_min_value_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_POWER, CPU_CORE_MIN); ++} ++ ++static ssize_t cores_efficiency_max_value_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_POWER, CPU_CORE_MAX); ++} ++ ++static ssize_t cores_efficiency_default_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_POWER, CPU_CORE_DEFAULT); ++} ++ ++static ssize_t cores_efficiency_current_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_POWER, CPU_CORE_CURRENT); ++} ++ ++static ssize_t cores_efficiency_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, const char *buf, ++ size_t count) ++{ ++ int err; ++ ++ err = cores_current_value_store(kobj, attr, buf, CPU_CORE_POWER); ++ if (err) ++ return err; ++ ++ return count; ++} ++ATTR_GROUP_CORES_RW(cores_efficiency, "cores_efficiency", ++ "Set the max available efficiency cores"); ++ ++/* Define helper to access the current power mode tunable values */ ++static inline struct rog_tunables *get_current_tunables(void) ++{ ++ return asus_armoury ++ .rog_tunables[power_supply_is_system_supplied() ? 
1 : 0]; ++} ++ ++/* Simple attribute creation */ ++ATTR_GROUP_ROG_TUNABLE(ppt_pl1_spl, ATTR_PPT_PL1_SPL, ASUS_WMI_DEVID_PPT_PL1_SPL, ++ "Set the CPU slow package limit"); ++ATTR_GROUP_ROG_TUNABLE(ppt_pl2_sppt, ATTR_PPT_PL2_SPPT, ASUS_WMI_DEVID_PPT_PL2_SPPT, ++ "Set the CPU fast package limit"); ++ATTR_GROUP_ROG_TUNABLE(ppt_pl3_fppt, ATTR_PPT_PL3_FPPT, ASUS_WMI_DEVID_PPT_FPPT, ++ "Set the CPU fastest package limit"); ++ATTR_GROUP_ROG_TUNABLE(ppt_apu_sppt, ATTR_PPT_APU_SPPT, ASUS_WMI_DEVID_PPT_APU_SPPT, ++ "Set the APU package limit"); ++ATTR_GROUP_ROG_TUNABLE(ppt_platform_sppt, ATTR_PPT_PLATFORM_SPPT, ASUS_WMI_DEVID_PPT_PLAT_SPPT, ++ "Set the platform package limit"); ++ATTR_GROUP_ROG_TUNABLE(nv_dynamic_boost, ATTR_NV_DYNAMIC_BOOST, ASUS_WMI_DEVID_NV_DYN_BOOST, ++ "Set the Nvidia dynamic boost limit"); ++ATTR_GROUP_ROG_TUNABLE(nv_temp_target, ATTR_NV_TEMP_TARGET, ASUS_WMI_DEVID_NV_THERM_TARGET, ++ "Set the Nvidia max thermal limit"); ++ATTR_GROUP_ROG_TUNABLE(nv_tgp, "nv_tgp", ASUS_WMI_DEVID_DGPU_SET_TGP, ++ "Set the additional TGP on top of the base TGP"); ++ATTR_GROUP_INT_VALUE_ONLY_RO(nv_base_tgp, ATTR_NV_BASE_TGP, ASUS_WMI_DEVID_DGPU_BASE_TGP, ++ "Read the base TGP value"); ++ ++ ++ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2", ++ "Show the current mode of charging"); ++ ++ATTR_GROUP_BOOL_RW(boot_sound, "boot_sound", ASUS_WMI_DEVID_BOOT_SOUND, ++ "Set the boot POST sound"); ++ATTR_GROUP_BOOL_RW(mcu_powersave, "mcu_powersave", ASUS_WMI_DEVID_MCU_POWERSAVE, ++ "Set MCU powersaving mode"); ++ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD, ++ "Set the panel refresh overdrive"); ++ATTR_GROUP_BOOL_RW(panel_hd_mode, "panel_hd_mode", ASUS_WMI_DEVID_PANEL_HD, ++ "Set the panel HD mode to UHD<0> or FHD<1>"); ++ATTR_GROUP_BOOL_RW(screen_auto_brightness, "screen_auto_brightness", ++ ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS, ++ "Set the panel brightness to Off<0> or On<1>"); ++ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, ++ "Show the eGPU connection status"); ++ ++/* If an attribute does not require any special case handling add it here */ ++static const struct asus_attr_group armoury_attr_groups[] = { ++ { &egpu_connected_attr_group, ASUS_WMI_DEVID_EGPU_CONNECTED }, ++ { &egpu_enable_attr_group, ASUS_WMI_DEVID_EGPU }, ++ { &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU }, ++ { &apu_mem_attr_group, ASUS_WMI_DEVID_APU_MEM }, ++ { &cores_efficiency_attr_group, ASUS_WMI_DEVID_CORES_MAX }, ++ { &cores_performance_attr_group, ASUS_WMI_DEVID_CORES_MAX }, ++ ++ { &ppt_pl1_spl_attr_group, ASUS_WMI_DEVID_PPT_PL1_SPL }, ++ { &ppt_pl2_sppt_attr_group, ASUS_WMI_DEVID_PPT_PL2_SPPT }, ++ { &ppt_pl3_fppt_attr_group, ASUS_WMI_DEVID_PPT_FPPT }, ++ { &ppt_apu_sppt_attr_group, ASUS_WMI_DEVID_PPT_APU_SPPT }, ++ { &ppt_platform_sppt_attr_group, ASUS_WMI_DEVID_PPT_PLAT_SPPT }, ++ { &nv_dynamic_boost_attr_group, ASUS_WMI_DEVID_NV_DYN_BOOST }, ++ { &nv_temp_target_attr_group, ASUS_WMI_DEVID_NV_THERM_TARGET }, ++ { &nv_base_tgp_attr_group, ASUS_WMI_DEVID_DGPU_BASE_TGP }, ++ { &nv_tgp_attr_group, ASUS_WMI_DEVID_DGPU_SET_TGP }, ++ ++ { &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE }, ++ { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, ++ { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, ++ { &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD }, ++ { &panel_hd_mode_attr_group, ASUS_WMI_DEVID_PANEL_HD }, ++ { &screen_auto_brightness_attr_group, ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS }, ++}; ++ ++/** ++ * 
is_power_tunable_attr - Determines if an attribute is a power-related tunable ++ * @name: The name of the attribute to check ++ * ++ * This function checks if the given attribute name is related to power tuning. ++ * ++ * Return: true if the attribute is a power-related tunable, false otherwise ++ */ ++static bool is_power_tunable_attr(const char *name) ++{ ++ static const char * const power_tunable_attrs[] = { ++ ATTR_PPT_PL1_SPL, ATTR_PPT_PL2_SPPT, ++ ATTR_PPT_PL3_FPPT, ATTR_PPT_APU_SPPT, ++ ATTR_PPT_PLATFORM_SPPT, ATTR_NV_DYNAMIC_BOOST, ++ ATTR_NV_TEMP_TARGET, ATTR_NV_BASE_TGP, ++ ATTR_NV_TGP ++ }; ++ ++ for (unsigned int i = 0; i < ARRAY_SIZE(power_tunable_attrs); i++) { ++ if (!strcmp(name, power_tunable_attrs[i])) ++ return true; ++ } ++ ++ return false; ++} ++ ++/** ++ * has_valid_limit - Checks if a power-related attribute has a valid limit value ++ * @name: The name of the attribute to check ++ * @limits: Pointer to the power_limits structure containing limit values ++ * ++ * This function checks if a power-related attribute has a valid limit value. ++ * It returns false if limits is NULL or if the corresponding limit value is zero. ++ * ++ * Return: true if the attribute has a valid limit value, false otherwise ++ */ ++static bool has_valid_limit(const char *name, const struct power_limits *limits) ++{ ++ u32 limit_value = 0; ++ ++ if (!limits) ++ return false; ++ ++ if (!strcmp(name, ATTR_PPT_PL1_SPL)) ++ limit_value = limits->ppt_pl1_spl_max; ++ else if (!strcmp(name, ATTR_PPT_PL2_SPPT)) ++ limit_value = limits->ppt_pl2_sppt_max; ++ else if (!strcmp(name, ATTR_PPT_PL3_FPPT)) ++ limit_value = limits->ppt_pl3_fppt_max; ++ else if (!strcmp(name, ATTR_PPT_APU_SPPT)) ++ limit_value = limits->ppt_apu_sppt_max; ++ else if (!strcmp(name, ATTR_PPT_PLATFORM_SPPT)) ++ limit_value = limits->ppt_platform_sppt_max; ++ else if (!strcmp(name, ATTR_NV_DYNAMIC_BOOST)) ++ limit_value = limits->nv_dynamic_boost_max; ++ else if (!strcmp(name, ATTR_NV_TEMP_TARGET)) ++ limit_value = limits->nv_temp_target_max; ++ else if (!strcmp(name, ATTR_NV_BASE_TGP) || ++ !strcmp(name, ATTR_NV_TGP)) ++ limit_value = limits->nv_tgp_max; ++ ++ return limit_value > 0; ++} ++ ++static int asus_fw_attr_add(void) ++{ ++ const struct power_limits *limits; ++ bool should_create; ++ const char *name; ++ int err, i; ++ ++ asus_armoury.fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0), ++ NULL, "%s", DRIVER_NAME); ++ if (IS_ERR(asus_armoury.fw_attr_dev)) { ++ err = PTR_ERR(asus_armoury.fw_attr_dev); ++ goto fail_class_get; ++ } ++ ++ asus_armoury.fw_attr_kset = kset_create_and_add("attributes", NULL, ++ &asus_armoury.fw_attr_dev->kobj); ++ if (!asus_armoury.fw_attr_kset) { ++ err = -ENOMEM; ++ goto err_destroy_classdev; ++ } ++ ++ err = sysfs_create_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); ++ if (err) { ++ pr_err("Failed to create sysfs level attributes\n"); ++ goto err_destroy_kset; ++ } ++ ++ asus_armoury.mini_led_dev_id = 0; ++ if (asus_wmi_is_present(ASUS_WMI_DEVID_MINI_LED_MODE)) ++ asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE; ++ else if (asus_wmi_is_present(ASUS_WMI_DEVID_MINI_LED_MODE2)) ++ asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE2; ++ ++ if (asus_armoury.mini_led_dev_id) { ++ err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, ++ &mini_led_mode_attr_group); ++ if (err) { ++ pr_err("Failed to create sysfs-group for mini_led\n"); ++ goto err_remove_file; ++ } ++ } ++ ++ asus_armoury.gpu_mux_dev_id = 0; ++ if 
(asus_wmi_is_present(ASUS_WMI_DEVID_GPU_MUX)) ++ asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX; ++ else if (asus_wmi_is_present(ASUS_WMI_DEVID_GPU_MUX_VIVO)) ++ asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX_VIVO; ++ ++ if (asus_armoury.gpu_mux_dev_id) { ++ err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, ++ &gpu_mux_mode_attr_group); ++ if (err) { ++ pr_err("Failed to create sysfs-group for gpu_mux\n"); ++ goto err_remove_mini_led_group; ++ } ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(armoury_attr_groups); i++) { ++ if (!asus_wmi_is_present(armoury_attr_groups[i].wmi_devid)) ++ continue; ++ ++ /* Always create by default, unless PPT is not present */ ++ should_create = true; ++ name = armoury_attr_groups[i].attr_group->name; ++ ++ /* Check if this is a power-related tunable requiring limits */ ++ if (asus_armoury.rog_tunables[1] && asus_armoury.rog_tunables[1]->power_limits && ++ is_power_tunable_attr(name)) { ++ limits = asus_armoury.rog_tunables[1]->power_limits; ++ /* Check only AC, if DC is not present then AC won't be either */ ++ should_create = has_valid_limit(name, limits); ++ if (!should_create) { ++ pr_debug("Missing max value on %s for tunable: %s\n", ++ dmi_get_system_info(DMI_BOARD_NAME), name); ++ } ++ } ++ ++ if (should_create) { ++ err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, ++ armoury_attr_groups[i].attr_group); ++ if (err) { ++ pr_err("Failed to create sysfs-group for %s\n", ++ armoury_attr_groups[i].attr_group->name); ++ goto err_remove_groups; ++ } ++ } ++ } ++ ++ return 0; ++ ++err_remove_groups: ++ while (i--) { ++ if (asus_wmi_is_present(armoury_attr_groups[i].wmi_devid)) ++ sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, ++ armoury_attr_groups[i].attr_group); ++ } ++ if (asus_armoury.gpu_mux_dev_id) ++ sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &gpu_mux_mode_attr_group); ++err_remove_mini_led_group: ++ if (asus_armoury.mini_led_dev_id) ++ sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &mini_led_mode_attr_group); ++err_remove_file: ++ sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); ++err_destroy_kset: ++ kset_unregister(asus_armoury.fw_attr_kset); ++err_destroy_classdev: ++fail_class_get: ++ device_destroy(&firmware_attributes_class, MKDEV(0, 0)); ++ return err; ++} ++ ++/* Init / exit ****************************************************************/ ++ ++/* Set up the min/max and defaults for ROG tunables */ ++static void init_rog_tunables(void) ++{ ++ const struct power_limits *ac_limits, *dc_limits; ++ const struct power_data *power_data; ++ const struct dmi_system_id *dmi_id; ++ bool ac_initialized = false, dc_initialized = false; ++ ++ /* Match the system against the power_limits table */ ++ dmi_id = dmi_first_match(power_limits); ++ if (!dmi_id) { ++ pr_warn("No matching power limits found for this system\n"); ++ return; ++ } ++ ++ /* Get the power data for this system */ ++ power_data = dmi_id->driver_data; ++ if (!power_data) { ++ pr_info("No power data available for this system\n"); ++ return; ++ } ++ ++ /* Initialize AC power tunables */ ++ ac_limits = power_data->ac_data; ++ if (ac_limits) { ++ asus_armoury.rog_tunables[1] = ++ kzalloc(sizeof(*asus_armoury.rog_tunables[1]), GFP_KERNEL); ++ if (!asus_armoury.rog_tunables[1]) ++ goto err_nomem; ++ ++ asus_armoury.rog_tunables[1]->power_limits = ac_limits; ++ ++ /* Set initial AC values */ ++ asus_armoury.rog_tunables[1]->ppt_pl1_spl = ++ ac_limits->ppt_pl1_spl_def ? 
++ ac_limits->ppt_pl1_spl_def : ++ ac_limits->ppt_pl1_spl_max; ++ ++ asus_armoury.rog_tunables[1]->ppt_pl2_sppt = ++ ac_limits->ppt_pl2_sppt_def ? ++ ac_limits->ppt_pl2_sppt_def : ++ ac_limits->ppt_pl2_sppt_max; ++ ++ asus_armoury.rog_tunables[1]->ppt_pl3_fppt = ++ ac_limits->ppt_pl3_fppt_def ? ++ ac_limits->ppt_pl3_fppt_def : ++ ac_limits->ppt_pl3_fppt_max; ++ ++ asus_armoury.rog_tunables[1]->ppt_apu_sppt = ++ ac_limits->ppt_apu_sppt_def ? ++ ac_limits->ppt_apu_sppt_def : ++ ac_limits->ppt_apu_sppt_max; ++ ++ asus_armoury.rog_tunables[1]->ppt_platform_sppt = ++ ac_limits->ppt_platform_sppt_def ? ++ ac_limits->ppt_platform_sppt_def : ++ ac_limits->ppt_platform_sppt_max; ++ ++ asus_armoury.rog_tunables[1]->nv_dynamic_boost = ++ ac_limits->nv_dynamic_boost_max; ++ asus_armoury.rog_tunables[1]->nv_temp_target = ++ ac_limits->nv_temp_target_max; ++ asus_armoury.rog_tunables[1]->nv_tgp = ac_limits->nv_tgp_max; ++ ++ ac_initialized = true; ++ pr_debug("AC power limits initialized for %s\n", dmi_id->matches[0].substr); ++ } ++ ++ /* Initialize DC power tunables */ ++ dc_limits = power_data->dc_data; ++ if (dc_limits) { ++ asus_armoury.rog_tunables[0] = ++ kzalloc(sizeof(*asus_armoury.rog_tunables[0]), GFP_KERNEL); ++ if (!asus_armoury.rog_tunables[0]) { ++ if (ac_initialized) ++ kfree(asus_armoury.rog_tunables[1]); ++ goto err_nomem; ++ } ++ ++ asus_armoury.rog_tunables[0]->power_limits = dc_limits; ++ ++ /* Set initial DC values */ ++ asus_armoury.rog_tunables[0]->ppt_pl1_spl = ++ dc_limits->ppt_pl1_spl_def ? ++ dc_limits->ppt_pl1_spl_def : ++ dc_limits->ppt_pl1_spl_max; ++ ++ asus_armoury.rog_tunables[0]->ppt_pl2_sppt = ++ dc_limits->ppt_pl2_sppt_def ? ++ dc_limits->ppt_pl2_sppt_def : ++ dc_limits->ppt_pl2_sppt_max; ++ ++ asus_armoury.rog_tunables[0]->ppt_pl3_fppt = ++ dc_limits->ppt_pl3_fppt_def ? ++ dc_limits->ppt_pl3_fppt_def : ++ dc_limits->ppt_pl3_fppt_max; ++ ++ asus_armoury.rog_tunables[0]->ppt_apu_sppt = ++ dc_limits->ppt_apu_sppt_def ? ++ dc_limits->ppt_apu_sppt_def : ++ dc_limits->ppt_apu_sppt_max; ++ ++ asus_armoury.rog_tunables[0]->ppt_platform_sppt = ++ dc_limits->ppt_platform_sppt_def ? ++ dc_limits->ppt_platform_sppt_def : ++ dc_limits->ppt_platform_sppt_max; ++ ++ asus_armoury.rog_tunables[0]->nv_dynamic_boost = ++ dc_limits->nv_dynamic_boost_max; ++ asus_armoury.rog_tunables[0]->nv_temp_target = ++ dc_limits->nv_temp_target_max; ++ asus_armoury.rog_tunables[0]->nv_tgp = dc_limits->nv_tgp_max; ++ ++ dc_initialized = true; ++ pr_debug("DC power limits initialized for %s\n", dmi_id->matches[0].substr); ++ } ++ ++ if (!ac_initialized) ++ pr_debug("No AC PPT limits defined\n"); ++ ++ if (!dc_initialized) ++ pr_debug("No DC PPT limits defined\n"); ++ ++ return; ++ ++err_nomem: ++ pr_err("Failed to allocate memory for tunables\n"); ++} ++ ++static int __init asus_fw_init(void) ++{ ++ char *wmi_uid; ++ int err; ++ ++ wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID); ++ if (!wmi_uid) ++ return -ENODEV; ++ ++ /* ++ * if equal to "ASUSWMI" then it's DCTS that can't be used for this ++ * driver, DSTS is required. 
++ */ ++ if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) ++ return -ENODEV; ++ ++ if (asus_wmi_is_present(ASUS_WMI_DEVID_CORES_MAX)) { ++ err = init_max_cpu_cores(); ++ if (err) { ++ pr_err("Could not initialise CPU core control %d\n", err); ++ return err; ++ } ++ } ++ ++ init_rog_tunables(); ++ ++ /* Must always be last step to ensure data is available */ ++ return asus_fw_attr_add(); ++} ++ ++static void __exit asus_fw_exit(void) ++{ ++ sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); ++ kset_unregister(asus_armoury.fw_attr_kset); ++ device_destroy(&firmware_attributes_class, MKDEV(0, 0)); ++ ++ kfree(asus_armoury.rog_tunables[0]); ++ kfree(asus_armoury.rog_tunables[1]); ++} ++ ++module_init(asus_fw_init); ++module_exit(asus_fw_exit); ++ ++MODULE_IMPORT_NS("ASUS_WMI"); ++MODULE_AUTHOR("Luke Jones "); ++MODULE_DESCRIPTION("ASUS BIOS Configuration Driver"); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("wmi:" ASUS_NB_WMI_EVENT_GUID); +diff --git a/drivers/platform/x86/asus-armoury.h b/drivers/platform/x86/asus-armoury.h +new file mode 100644 +index 000000000000..438768ea14cc +--- /dev/null ++++ b/drivers/platform/x86/asus-armoury.h +@@ -0,0 +1,1278 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * ++ * Definitions for kernel modules using asus-armoury driver ++ * ++ * Copyright (c) 2024 Luke Jones ++ */ ++ ++#ifndef _ASUS_ARMOURY_H_ ++#define _ASUS_ARMOURY_H_ ++ ++#include ++#include ++#include ++ ++#define DRIVER_NAME "asus-armoury" ++ ++#define __ASUS_ATTR_RO(_func, _name) \ ++ { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = _func##_##_name##_show, \ ++ } ++ ++#define __ASUS_ATTR_RO_AS(_name, _show) \ ++ { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = _show, \ ++ } ++ ++#define __ASUS_ATTR_RW(_func, _name) \ ++ __ATTR(_name, 0644, _func##_##_name##_show, _func##_##_name##_store) ++ ++#define __WMI_STORE_INT(_attr, _min, _max, _wmi) \ ++ static ssize_t _attr##_store(struct kobject *kobj, \ ++ struct kobj_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ return attr_uint_store(kobj, attr, buf, count, _min, \ ++ _max, NULL, _wmi); \ ++ } ++ ++#define WMI_SHOW_INT(_attr, _fmt, _wmi) \ ++ static ssize_t _attr##_show(struct kobject *kobj, \ ++ struct kobj_attribute *attr, char *buf) \ ++ { \ ++ u32 result; \ ++ int err; \ ++ \ ++ err = asus_wmi_get_devstate_dsts(_wmi, &result); \ ++ if (err) \ ++ return err; \ ++ return sysfs_emit(buf, _fmt, \ ++ result & ~ASUS_WMI_DSTS_PRESENCE_BIT); \ ++ } ++ ++/* Create functions and attributes for use in other macros or on their own */ ++ ++/* Shows a formatted static variable */ ++#define __ATTR_SHOW_FMT(_prop, _attrname, _fmt, _val) \ ++ static ssize_t _attrname##_##_prop##_show( \ ++ struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ ++ { \ ++ return sysfs_emit(buf, _fmt, _val); \ ++ } \ ++ static struct kobj_attribute attr_##_attrname##_##_prop = \ ++ __ASUS_ATTR_RO(_attrname, _prop) ++ ++#define __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname)\ ++ WMI_SHOW_INT(_attrname##_current_value, "%d\n", _wmi); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RO(_attrname, current_value); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, enum_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ 
&attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_possible_values.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++#define __ATTR_RW_INT_GROUP_ENUM(_attrname, _minv, _maxv, _wmi, _fsname,\ ++ _possible, _dispname) \ ++ __WMI_STORE_INT(_attrname##_current_value, _minv, _maxv, _wmi); \ ++ WMI_SHOW_INT(_attrname##_current_value, "%d\n", _wmi); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RW(_attrname, current_value); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, enum_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_possible_values.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++/* Boolean style enumeration, base macro. Requires adding show/store */ ++#define __ATTR_GROUP_ENUM(_attrname, _fsname, _possible, _dispname) \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, enum_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_possible_values.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++#define ATTR_GROUP_BOOL_RO(_attrname, _fsname, _wmi, _dispname) \ ++ __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, "0;1", _dispname) ++ ++ ++#define ATTR_GROUP_BOOL_RW(_attrname, _fsname, _wmi, _dispname) \ ++ __ATTR_RW_INT_GROUP_ENUM(_attrname, 0, 1, _wmi, _fsname, "0;1", _dispname) ++ ++#define ATTR_GROUP_ENUM_INT_RO(_attrname, _fsname, _wmi, _possible, _dispname) \ ++ __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname) ++ ++/* ++ * Requires _current_value_show(), _current_value_store() ++ */ ++#define ATTR_GROUP_BOOL_CUSTOM(_attrname, _fsname, _dispname) \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RW(_attrname, current_value); \ ++ __ATTR_GROUP_ENUM(_attrname, _fsname, "0;1", _dispname) ++ ++/* ++ * Requires _current_value_show(), _current_value_store() ++ * and _possible_values_show() ++ */ ++#define ATTR_GROUP_ENUM_CUSTOM(_attrname, _fsname, _dispname) \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RW(_attrname, current_value); \ ++ static struct kobj_attribute attr_##_attrname##_possible_values = \ ++ __ASUS_ATTR_RO(_attrname, possible_values); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, enum_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++
&attr_##_attrname##_possible_values.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++/* CPU core attributes need slightly different setup */ ++#define ATTR_GROUP_CORES_RW(_attrname, _fsname, _dispname) \ ++ __ATTR_SHOW_FMT(scalar_increment, _attrname, "%d\n", 1); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RW(_attrname, current_value); \ ++ static struct kobj_attribute attr_##_attrname##_default_value = \ ++ __ASUS_ATTR_RO(_attrname, default_value); \ ++ static struct kobj_attribute attr_##_attrname##_min_value = \ ++ __ASUS_ATTR_RO(_attrname, min_value); \ ++ static struct kobj_attribute attr_##_attrname##_max_value = \ ++ __ASUS_ATTR_RO(_attrname, max_value); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, int_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_default_value.attr, \ ++ &attr_##_attrname##_min_value.attr, \ ++ &attr_##_attrname##_max_value.attr, \ ++ &attr_##_attrname##_scalar_increment.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++#define ATTR_GROUP_INT_VALUE_ONLY_RO(_attrname, _fsname, _wmi, _dispname) \ ++ WMI_SHOW_INT(_attrname##_current_value, "%d\n", _wmi); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RO(_attrname, current_value); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, int_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_type.attr, NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++/* ++ * ROG PPT attributes need slightly different setup as they ++ * require rog_tunables members. ++ */ ++ ++#define __ROG_TUNABLE_SHOW(_prop, _attrname, _val) \ ++ static ssize_t _attrname##_##_prop##_show( \ ++ struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ ++ { \ ++ struct rog_tunables *tunables = get_current_tunables(); \ ++ \ ++ if (!tunables || !tunables->power_limits) \ ++ return -ENODEV; \ ++ \ ++ return sysfs_emit(buf, "%d\n", tunables->power_limits->_val); \ ++ } \ ++ static struct kobj_attribute attr_##_attrname##_##_prop = \ ++ __ASUS_ATTR_RO(_attrname, _prop) ++ ++#define __ROG_TUNABLE_SHOW_DEFAULT(_attrname) \ ++ static ssize_t _attrname##_default_value_show( \ ++ struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ ++ { \ ++ struct rog_tunables *tunables = get_current_tunables(); \ ++ \ ++ if (!tunables || !tunables->power_limits) \ ++ return -ENODEV; \ ++ \ ++ return sysfs_emit( \ ++ buf, "%d\n", \ ++ tunables->power_limits->_attrname##_def ?
\ ++ tunables->power_limits->_attrname##_def : \ ++ tunables->power_limits->_attrname##_max); \ ++ } \ ++ static struct kobj_attribute attr_##_attrname##_default_value = \ ++ __ASUS_ATTR_RO(_attrname, default_value) ++ ++#define __ROG_TUNABLE_RW(_attr, _wmi) \ ++ static ssize_t _attr##_current_value_store( \ ++ struct kobject *kobj, struct kobj_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct rog_tunables *tunables = get_current_tunables(); \ ++ \ ++ if (!tunables || !tunables->power_limits) \ ++ return -ENODEV; \ ++ \ ++ return attr_uint_store(kobj, attr, buf, count, \ ++ tunables->power_limits->_attr##_min, \ ++ tunables->power_limits->_attr##_max, \ ++ &tunables->_attr, _wmi); \ ++ } \ ++ static ssize_t _attr##_current_value_show( \ ++ struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ ++ { \ ++ struct rog_tunables *tunables = get_current_tunables(); \ ++ \ ++ if (!tunables) \ ++ return -ENODEV; \ ++ \ ++ return sysfs_emit(buf, "%u\n", tunables->_attr); \ ++ } \ ++ static struct kobj_attribute attr_##_attr##_current_value = \ ++ __ASUS_ATTR_RW(_attr, current_value) ++ ++#define ATTR_GROUP_ROG_TUNABLE(_attrname, _fsname, _wmi, _dispname) \ ++ __ROG_TUNABLE_RW(_attrname, _wmi); \ ++ __ROG_TUNABLE_SHOW_DEFAULT(_attrname); \ ++ __ROG_TUNABLE_SHOW(min_value, _attrname, _attrname##_min); \ ++ __ROG_TUNABLE_SHOW(max_value, _attrname, _attrname##_max); \ ++ __ATTR_SHOW_FMT(scalar_increment, _attrname, "%d\n", 1); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, int_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_default_value.attr, \ ++ &attr_##_attrname##_min_value.attr, \ ++ &attr_##_attrname##_max_value.attr, \ ++ &attr_##_attrname##_scalar_increment.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++/* Default is always the maximum value unless *_def is specified */ ++struct power_limits { ++ u8 ppt_pl1_spl_min; ++ u8 ppt_pl1_spl_def; ++ u8 ppt_pl1_spl_max; ++ u8 ppt_pl2_sppt_min; ++ u8 ppt_pl2_sppt_def; ++ u8 ppt_pl2_sppt_max; ++ u8 ppt_pl3_fppt_min; ++ u8 ppt_pl3_fppt_def; ++ u8 ppt_pl3_fppt_max; ++ u8 ppt_apu_sppt_min; ++ u8 ppt_apu_sppt_def; ++ u8 ppt_apu_sppt_max; ++ u8 ppt_platform_sppt_min; ++ u8 ppt_platform_sppt_def; ++ u8 ppt_platform_sppt_max; ++ /* Nvidia GPU specific, default is always max */ ++ u8 nv_dynamic_boost_def; // unused. exists for macro ++ u8 nv_dynamic_boost_min; ++ u8 nv_dynamic_boost_max; ++ u8 nv_temp_target_def; // unused. exists for macro ++ u8 nv_temp_target_min; ++ u8 nv_temp_target_max; ++ u8 nv_tgp_def; // unused. exists for macro ++ u8 nv_tgp_min; ++ u8 nv_tgp_max; ++}; ++ ++struct power_data { ++ const struct power_limits *ac_data; ++ const struct power_limits *dc_data; ++ bool requires_fan_curve; ++}; ++ ++/* ++ * For each available attribute there must be a min and a max. ++ * _def is not required and will be assumed to be default == max if missing.
++ */ ++static const struct dmi_system_id power_limits[] = { ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA401W"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 75, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 30, ++ .ppt_pl2_sppt_min = 31, ++ .ppt_pl2_sppt_max = 44, ++ .ppt_pl3_fppt_min = 45, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA507N"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 45, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 54, ++ .ppt_pl2_sppt_max = 65, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA507R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80 ++ }, ++ .dc_data = NULL ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA507X"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 85, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 45, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 54, ++ .ppt_pl2_sppt_max = 65, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA507Z"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 105, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 15, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 85, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 45, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 60, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA607P"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 30, ++ .ppt_pl1_spl_def = 100, 
++ .ppt_pl1_spl_max = 135, ++ .ppt_pl2_sppt_min = 30, ++ .ppt_pl2_sppt_def = 115, ++ .ppt_pl2_sppt_max = 135, ++ .ppt_pl3_fppt_min = 30, ++ .ppt_pl3_fppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 115, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_def = 45, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_def = 60, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 25, ++ .ppt_pl3_fppt_max = 80, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA617NS"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 80, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 120 ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 25, ++ .ppt_apu_sppt_max = 35, ++ .ppt_platform_sppt_min = 45, ++ .ppt_platform_sppt_max = 100 ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA617NT"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 80, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 115 ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 45, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 50 ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA617XS"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 80, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 120, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 25, ++ .ppt_apu_sppt_max = 35, ++ .ppt_platform_sppt_min = 45, ++ .ppt_platform_sppt_max = 100, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FX507Z"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 90, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 15, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 45, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 60, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA401Q"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 80, ++ }, ++ .dc_data = NULL ++ }, ++ }, ++ { ++ .matches = { ++ // This model is full AMD. No Nvidia dGPU. 
++ DMI_MATCH(DMI_BOARD_NAME, "GA402R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 80, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 115, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 25, ++ .ppt_apu_sppt_def = 30, ++ .ppt_apu_sppt_max = 45, ++ .ppt_platform_sppt_min = 40, ++ .ppt_platform_sppt_max = 60, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA402X"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 35, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_def = 65, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 35, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA403U"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 65, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 35, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA503R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 35, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 65, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 25, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 54, ++ .ppt_pl2_sppt_max = 60, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65 ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA605W"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 85, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 31, ++ .ppt_pl2_sppt_max = 44, ++ .ppt_pl3_fppt_min = 45, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME,
"GU603Z"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 60, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 135, ++ /* Only allowed in AC mode */ ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 40, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 40, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GU604V"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 65, ++ .ppt_pl1_spl_max = 120, ++ .ppt_pl2_sppt_min = 65, ++ .ppt_pl2_sppt_max = 150, ++ /* Only allowed in AC mode */ ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 40, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 40, ++ .ppt_pl2_sppt_max = 60, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GU605M"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 90, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 38, ++ .ppt_pl2_sppt_max = 53, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GV301Q"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 45, ++ .ppt_pl2_sppt_min = 65, ++ .ppt_pl2_sppt_max = 80, ++ }, ++ .dc_data = NULL ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GV301R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 45, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 54, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 35, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GV601R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 35, ++ .ppt_pl1_spl_max = 90, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 54, ++ .ppt_pl2_sppt_max = 100, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_def = 80, ++ .ppt_pl3_fppt_max = 125, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 28, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 40, ++ .ppt_pl2_sppt_max = 60, ++
.ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_def = 80, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GV601V"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_def = 100, ++ .ppt_pl1_spl_max = 110, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 40, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 40, ++ .ppt_pl2_sppt_max = 60, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GX650P"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 110, ++ .ppt_pl1_spl_max = 130, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 125, ++ .ppt_pl2_sppt_max = 130, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_def = 125, ++ .ppt_pl3_fppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 25, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 35, ++ .ppt_pl2_sppt_max = 65, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_def = 42, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G513I"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ /* Yes this laptop is very limited */ ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 80, ++ }, ++ .dc_data = NULL, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G513QM"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ /* Yes this laptop is very limited */ ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 100, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 190, ++ }, ++ .dc_data = NULL, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G513R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 35, ++ .ppt_pl1_spl_max = 90, ++ .ppt_pl2_sppt_min = 54, ++ .ppt_pl2_sppt_max = 100, ++ .ppt_pl3_fppt_min = 54, ++ .ppt_pl3_fppt_max = 125, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 50, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 50, ++ .ppt_pl3_fppt_min = 28, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G614J"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 140, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 175, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct 
power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 55, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 70, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G634J"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 140, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 175, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 55, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 70, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G733C"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 170, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 175, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 35, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G733P"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 30, ++ .ppt_pl1_spl_def = 100, ++ .ppt_pl1_spl_max = 130, ++ .ppt_pl2_sppt_min = 65, ++ .ppt_pl2_sppt_def = 125, ++ .ppt_pl2_sppt_max = 130, ++ .ppt_pl3_fppt_min = 65, ++ .ppt_pl3_fppt_def = 125, ++ .ppt_pl3_fppt_max = 130, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 65, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 75, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G814J"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 140, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 140, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 55, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 70, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G834J"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 140, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 175, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 55, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 70, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "H7606W"), ++ }, ++ .driver_data = &(struct 
power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 85, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 31, ++ .ppt_pl2_sppt_max = 44, ++ .ppt_pl3_fppt_min = 45, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "RC71"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 7, ++ .ppt_pl1_spl_max = 30, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 43, ++ .ppt_pl3_fppt_min = 15, ++ .ppt_pl3_fppt_max = 53 ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 7, ++ .ppt_pl1_spl_def = 15, ++ .ppt_pl1_spl_max = 25, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_def = 20, ++ .ppt_pl2_sppt_max = 30, ++ .ppt_pl3_fppt_min = 15, ++ .ppt_pl3_fppt_def = 25, ++ .ppt_pl3_fppt_max = 35 ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "RC72"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 7, ++ .ppt_pl1_spl_max = 30, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 43, ++ .ppt_pl3_fppt_min = 15, ++ .ppt_pl3_fppt_max = 53 ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 7, ++ .ppt_pl1_spl_def = 17, ++ .ppt_pl1_spl_max = 25, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_def = 24, ++ .ppt_pl2_sppt_max = 30, ++ .ppt_pl3_fppt_min = 15, ++ .ppt_pl3_fppt_def = 30, ++ .ppt_pl3_fppt_max = 35 ++ } ++ }, ++ }, ++ {} ++}; ++ ++#endif /* _ASUS_ARMOURY_H_ */ +diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c +index f7191fdded14..a6d6efdb50b7 100644 +--- a/drivers/platform/x86/asus-wmi.c ++++ b/drivers/platform/x86/asus-wmi.c +@@ -55,8 +55,6 @@ module_param(fnlock_default, bool, 0444); + #define to_asus_wmi_driver(pdrv) \ + (container_of((pdrv), struct asus_wmi_driver, platform_driver)) + +-#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" +- + #define NOTIFY_BRNUP_MIN 0x11 + #define NOTIFY_BRNUP_MAX 0x1f + #define NOTIFY_BRNDOWN_MIN 0x20 +@@ -105,8 +103,6 @@ module_param(fnlock_default, bool, 0444); + #define USB_INTEL_XUSB2PR 0xD0 + #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 + +-#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" +- + #define WMI_EVENT_MASK 0xFFFF + + #define FAN_CURVE_POINTS 8 +@@ -340,6 +336,13 @@ struct asus_wmi { + /* Global to allow setting externally without requiring driver data */ + static enum asus_ally_mcu_hack use_ally_mcu_hack = ASUS_WMI_ALLY_MCU_HACK_INIT; + ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) ++static void asus_wmi_show_deprecated(void) ++{ ++ pr_notice_once("Accessing attributes through /sys/bus/platform/asus_wmi is deprecated and will be removed in a future release. 
Please switch over to /sys/class/firmware_attributes.\n"); ++} ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ ++ + /* WMI ************************************************************************/ + + static int asus_wmi_evaluate_method3(u32 method_id, +@@ -390,7 +393,7 @@ int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval) + { + return asus_wmi_evaluate_method3(method_id, arg0, arg1, 0, retval); + } +-EXPORT_SYMBOL_GPL(asus_wmi_evaluate_method); ++EXPORT_SYMBOL_NS_GPL(asus_wmi_evaluate_method, "ASUS_WMI"); + + static int asus_wmi_evaluate_method5(u32 method_id, + u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 *retval) +@@ -554,12 +557,46 @@ static int asus_wmi_get_devstate(struct asus_wmi *asus, u32 dev_id, u32 *retval) + return 0; + } + +-int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, +- u32 *retval) ++/** ++ * asus_wmi_get_devstate_dsts() - Get the WMI function state. ++ * @dev_id: The WMI method ID to call. ++ * @retval: A pointer to where to store the value returned from WMI. ++ * @return: 0 on success and retval is filled. ++ * @return: -ENODEV if the method ID is unsupported. ++ * @return: everything else is an error from WMI call. ++ */ ++int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) ++{ ++ int err; ++ ++ err = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, retval); ++ if (err) ++ return err; ++ ++ if (*retval == ASUS_WMI_UNSUPPORTED_METHOD) ++ return -ENODEV; ++ ++ return 0; ++} ++EXPORT_SYMBOL_NS_GPL(asus_wmi_get_devstate_dsts, "ASUS_WMI"); ++ ++/** ++ * asus_wmi_set_devstate() - Set the WMI function state. ++ * @dev_id: The WMI function to call. ++ * @ctrl_param: The argument to be used for this WMI function. ++ * @retval: A pointer to where to store the value returned from WMI. ++ * @return: 0 on success and retval is filled. ++ * @return: everything else is an error from WMI call. ++ * ++ * A asus_wmi_set_devstate() call must be paired with a ++ * asus_wmi_get_devstate_dsts() to check if the WMI function is supported. 
++ */ ++int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) + { + return asus_wmi_evaluate_method(ASUS_WMI_METHODID_DEVS, dev_id, + ctrl_param, retval); + } ++EXPORT_SYMBOL_NS_GPL(asus_wmi_set_devstate, "ASUS_WMI"); + + /* Helper for special devices with magic return codes */ + static int asus_wmi_get_devstate_bits(struct asus_wmi *asus, +@@ -692,6 +729,7 @@ static void asus_wmi_tablet_mode_get_state(struct asus_wmi *asus) + } + + /* Charging mode, 1=Barrel, 2=USB ******************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t charge_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -702,12 +740,16 @@ static ssize_t charge_mode_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", value & 0xff); + } + + static DEVICE_ATTR_RO(charge_mode); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* dGPU ********************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t dgpu_disable_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -718,6 +760,8 @@ static ssize_t dgpu_disable_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -771,8 +815,10 @@ static ssize_t dgpu_disable_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(dgpu_disable); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* eGPU ********************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t egpu_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -783,6 +829,8 @@ static ssize_t egpu_enable_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -839,8 +887,10 @@ static ssize_t egpu_enable_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(egpu_enable); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Is eGPU connected? 
*********************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t egpu_connected_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -851,12 +901,16 @@ static ssize_t egpu_connected_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + + static DEVICE_ATTR_RO(egpu_connected); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* gpu mux switch *************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t gpu_mux_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -867,6 +921,8 @@ static ssize_t gpu_mux_mode_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -925,6 +981,7 @@ static ssize_t gpu_mux_mode_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(gpu_mux_mode); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* TUF Laptop Keyboard RGB Modes **********************************************/ + static ssize_t kbd_rgb_mode_store(struct device *dev, +@@ -1048,6 +1105,7 @@ static const struct attribute_group *kbd_rgb_mode_groups[] = { + }; + + /* Tunable: PPT: Intel=PL1, AMD=SPPT *****************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t ppt_pl2_sppt_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +@@ -1086,6 +1144,8 @@ static ssize_t ppt_pl2_sppt_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_pl2_sppt); + } + static DEVICE_ATTR_RW(ppt_pl2_sppt); +@@ -1128,6 +1188,8 @@ static ssize_t ppt_pl1_spl_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_pl1_spl); + } + static DEVICE_ATTR_RW(ppt_pl1_spl); +@@ -1171,6 +1233,8 @@ static ssize_t ppt_fppt_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_fppt); + } + static DEVICE_ATTR_RW(ppt_fppt); +@@ -1214,6 +1278,8 @@ static ssize_t ppt_apu_sppt_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_apu_sppt); + } + static DEVICE_ATTR_RW(ppt_apu_sppt); +@@ -1257,6 +1323,8 @@ static ssize_t ppt_platform_sppt_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_platform_sppt); + } + static DEVICE_ATTR_RW(ppt_platform_sppt); +@@ -1300,6 +1368,8 @@ static ssize_t nv_dynamic_boost_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->nv_dynamic_boost); + } + static DEVICE_ATTR_RW(nv_dynamic_boost); +@@ -1343,9 +1413,12 @@ static ssize_t nv_temp_target_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->nv_temp_target); + } + static DEVICE_ATTR_RW(nv_temp_target); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Ally MCU 
Powersave ********************************************************/ + +@@ -1386,6 +1459,7 @@ void set_ally_mcu_powersave(bool enabled) + } + EXPORT_SYMBOL_NS_GPL(set_ally_mcu_powersave, "ASUS_WMI"); + ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t mcu_powersave_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -1396,6 +1470,8 @@ static ssize_t mcu_powersave_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -1431,6 +1507,7 @@ static ssize_t mcu_powersave_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(mcu_powersave); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Battery ********************************************************************/ + +@@ -2304,6 +2381,7 @@ static int asus_wmi_rfkill_init(struct asus_wmi *asus) + } + + /* Panel Overdrive ************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t panel_od_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -2314,6 +2392,8 @@ static ssize_t panel_od_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -2350,9 +2430,10 @@ static ssize_t panel_od_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(panel_od); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Bootup sound ***************************************************************/ +- ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t boot_sound_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -2363,6 +2444,8 @@ static ssize_t boot_sound_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -2398,8 +2481,10 @@ static ssize_t boot_sound_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(boot_sound); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Mini-LED mode **************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t mini_led_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -2430,6 +2515,8 @@ static ssize_t mini_led_mode_show(struct device *dev, + } + } + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", value); + } + +@@ -2500,10 +2587,13 @@ static ssize_t available_mini_led_mode_show(struct device *dev, + return sysfs_emit(buf, "0 1 2\n"); + } + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "0\n"); + } + + static DEVICE_ATTR_RO(available_mini_led_mode); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Quirks *********************************************************************/ + +@@ -3791,6 +3881,7 @@ static int throttle_thermal_policy_set_default(struct asus_wmi *asus) + return throttle_thermal_policy_write(asus); + } + ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t throttle_thermal_policy_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -3834,6 +3925,7 @@ static ssize_t throttle_thermal_policy_store(struct device *dev, + * Throttle thermal policy: 0 - default, 1 - overboost, 2 - silent + */ + static DEVICE_ATTR_RW(throttle_thermal_policy); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + 
/* Platform profile ***********************************************************/ + static int asus_wmi_platform_profile_get(struct device *dev, +@@ -3853,7 +3945,7 @@ static int asus_wmi_platform_profile_get(struct device *dev, + *profile = PLATFORM_PROFILE_PERFORMANCE; + break; + case ASUS_THROTTLE_THERMAL_POLICY_SILENT: +- *profile = PLATFORM_PROFILE_QUIET; ++ *profile = PLATFORM_PROFILE_LOW_POWER; + break; + default: + return -EINVAL; +@@ -3877,7 +3969,7 @@ static int asus_wmi_platform_profile_set(struct device *dev, + case PLATFORM_PROFILE_BALANCED: + tp = ASUS_THROTTLE_THERMAL_POLICY_DEFAULT; + break; +- case PLATFORM_PROFILE_QUIET: ++ case PLATFORM_PROFILE_LOW_POWER: + tp = ASUS_THROTTLE_THERMAL_POLICY_SILENT; + break; + default: +@@ -3890,7 +3982,7 @@ static int asus_wmi_platform_profile_set(struct device *dev, + + static int asus_wmi_platform_profile_probe(void *drvdata, unsigned long *choices) + { +- set_bit(PLATFORM_PROFILE_QUIET, choices); ++ set_bit(PLATFORM_PROFILE_LOW_POWER, choices); + set_bit(PLATFORM_PROFILE_BALANCED, choices); + set_bit(PLATFORM_PROFILE_PERFORMANCE, choices); + +@@ -4435,27 +4527,29 @@ static struct attribute *platform_attributes[] = { + &dev_attr_camera.attr, + &dev_attr_cardr.attr, + &dev_attr_touchpad.attr, +- &dev_attr_charge_mode.attr, +- &dev_attr_egpu_enable.attr, +- &dev_attr_egpu_connected.attr, +- &dev_attr_dgpu_disable.attr, +- &dev_attr_gpu_mux_mode.attr, + &dev_attr_lid_resume.attr, + &dev_attr_als_enable.attr, + &dev_attr_fan_boost_mode.attr, +- &dev_attr_throttle_thermal_policy.attr, +- &dev_attr_ppt_pl2_sppt.attr, +- &dev_attr_ppt_pl1_spl.attr, +- &dev_attr_ppt_fppt.attr, +- &dev_attr_ppt_apu_sppt.attr, +- &dev_attr_ppt_platform_sppt.attr, +- &dev_attr_nv_dynamic_boost.attr, +- &dev_attr_nv_temp_target.attr, +- &dev_attr_mcu_powersave.attr, +- &dev_attr_boot_sound.attr, +- &dev_attr_panel_od.attr, +- &dev_attr_mini_led_mode.attr, +- &dev_attr_available_mini_led_mode.attr, ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) ++ &dev_attr_charge_mode.attr, ++ &dev_attr_egpu_enable.attr, ++ &dev_attr_egpu_connected.attr, ++ &dev_attr_dgpu_disable.attr, ++ &dev_attr_gpu_mux_mode.attr, ++ &dev_attr_ppt_pl2_sppt.attr, ++ &dev_attr_ppt_pl1_spl.attr, ++ &dev_attr_ppt_fppt.attr, ++ &dev_attr_ppt_apu_sppt.attr, ++ &dev_attr_ppt_platform_sppt.attr, ++ &dev_attr_nv_dynamic_boost.attr, ++ &dev_attr_nv_temp_target.attr, ++ &dev_attr_mcu_powersave.attr, ++ &dev_attr_boot_sound.attr, ++ &dev_attr_panel_od.attr, ++ &dev_attr_mini_led_mode.attr, ++ &dev_attr_available_mini_led_mode.attr, ++ &dev_attr_throttle_thermal_policy.attr, ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + NULL + }; + +@@ -4477,7 +4571,11 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj, + devid = ASUS_WMI_DEVID_LID_RESUME; + else if (attr == &dev_attr_als_enable.attr) + devid = ASUS_WMI_DEVID_ALS_ENABLE; +- else if (attr == &dev_attr_charge_mode.attr) ++ else if (attr == &dev_attr_fan_boost_mode.attr) ++ ok = asus->fan_boost_mode_available; ++ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) ++ if (attr == &dev_attr_charge_mode.attr) + devid = ASUS_WMI_DEVID_CHARGE_MODE; + else if (attr == &dev_attr_egpu_enable.attr) + ok = asus->egpu_enable_available; +@@ -4515,6 +4613,7 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj, + ok = asus->mini_led_dev_id != 0; + else if (attr == &dev_attr_available_mini_led_mode.attr) + ok = asus->mini_led_dev_id != 0; ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + if (devid != -1) { + ok = 
!(asus_wmi_get_devstate_simple(asus, devid) < 0); +@@ -4770,6 +4869,7 @@ static int asus_wmi_add(struct platform_device *pdev) + } + + /* ensure defaults for tunables */ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + asus->ppt_pl2_sppt = 5; + asus->ppt_pl1_spl = 5; + asus->ppt_apu_sppt = 5; +@@ -4792,17 +4892,18 @@ static int asus_wmi_add(struct platform_device *pdev) + asus->gpu_mux_dev = ASUS_WMI_DEVID_GPU_MUX; + else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_GPU_MUX_VIVO)) + asus->gpu_mux_dev = ASUS_WMI_DEVID_GPU_MUX_VIVO; +- +- if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE)) +- asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE; +- else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE2)) +- asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE2; ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY)) + asus->throttle_thermal_policy_dev = ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY; + else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO)) + asus->throttle_thermal_policy_dev = ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO; + ++ if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE)) ++ asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE; ++ else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE2)) ++ asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE2; ++ + err = fan_boost_mode_check_present(asus); + if (err) + goto fail_fan_boost_mode; +diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h +index 8a515179113d..86279da06ea2 100644 +--- a/include/linux/platform_data/x86/asus-wmi.h ++++ b/include/linux/platform_data/x86/asus-wmi.h +@@ -6,6 +6,9 @@ + #include + #include + ++#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" ++#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" ++ + /* WMI Methods */ + #define ASUS_WMI_METHODID_SPEC 0x43455053 /* BIOS SPECification */ + #define ASUS_WMI_METHODID_SFBD 0x44424653 /* Set First Boot Device */ +@@ -73,12 +76,14 @@ + #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO 0x00110019 + + /* Misc */ ++#define ASUS_WMI_DEVID_PANEL_HD 0x0005001C + #define ASUS_WMI_DEVID_PANEL_OD 0x00050019 + #define ASUS_WMI_DEVID_CAMERA 0x00060013 + #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 + #define ASUS_WMI_DEVID_LID_FLIP_ROG 0x00060077 + #define ASUS_WMI_DEVID_MINI_LED_MODE 0x0005001E + #define ASUS_WMI_DEVID_MINI_LED_MODE2 0x0005002E ++#define ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS 0x0005002A + + /* Storage */ + #define ASUS_WMI_DEVID_CARDREADER 0x00080013 +@@ -133,6 +138,16 @@ + /* dgpu on/off */ + #define ASUS_WMI_DEVID_DGPU 0x00090020 + ++/* Intel E-core and P-core configuration in a format 0x0[E]0[P] */ ++#define ASUS_WMI_DEVID_CORES 0x001200D2 ++ /* Maximum Intel E-core and P-core availability */ ++#define ASUS_WMI_DEVID_CORES_MAX 0x001200D3 ++ ++#define ASUS_WMI_DEVID_APU_MEM 0x000600C1 ++ ++#define ASUS_WMI_DEVID_DGPU_BASE_TGP 0x00120099 ++#define ASUS_WMI_DEVID_DGPU_SET_TGP 0x00120098 ++ + /* gpu mux switch, 0 = dGPU, 1 = Optimus */ + #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 + #define ASUS_WMI_DEVID_GPU_MUX_VIVO 0x00090026 +@@ -166,6 +181,7 @@ enum asus_ally_mcu_hack { + #if IS_REACHABLE(CONFIG_ASUS_WMI) + void set_ally_mcu_hack(enum asus_ally_mcu_hack status); + void set_ally_mcu_powersave(bool enabled); ++int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval); + int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval); + int 
asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval); + #else +@@ -179,6 +195,10 @@ static inline int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) + { + return -ENODEV; + } ++static inline int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) ++{ ++ return -ENODEV; ++} + static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, + u32 *retval) + { +@@ -187,6 +207,7 @@ static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, + #endif + + /* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ ++#if IS_REACHABLE(CONFIG_ASUS_WMI) || IS_REACHABLE(CONFIG_HID_ASUS) + static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + { + .matches = { +@@ -225,5 +246,6 @@ static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + }, + { }, + }; ++#endif + + #endif /* __PLATFORM_DATA_X86_ASUS_WMI_H */ +-- +2.51.0 + diff --git a/sys-kernel/git-sources/0002-bbr3.patch b/sys-kernel/git-sources/0002-bbr3.patch new file mode 100644 index 0000000..dcc5932 --- /dev/null +++ b/sys-kernel/git-sources/0002-bbr3.patch @@ -0,0 +1,3404 @@ +From 3205f6b619a4a9a62d914442d0925738f05854ac Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 1 Sep 2025 09:38:54 +0800 +Subject: [PATCH 2/4] bbr3 + +Signed-off-by: Eric Naim +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 73 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2232 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 4 +- + 16 files changed, 1941 insertions(+), 555 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 57e478bfaef2..0ea92792629c 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -247,7 +247,8 @@ struct tcp_sock { + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); + #endif + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ +@@ -304,7 +305,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? 
*/ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index 1735db332aab..2c4a94af7093 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -132,8 +132,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 526a26e7a150..564084c537c7 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -378,11 +378,14 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + #define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + ++ + static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) + { + return tp->ecn_flags & TCP_ECN_MODE_ANY; +@@ -840,6 +843,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -945,6 +957,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1043,9 +1060,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1158,6 +1180,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1180,7 +1203,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED BIT(0) + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN BIT(1) +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
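 * (A congestion control module opts in by setting this bit in the .flags of
 * its struct tcp_congestion_ops; the CE/non-CE transitions are then delivered
 * as CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE via the cwnd_event() hook, which
 * is how bbr_cwnd_event() in tcp_bbr.c consumes them later in this patch.)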
*/ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(2) ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1200,10 +1227,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1214,7 +1244,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1238,8 +1270,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1305,6 +1340,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1324,6 +1367,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1336,6 +1380,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2483,7 +2542,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index bdac8c42fa82..362644a272ba 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -185,6 +185,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ + #define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ ++#define TCPI_OPT_ECN_LOW 256 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 12850a277251..3b8b96692fb4 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
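For orientation only, and not part of the patch itself: once this option is enabled, a program can request the congestion control per socket through the long-standing TCP_CONGESTION socket option, while the system-wide default is picked by the net.ipv4.tcp_congestion_control sysctl. A minimal userspace sketch, assuming glibc's netinet/tcp.h exposes TCP_CONGESTION:

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	char buf[16] = "";
	socklen_t len = sizeof(buf);

	if (fd < 0)
		return 1;
	/* Ask for BBR on this socket only; this fails if the chosen
	 * congestion control is neither built in nor loaded.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bbr", strlen("bbr")))
		perror("setsockopt(TCP_CONGESTION)");
	/* Read back what the socket is actually using. */
	if (!getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, buf, &len))
		printf("congestion control: %s\n", buf);
	return 0;
}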
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 71a956fbfc55..f9866bd97ac4 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3439,6 +3439,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4191,6 +4192,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..066da5e5747c 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,122 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return (tcp_ecn_mode_any(tp)) && (tp->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +383,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +410,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
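 * (Illustrative arithmetic, not taken from the patch: assuming the usual
 * BBR_SCALE of 8 and BW_SCALE of 24, a model rate of 1 pkt/usec is stored as
 * 1 << 24; with mss = 1500, gain = BBR_UNIT and margin = 0 the steps below
 * give 1500 * 10^6 bytes/sec, i.e. ~12 Gbit/sec, consistent with the ~715 bps
 * granularity of the scaled unit: 715 bps * 2^24 ~= 12 Gbit/sec.)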
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +434,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +457,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +474,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
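 * (For the min_rtt-based part of the budget above, the allowance is roughly
 * GSO_LEGACY_MAX_SIZE >> (min_rtt_us >> 9): about 64 KB below ~0.5 ms, 32 KB
 * around 1 ms, and under one MSS once min_rtt exceeds ~3 ms, as noted at
 * bbr_tso_rtt_shift. Illustrative numbers only.)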
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +535,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +548,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +580,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +600,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +671,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +682,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +711,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +740,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +796,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +804,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +850,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +859,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +887,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +924,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +947,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +972,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2361,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2398,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 71b76e98371a..d7bdfbae1a1e 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -381,7 +381,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -392,7 +392,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1134,7 +1134,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1498,6 +1503,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3716,7 +3732,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985 + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3733,6 +3750,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3743,6 +3761,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3862,6 +3885,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -3927,7 +3951,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -3951,6 +3975,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -3971,7 +3996,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5677,13 +5702,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 2994c9222c9c..a53af9d32e09 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -475,6 +475,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index caf11920a878..61e45fbd3e5f 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1609,7 +1612,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1684,6 +1687,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2040,13 +2067,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2771,6 +2797,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2983,6 +3010,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index a207877270fb..0e67c7281410 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -565,7 +565,7 @@ void tcp_retransmit_timer(struct sock *sk) + struct inet_sock *inet = inet_sk(sk); + u32 rtx_delta; + +- rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: ++ rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: + tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); + if (tp->tcp_usec_ts) + rtx_delta /= USEC_PER_MSEC; +@@ -702,6 +702,8 @@ void tcp_write_timer_handler(struct sock *sk) + icsk_timeout(icsk)); + return; + } ++ ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.51.0 + diff --git a/sys-kernel/git-sources/0003-cachy.patch b/sys-kernel/git-sources/0003-cachy.patch new file mode 100644 index 0000000..0a55a31 --- /dev/null +++ b/sys-kernel/git-sources/0003-cachy.patch @@ -0,0 +1,9540 @@ +From 657b2f3ce3beb8717754f7b0c4ab900f8f3fe0a6 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 1 Sep 2025 09:38:54 +0800 +Subject: [PATCH 3/4] cachy + +Signed-off-by: Eric Naim +--- + .gitignore | 2 + + .../admin-guide/kernel-parameters.txt | 12 + + Documentation/admin-guide/sysctl/vm.rst | 72 + + Makefile | 33 +- + arch/Kconfig | 19 + + arch/x86/Kconfig.cpu | 46 + + arch/x86/Makefile | 16 +- + arch/x86/include/asm/pci.h | 6 + + arch/x86/pci/common.c | 7 +- + block/Kconfig.iosched | 14 + + block/Makefile | 8 + + block/adios.c | 1881 ++++++++++ + block/elevator.c | 26 +- + drivers/Makefile | 13 +- + drivers/ata/ahci.c | 23 +- + drivers/cpufreq/Kconfig.x86 | 2 - + drivers/cpufreq/intel_pstate.c | 2 + + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 + + drivers/gpu/drm/amd/display/Kconfig | 6 + + 
.../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- + .../amd/display/amdgpu_dm/amdgpu_dm_color.c | 2 +- + .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 6 +- + .../amd/display/amdgpu_dm/amdgpu_dm_plane.c | 6 +- + drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 + + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 +- + drivers/input/evdev.c | 19 +- + drivers/md/dm-crypt.c | 5 + + drivers/media/v4l2-core/Kconfig | 5 + + drivers/media/v4l2-core/Makefile | 2 + + drivers/media/v4l2-core/v4l2loopback.c | 3316 +++++++++++++++++ + drivers/media/v4l2-core/v4l2loopback.h | 108 + + .../media/v4l2-core/v4l2loopback_formats.h | 445 +++ + drivers/pci/controller/Makefile | 6 + + drivers/pci/controller/intel-nvme-remap.c | 462 +++ + drivers/pci/quirks.c | 101 + + drivers/scsi/Kconfig | 2 + + drivers/scsi/Makefile | 1 + + drivers/scsi/vhba/Kconfig | 9 + + drivers/scsi/vhba/Makefile | 4 + + drivers/scsi/vhba/vhba.c | 1132 ++++++ + include/linux/mm.h | 8 + + include/linux/pagemap.h | 2 +- + include/linux/user_namespace.h | 4 + + init/Kconfig | 26 + + kernel/Kconfig.hz | 24 + + kernel/Kconfig.preempt | 2 +- + kernel/fork.c | 14 + + kernel/locking/rwsem.c | 4 +- + kernel/sched/fair.c | 13 + + kernel/sched/sched.h | 2 +- + kernel/sysctl.c | 13 + + kernel/user_namespace.c | 7 + + mm/Kconfig | 65 +- + mm/compaction.c | 4 + + mm/huge_memory.c | 4 + + mm/mm_init.c | 1 + + mm/page-writeback.c | 8 + + mm/page_alloc.c | 4 + + mm/swap.c | 5 + + mm/util.c | 34 + + mm/vmpressure.c | 4 + + mm/vmscan.c | 157 +- + scripts/Makefile.thinlto | 38 + + scripts/Makefile.vmlinux_a | 83 + + scripts/mod/modpost.c | 15 +- + 66 files changed, 8314 insertions(+), 76 deletions(-) + create mode 100644 block/adios.c + create mode 100644 drivers/media/v4l2-core/v4l2loopback.c + create mode 100644 drivers/media/v4l2-core/v4l2loopback.h + create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h + create mode 100644 drivers/pci/controller/intel-nvme-remap.c + create mode 100644 drivers/scsi/vhba/Kconfig + create mode 100644 drivers/scsi/vhba/Makefile + create mode 100644 drivers/scsi/vhba/vhba.c + create mode 100644 scripts/Makefile.thinlto + create mode 100644 scripts/Makefile.vmlinux_a + +diff --git a/.gitignore b/.gitignore +index 929054df5212..e4b492cc3993 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -55,6 +55,7 @@ + *.zst + Module.symvers + dtbs-list ++builtin.order + modules.order + + # +@@ -66,6 +67,7 @@ modules.order + /vmlinux.32 + /vmlinux.map + /vmlinux.symvers ++/vmlinux.thinlto-index + /vmlinux.unstripped + /vmlinux-gdb.py + /vmlinuz +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 747a55abf494..71751ccf0755 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2384,6 +2384,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + active + Use intel_pstate driver to bypass the scaling + governors layer of cpufreq and provides it own +@@ -4799,6 +4802,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. 
++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multfunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specfic device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. +diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst +index 4d71211fdad8..57af938f1969 100644 +--- a/Documentation/admin-guide/sysctl/vm.rst ++++ b/Documentation/admin-guide/sysctl/vm.rst +@@ -25,6 +25,9 @@ files can be found in mm/swap.c. + Currently, these files are in /proc/sys/vm: + + - admin_reserve_kbytes ++- anon_min_ratio ++- clean_low_ratio ++- clean_min_ratio + - compact_memory + - compaction_proactiveness + - compact_unevictable_allowed +@@ -110,6 +113,67 @@ On x86_64 this is about 128MB. + Changing this takes effect whenever an application requests memory. + + ++anon_min_ratio ++============== ++ ++This knob provides *hard* protection of anonymous pages. The anonymous pages ++on the current node won't be reclaimed under any conditions when their amount ++is below vm.anon_min_ratio. ++ ++This knob may be used to prevent excessive swap thrashing when anonymous ++memory is low (for example, when memory is going to be overfilled by ++compressed data of zram module). ++ ++Setting this value too high (close to 100) can result in inability to ++swap and can lead to early OOM under memory pressure. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 1. ++ ++ ++clean_low_ratio ++================ ++ ++This knob provides *best-effort* protection of clean file pages. The file pages ++on the current node won't be reclaimed under memory pressure when the amount of ++clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM. ++ ++Protection of clean file pages using this knob may be used when swapping is ++still possible to ++ - prevent disk I/O thrashing under memory pressure; ++ - improve performance in disk cache-bound tasks under memory pressure. ++ ++Setting it to a high value may result in a early eviction of anonymous pages ++into the swap space by attempting to hold the protected amount of clean file ++pages in memory. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 15. ++ ++ ++clean_min_ratio ++================ ++ ++This knob provides *hard* protection of clean file pages. The file pages on the ++current node won't be reclaimed under memory pressure when the amount of clean ++file pages is below vm.clean_min_ratio. ++ ++Hard protection of clean file pages using this knob may be used to ++ - prevent disk I/O thrashing under memory pressure even with no free swap space; ++ - improve performance in disk cache-bound tasks under memory pressure; ++ - avoid high latency and prevent livelock in near-OOM conditions. ++ ++Setting it to a high value may result in a early out-of-memory condition due to ++the inability to reclaim the protected amount of clean file pages when other ++types of pages cannot be reclaimed. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 4. ++ ++ + compact_memory + ============== + +@@ -980,6 +1044,14 @@ be 133 (x + 2x = 200, 2x = 133.33). 
+ At 0, the kernel will not initiate swap until the amount of free and + file-backed pages is less than the high watermark in a zone. + ++This knob has no effect if the amount of clean file pages on the current ++node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, ++only anonymous pages can be reclaimed. ++ ++If the number of anonymous pages on the current node is below ++vm.anon_min_ratio, then only file pages can be reclaimed with ++any vm.swappiness value. ++ + + unprivileged_userfaultfd + ======================== +diff --git a/Makefile b/Makefile +index b9c661913250..8fc00895b0ba 100644 +--- a/Makefile ++++ b/Makefile +@@ -869,11 +869,19 @@ KBUILD_CFLAGS += -fno-delete-null-pointer-checks + ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + KBUILD_RUSTFLAGS += -Copt-level=2 ++else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++KBUILD_CFLAGS += -O3 ++KBUILD_RUSTFLAGS += -Copt-level=3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os + KBUILD_RUSTFLAGS += -Copt-level=s + endif + ++# Perform swing modulo scheduling immediately before the first scheduling pass. ++# This pass looks at innermost loops and reorders their instructions by ++# overlapping different iterations. ++KBUILD_CFLAGS += $(call cc-option,-fmodulo-sched -fmodulo-sched-allow-regmoves -fivopts -fmodulo-sched) ++ + # Always set `debug-assertions` and `overflow-checks` because their default + # depends on `opt-level` and `debug-assertions`, respectively. + KBUILD_RUSTFLAGS += -Cdebug-assertions=$(if $(CONFIG_RUST_DEBUG_ASSERTIONS),y,n) +@@ -1003,10 +1011,10 @@ export CC_FLAGS_SCS + endif + + ifdef CONFIG_LTO_CLANG +-ifdef CONFIG_LTO_CLANG_THIN +-CC_FLAGS_LTO := -flto=thin -fsplit-lto-unit +-else ++ifdef CONFIG_LTO_CLANG_FULL + CC_FLAGS_LTO := -flto ++else ++CC_FLAGS_LTO := -flto=thin -fsplit-lto-unit + endif + CC_FLAGS_LTO += -fvisibility=hidden + +@@ -1200,7 +1208,7 @@ export ARCH_DRIVERS := $(drivers-y) $(drivers-m) + KBUILD_VMLINUX_OBJS := built-in.a $(patsubst %/, %/lib.a, $(filter %/, $(libs-y))) + KBUILD_VMLINUX_LIBS := $(filter-out %/, $(libs-y)) + +-export KBUILD_VMLINUX_LIBS ++export KBUILD_VMLINUX_OBJS KBUILD_VMLINUX_LIBS + export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds + + ifdef CONFIG_TRIM_UNUSED_KSYMS +@@ -1209,16 +1217,12 @@ ifdef CONFIG_TRIM_UNUSED_KSYMS + KBUILD_MODULES := y + endif + +-# '$(AR) mPi' needs 'T' to workaround the bug of llvm-ar <= 14 +-quiet_cmd_ar_vmlinux.a = AR $@ +- cmd_ar_vmlinux.a = \ +- rm -f $@; \ +- $(AR) cDPrST $@ $(KBUILD_VMLINUX_OBJS); \ +- $(AR) mPiT $$($(AR) t $@ | sed -n 1p) $@ $$($(AR) t $@ | grep -F -f $(srctree)/scripts/head-object-list.txt) ++PHONY += vmlinux_a ++vmlinux_a: $(KBUILD_VMLINUX_OBJS) scripts/head-object-list.txt FORCE ++ $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.vmlinux_a + +-targets += vmlinux.a +-vmlinux.a: $(KBUILD_VMLINUX_OBJS) scripts/head-object-list.txt FORCE +- $(call if_changed,ar_vmlinux.a) ++vmlinux.a: vmlinux_a ++ @: + + PHONY += vmlinux_o + vmlinux_o: vmlinux.a $(KBUILD_VMLINUX_LIBS) +@@ -1578,6 +1582,7 @@ endif # CONFIG_MODULES + CLEAN_FILES += vmlinux.symvers modules-only.symvers \ + modules.builtin modules.builtin.modinfo modules.nsdeps \ + modules.builtin.ranges vmlinux.o.map vmlinux.unstripped \ ++ vmlinux.thinlto-index builtin.order \ + compile_commands.json rust/test \ + rust-project.json .vmlinux.objs .vmlinux.export.c \ + .builtin-dtbs-list .builtin-dtb.S +@@ -2019,7 +2024,7 @@ clean: $(clean-dirs) + $(call cmd,rmfiles) + @find . 
$(RCS_FIND_IGNORE) \ + \( -name '*.[aios]' -o -name '*.rsi' -o -name '*.ko' -o -name '.*.cmd' \ +- -o -name '*.ko.*' \ ++ -o -name '*.ko.*' -o -name '*.o.thinlto.bc' \ + -o -name '*.dtb' -o -name '*.dtbo' \ + -o -name '*.dtb.S' -o -name '*.dtbo.S' \ + -o -name '*.dt.yaml' -o -name 'dtbs-list' \ +diff --git a/arch/Kconfig b/arch/Kconfig +index d1b4ffd6e085..9ea0ac45923e 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -826,6 +826,25 @@ config LTO_CLANG_THIN + https://clang.llvm.org/docs/ThinLTO.html + + If unsure, say Y. ++ ++config LTO_CLANG_THIN_DIST ++ bool "Clang ThinLTO in distributed mode (EXPERIMENTAL)" ++ depends on HAS_LTO_CLANG && ARCH_SUPPORTS_LTO_CLANG_THIN ++ select LTO_CLANG ++ help ++ This option enables Clang's ThinLTO in distributed build mode. ++ In this mode, the linker performs the thin-link, generating ++ ThinLTO index files. Subsequently, the build system explicitly ++ invokes ThinLTO backend compilation using these index files ++ and pre-linked IR objects. The resulting native object files ++ are with the .thinlto-native.o suffix. ++ ++ This build mode offers improved visibility into the ThinLTO ++ process through explicit subcommand exposure. It also makes ++ final native object files directly available, benefiting ++ tools like objtool and kpatch. Additionally, it provides ++ crucial granular control over back-end options, enabling ++ module-specific compiler options, and simplifies debugging. + endchoice + + config ARCH_SUPPORTS_AUTOFDO_CLANG +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index f928cf6e3252..d4ce964d9713 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -255,6 +255,11 @@ config CC_HAS_MARCH_NATIVE + # usage warnings that only appear wth '-march=native'. + depends on CC_IS_GCC || CLANG_VERSION >= 190100 + ++ ++choice ++ prompt "x86_64 Compiler Build Optimization" ++ default GENERIC_CPU ++ + config X86_NATIVE_CPU + bool "Build and optimize for local/native CPU" + depends on X86_64 +@@ -269,6 +274,47 @@ config X86_NATIVE_CPU + + If unsure, say N. + ++config GENERIC_CPU ++ bool "Generic-x86-64" ++ depends on X86_64 ++ help ++ Generic x86-64 CPU. ++ Runs equally well on all x86-64 CPUs. ++ ++config MZEN4 ++ bool "AMD Ryzen 4" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000) ++ help ++ Select this for AMD Family 19h Zen 4 processors. ++ ++ Enables -march=znver4 ++ ++endchoice ++ ++config X86_64_VERSION ++ int "x86-64 compiler ISA level" ++ range 1 4 ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 && GENERIC_CPU ++ help ++ Specify a specific x86-64 compiler ISA level. ++ ++ There are three x86-64 ISA levels that work on top of ++ the x86-64 baseline, namely: x86-64-v2 and x86-64-v3. ++ ++ x86-64-v2 brings support for vector instructions up to Streaming SIMD ++ Extensions 4.2 (SSE4.2) and Supplemental Streaming SIMD Extensions 3 ++ (SSSE3), the POPCNT instruction, and CMPXCHG16B. ++ ++ x86-64-v3 adds vector instructions up to AVX2, MOVBE, and additional ++ bit-manipulation instructions. 
++ ++ x86-64-v4 is not included since the kernel does not use AVX512 instructions ++ ++ You can find the best version for your CPU by running one of the following: ++ /lib/ld-linux-x86-64.so.2 --help | grep supported ++ /lib64/ld-linux-x86-64.so.2 --help | grep supported ++ + config X86_GENERIC + bool "Generic x86 support" + depends on X86_32 +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 1913d342969b..82358ed864bb 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -176,10 +176,22 @@ else + ifdef CONFIG_X86_NATIVE_CPU + KBUILD_CFLAGS += -march=native + KBUILD_RUSTFLAGS += -Ctarget-cpu=native +-else ++endif ++ ++ifdef CONFIG_MZEN4 ++ KBUILD_CFLAGS += -march=znver4 ++ KBUILD_RUSTFLAGS += -Ctarget-cpu=znver4 ++endif ++ ++ifdef CONFIG_GENERIC_CPU ++ifeq ($(CONFIG_X86_64_VERSION),1) + KBUILD_CFLAGS += -march=x86-64 -mtune=generic + KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic +-endif ++else ++ KBUILD_CFLAGS +=-march=x86-64-v$(CONFIG_X86_64_VERSION) ++ KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION) ++endif # CONFIG_X86_64_VERSION ++endif # CONFIG_GENERIC_CPU + + KBUILD_CFLAGS += -mno-red-zone + KBUILD_CFLAGS += -mcmodel=kernel +diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h +index b3ab80a03365..5e883b397ff3 100644 +--- a/arch/x86/include/asm/pci.h ++++ b/arch/x86/include/asm/pci.h +@@ -26,6 +26,7 @@ struct pci_sysdata { + #if IS_ENABLED(CONFIG_VMD) + struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ + #endif ++ struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */ + }; + + extern int pci_routeirq; +@@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus *bus) + #define is_vmd(bus) false + #endif /* CONFIG_VMD */ + ++static inline bool is_nvme_remap(struct pci_bus *bus) ++{ ++ return to_pci_sysdata(bus)->nvme_remap_dev != NULL; ++} ++ + /* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ +diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c +index ddb798603201..7c20387d8202 100644 +--- a/arch/x86/pci/common.c ++++ b/arch/x86/pci/common.c +@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) + return 0; + } + +-#if IS_ENABLED(CONFIG_VMD) + struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) + { ++#if IS_ENABLED(CONFIG_VMD) + if (is_vmd(dev->bus)) + return to_pci_sysdata(dev->bus)->vmd_dev; ++#endif ++ ++ if (is_nvme_remap(dev->bus)) ++ return to_pci_sysdata(dev->bus)->nvme_remap_dev; + + return dev; + } +-#endif +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 27f11320b8d1..e98585dd83e0 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -16,6 +16,20 @@ config MQ_IOSCHED_KYBER + synchronous writes, it will self-tune queue depths to achieve that + goal. + ++config MQ_IOSCHED_ADIOS ++ tristate "Adaptive Deadline I/O scheduler" ++ default m ++ help ++ The Adaptive Deadline I/O Scheduler (ADIOS) is a multi-queue I/O ++ scheduler with learning-based adaptive latency control. ++ ++config MQ_IOSCHED_DEFAULT_ADIOS ++ bool "Enable ADIOS I/O scheduler as default MQ I/O scheduler" ++ depends on MQ_IOSCHED_ADIOS=y ++ default n ++ help ++ Enable the ADIOS I/O scheduler as the default scheduler for MQ I/O. 
++ + config IOSCHED_BFQ + tristate "BFQ I/O scheduler" + select BLK_ICQ +diff --git a/block/Makefile b/block/Makefile +index c65f4da93702..105b12fd86b8 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -22,6 +22,7 @@ obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o + obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o + obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o + obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o ++obj-$(CONFIG_MQ_IOSCHED_ADIOS) += adios.o + bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o + obj-$(CONFIG_IOSCHED_BFQ) += bfq.o + +@@ -36,3 +37,10 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ + blk-crypto-sysfs.o + obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o + obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o ++ ++all: ++ make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules ++ ++clean: ++ make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean ++ +diff --git a/block/adios.c b/block/adios.c +new file mode 100644 +index 000000000000..bcc90564b9ce +--- /dev/null ++++ b/block/adios.c +@@ -0,0 +1,1881 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Adaptive Deadline I/O Scheduler (ADIOS) ++ * Copyright (C) 2025 Masahito Suzuki ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "elevator.h" ++#include "blk.h" ++#include "blk-mq.h" ++#include "blk-mq-sched.h" ++ ++#define ADIOS_VERSION "3.0.1" ++ ++/* Request Types: ++ * ++ * Tier 0 (Highest Priority): Emergency & System Integrity Requests ++ * ----------------------------------------------------------------- ++ * - Target: Requests with the BLK_MQ_INSERT_AT_HEAD flag. ++ * - Purpose: For critical, non-negotiable operations such as device error ++ * recovery or flush sequences that must bypass all other scheduling logic. ++ * - Implementation: Placed in a dedicated, high-priority FIFO queue ++ * (`prio_queue[0]`) for immediate dispatch. ++ * ++ * Tier 1 (High Priority): Data Persistence & Ordering Guarantees ++ * --------------------------------------------------------------- ++ * - Target: Requests with integrity-sensitive flags like REQ_FUA or ++ * REQ_PREFLUSH, typically originating from O_DIRECT I/O. ++ * - Purpose: To ensure strict ordering and data persistence guarantees, ++ * preventing data corruption in applications like databases. ++ * - Implementation: Handled in a separate, secondary FIFO queue ++ * (`prio_queue[1]`) to ensure they are processed in submission order and ++ * before any lower-priority requests. ++ * ++ * Tier 2 (Medium Priority): Application Responsiveness ++ * ---------------------------------------------------- ++ * - Target: Normal synchronous requests (e.g., from standard file reads). ++ * - Purpose: To ensure correct application behavior for operations that ++ * depend on sequential I/O completion (e.g., file system mounts) and to ++ * provide low latency for interactive applications. ++ * - Implementation: The deadline for these requests is set to their start ++ * time (`rq->start_time_ns`). This effectively enforces FIFO-like behavior ++ * within the deadline-sorted red-black tree, preventing out-of-order ++ * execution of dependent synchronous operations. ++ * ++ * Tier 3 (Normal Priority): Background Throughput ++ * ----------------------------------------------- ++ * - Target: Asynchronous requests. 
++ * - Purpose: To maximize disk throughput for background tasks where latency ++ * is not critical. ++ * - Implementation: These are the only requests where ADIOS's adaptive ++ * latency prediction model is used. A dynamic deadline is calculated based ++ * on the predicted I/O latency, allowing for aggressive reordering to ++ * optimize I/O efficiency. ++ * ++ * Dispatch Logic: ++ * The scheduler always dispatches requests in strict priority order: ++ * 1. prio_queue[0] (Tier 0) ++ * 2. prio_queue[1] (Tier 1) ++ * 3. The deadline-sorted batch queue (which naturally prioritizes Tier 2 ++ * over Tier 3 due to their calculated deadlines). ++ */ ++ ++// Global variable to control the latency ++static u64 default_global_latency_window = 16000000ULL; ++static u64 default_global_latency_window_rotational = 22000000ULL; ++// Ratio below which batch queues should be refilled ++static u8 default_bq_refill_below_ratio = 20; ++// Maximum latency sample to input ++static u64 default_lat_model_latency_limit = 500000000ULL; ++// Batch ordering strategy ++static u64 default_batch_order = 0; ++// Flags to control compliance with block layer constraints ++static u64 default_compliance_flags = 0x7; ++ ++/* Compliance Flags: ++ * 0x1: REQ_FUA requests will be handled as Tier-1, strictly prioritized ++ * 0x2: REQ_PREFLUSH requests will be handled as Tier-1, strictly prioritized ++ * 0x4: Async requests will not be reordered based on the predicted latency ++ */ ++enum adios_compliance_flags { ++ ADIOS_CF_PRIO_FUA = 1U << 0, ++ ADIOS_CF_PRIO_PF = 1U << 1, ++ ADIOS_CF_FIXORDER = 1U << 2, ++}; ++ ++// Dynamic thresholds for shrinkage ++static u32 default_lm_shrink_at_kreqs = 5000; ++static u32 default_lm_shrink_at_gbytes = 50; ++static u32 default_lm_shrink_resist = 2; ++ ++enum adios_optype { ++ ADIOS_READ = 0, ++ ADIOS_WRITE = 1, ++ ADIOS_DISCARD = 2, ++ ADIOS_OTHER = 3, ++ ADIOS_OPTYPES = 4, ++}; ++ ++// Latency targets for each operation type ++static u64 default_latency_target[ADIOS_OPTYPES] = { ++ [ADIOS_READ] = 2ULL * NSEC_PER_MSEC, ++ [ADIOS_WRITE] = 2000ULL * NSEC_PER_MSEC, ++ [ADIOS_DISCARD] = 8000ULL * NSEC_PER_MSEC, ++ [ADIOS_OTHER] = 0ULL * NSEC_PER_MSEC, ++}; ++ ++// Maximum batch size limits for each operation type ++static u32 default_batch_limit[ADIOS_OPTYPES] = { ++ [ADIOS_READ] = 36, ++ [ADIOS_WRITE] = 72, ++ [ADIOS_DISCARD] = 1, ++ [ADIOS_OTHER] = 1, ++}; ++ ++enum adios_batch_order { ++ ADIOS_BO_OPTYPE = 0, ++ ADIOS_BO_ELEVATOR = 1, ++}; ++ ++// Thresholds for latency model control ++#define LM_BLOCK_SIZE_THRESHOLD 4096 ++#define LM_SAMPLES_THRESHOLD 1024 ++#define LM_INTERVAL_THRESHOLD 1500 ++#define LM_OUTLIER_PERCENTILE 99 ++#define LM_LAT_BUCKET_COUNT 64 ++ ++#define ADIOS_PQ_LEVELS 2 ++#define ADIOS_DL_TYPES 2 ++#define ADIOS_BQ_PAGES 2 ++ ++static u32 default_dl_prio[ADIOS_DL_TYPES] = {8, 0}; ++ ++// Bit flags for the atomic state variable, indicating which queues have requests. 
++enum adios_state_flags { ++ ADIOS_STATE_PQ_0 = 1U << 0, ++ ADIOS_STATE_PQ_1 = 1U << 1, ++ ADIOS_STATE_DL_0 = 1U << 2, ++ ADIOS_STATE_DL_1 = 1U << 3, ++ ADIOS_STATE_BQ_PAGE_0 = 1U << 4, ++ ADIOS_STATE_BQ_PAGE_1 = 1U << 5, ++}; ++#define ADIOS_STATE_PQ 0 ++#define ADIOS_STATE_DL 2 ++#define ADIOS_STATE_BQ 4 ++ ++// Temporal granularity of the deadline tree node (dl_group) ++#define ADIOS_QUANTUM_SHIFT 20 ++ ++#define ADIOS_MAX_INSERTS_PER_LOCK 72 ++#define ADIOS_MAX_DELETES_PER_LOCK 24 ++ ++// Structure to hold latency bucket data for small requests ++struct latency_bucket_small { ++ u64 weighted_sum_latency; ++ u64 sum_of_weights; ++}; ++ ++// Structure to hold latency bucket data for large requests ++struct latency_bucket_large { ++ u64 weighted_sum_latency; ++ u64 weighted_sum_block_size; ++ u64 sum_of_weights; ++}; ++ ++// Structure to hold per-cpu buckets, improving data locality and code clarity. ++struct lm_buckets { ++ struct latency_bucket_small small_bucket[LM_LAT_BUCKET_COUNT]; ++ struct latency_bucket_large large_bucket[LM_LAT_BUCKET_COUNT]; ++}; ++ ++// Structure to hold RCU-protected latency model parameters ++struct latency_model_params { ++ u64 base; ++ u64 slope; ++ u64 small_sum_delay; ++ u64 small_count; ++ u64 large_sum_delay; ++ u64 large_sum_bsize; ++ u64 last_update_jiffies; ++ struct rcu_head rcu; ++}; ++ ++// Structure to hold the latency model context data ++struct latency_model { ++ spinlock_t update_lock; ++ struct latency_model_params __rcu *params; ++ ++ // Per-CPU buckets to avoid lock contention on the completion path ++ struct lm_buckets __percpu *pcpu_buckets; ++ ++ u32 lm_shrink_at_kreqs; ++ u32 lm_shrink_at_gbytes; ++ u8 lm_shrink_resist; ++}; ++ ++// Adios scheduler data ++struct adios_data { ++ spinlock_t pq_lock; ++ struct list_head prio_queue[2]; ++ ++ struct rb_root_cached dl_tree[2]; ++ spinlock_t lock; ++ s64 dl_bias; ++ s32 dl_prio[2]; ++ ++ atomic_t state; ++ u8 bq_state[ADIOS_BQ_PAGES]; ++ ++ void (*insert_request_fn)(struct blk_mq_hw_ctx *, struct request *, ++ blk_insert_t, struct list_head *); ++ ++ u64 global_latency_window; ++ u64 compliance_flags; ++ u64 latency_target[ADIOS_OPTYPES]; ++ u32 batch_limit[ADIOS_OPTYPES]; ++ u32 batch_actual_max_size[ADIOS_OPTYPES]; ++ u32 batch_actual_max_total; ++ u32 async_depth; ++ u32 lat_model_latency_limit; ++ u8 bq_refill_below_ratio; ++ u8 is_rotational; ++ u8 batch_order; ++ u8 elv_direction; ++ sector_t head_pos; ++ sector_t last_completed_pos; ++ ++ bool bq_page; ++ struct list_head batch_queue[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; ++ u32 batch_count[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; ++ u8 bq_batch_order[ADIOS_BQ_PAGES]; ++ spinlock_t bq_lock; ++ ++ struct lm_buckets *aggr_buckets; ++ ++ struct latency_model latency_model[ADIOS_OPTYPES]; ++ struct timer_list update_timer; ++ ++ atomic64_t total_pred_lat; ++ u64 last_completed_time; ++ ++ struct kmem_cache *rq_data_pool; ++ struct kmem_cache *dl_group_pool; ++ ++ struct request_queue *queue; ++}; ++ ++// List of requests with the same deadline in the deadline-sorted tree ++struct dl_group { ++ struct rb_node node; ++ struct list_head rqs; ++ u64 deadline; ++} __attribute__((aligned(64))); ++ ++// Structure to hold scheduler-specific data for each request ++struct adios_rq_data { ++ struct list_head *dl_group; ++ struct list_head dl_node; ++ ++ struct request *rq; ++ u64 deadline; ++ u64 pred_lat; ++ u32 block_size; ++} __attribute__((aligned(64))); ++ ++static const int adios_prio_to_wmult[40] = { ++ /* -20 */ 88761, 71755, 56483, 46273, 36291, ++ /* -15 
*/ 29154, 23254, 18705, 14949, 11916, ++ /* -10 */ 9548, 7620, 6100, 4904, 3906, ++ /* -5 */ 3121, 2501, 1991, 1586, 1277, ++ /* 0 */ 1024, 820, 655, 526, 423, ++ /* 5 */ 335, 272, 215, 172, 137, ++ /* 10 */ 110, 87, 70, 56, 45, ++ /* 15 */ 36, 29, 23, 18, 15, ++}; ++ ++static inline bool compliant(struct adios_data *ad, u32 flag) { ++ return ad->compliance_flags & flag; ++} ++ ++// Count the number of entries in aggregated small buckets ++static u64 lm_count_small_entries(struct latency_bucket_small *buckets) { ++ u64 total_weight = 0; ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) ++ total_weight += buckets[i].sum_of_weights; ++ return total_weight; ++} ++ ++// Update the small buckets in the latency model from aggregated data ++static bool lm_update_small_buckets(struct latency_model *model, ++ struct latency_model_params *params, ++ struct latency_bucket_small *buckets, ++ u64 total_weight, bool count_all) { ++ u64 sum_latency = 0; ++ u64 sum_weight = 0; ++ u64 cumulative_weight = 0, threshold_weight = 0; ++ u8 outlier_threshold_bucket = 0; ++ u8 outlier_percentile = LM_OUTLIER_PERCENTILE; ++ u8 reduction; ++ ++ if (count_all) ++ outlier_percentile = 100; ++ ++ // Calculate the threshold weight for outlier detection ++ threshold_weight = (total_weight * outlier_percentile) / 100; ++ ++ // Identify the bucket that corresponds to the outlier threshold ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { ++ cumulative_weight += buckets[i].sum_of_weights; ++ if (cumulative_weight >= threshold_weight) { ++ outlier_threshold_bucket = i; ++ break; ++ } ++ } ++ ++ // Calculate the average latency, excluding outliers ++ for (u8 i = 0; i <= outlier_threshold_bucket; i++) { ++ struct latency_bucket_small *bucket = &buckets[i]; ++ if (i < outlier_threshold_bucket) { ++ sum_latency += bucket->weighted_sum_latency; ++ sum_weight += bucket->sum_of_weights; ++ } else { ++ // The threshold bucket's contribution is proportional ++ u64 remaining_weight = ++ threshold_weight - (cumulative_weight - bucket->sum_of_weights); ++ if (bucket->sum_of_weights > 0) { ++ sum_latency += div_u64(bucket->weighted_sum_latency * ++ remaining_weight, bucket->sum_of_weights); ++ sum_weight += remaining_weight; ++ } ++ } ++ } ++ ++ // Shrink the model if it reaches at the readjustment threshold ++ if (params->small_count >= 1000ULL * model->lm_shrink_at_kreqs) { ++ reduction = model->lm_shrink_resist; ++ if (params->small_count >> reduction) { ++ params->small_sum_delay -= params->small_sum_delay >> reduction; ++ params->small_count -= params->small_count >> reduction; ++ } ++ } ++ ++ if (!sum_weight) ++ return false; ++ ++ // Accumulate the average latency into the statistics ++ params->small_sum_delay += sum_latency; ++ params->small_count += sum_weight; ++ ++ return true; ++} ++ ++// Count the number of entries in aggregated large buckets ++static u64 lm_count_large_entries(struct latency_bucket_large *buckets) { ++ u64 total_weight = 0; ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) ++ total_weight += buckets[i].sum_of_weights; ++ return total_weight; ++} ++ ++// Update the large buckets in the latency model from aggregated data ++static bool lm_update_large_buckets(struct latency_model *model, ++ struct latency_model_params *params, ++ struct latency_bucket_large *buckets, ++ u64 total_weight, bool count_all) { ++ s64 sum_latency = 0; ++ u64 sum_block_size = 0, intercept; ++ u64 cumulative_weight = 0, threshold_weight = 0; ++ u64 sum_weight = 0; ++ u8 outlier_threshold_bucket = 0; ++ u8 outlier_percentile = 
LM_OUTLIER_PERCENTILE; ++ u8 reduction; ++ ++ if (count_all) ++ outlier_percentile = 100; ++ ++ // Calculate the threshold weight for outlier detection ++ threshold_weight = (total_weight * outlier_percentile) / 100; ++ ++ // Identify the bucket that corresponds to the outlier threshold ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { ++ cumulative_weight += buckets[i].sum_of_weights; ++ if (cumulative_weight >= threshold_weight) { ++ outlier_threshold_bucket = i; ++ break; ++ } ++ } ++ ++ // Calculate the average latency and block size, excluding outliers ++ for (u8 i = 0; i <= outlier_threshold_bucket; i++) { ++ struct latency_bucket_large *bucket = &buckets[i]; ++ if (i < outlier_threshold_bucket) { ++ sum_latency += bucket->weighted_sum_latency; ++ sum_block_size += bucket->weighted_sum_block_size; ++ sum_weight += bucket->sum_of_weights; ++ } else { ++ // The threshold bucket's contribution is proportional ++ u64 remaining_weight = ++ threshold_weight - (cumulative_weight - bucket->sum_of_weights); ++ if (bucket->sum_of_weights > 0) { ++ sum_latency += div_u64(bucket->weighted_sum_latency * ++ remaining_weight, bucket->sum_of_weights); ++ sum_block_size += div_u64(bucket->weighted_sum_block_size * ++ remaining_weight, bucket->sum_of_weights); ++ sum_weight += remaining_weight; ++ } ++ } ++ } ++ ++ if (!sum_weight) ++ return false; ++ ++ // Shrink the model if it reaches at the readjustment threshold ++ if (params->large_sum_bsize >= 0x40000000ULL * model->lm_shrink_at_gbytes) { ++ reduction = model->lm_shrink_resist; ++ if (params->large_sum_bsize >> reduction) { ++ params->large_sum_delay -= params->large_sum_delay >> reduction; ++ params->large_sum_bsize -= params->large_sum_bsize >> reduction; ++ } ++ } ++ ++ // Accumulate the average delay into the statistics ++ intercept = params->base; ++ if (sum_latency > intercept) ++ sum_latency -= intercept; ++ ++ params->large_sum_delay += sum_latency; ++ params->large_sum_bsize += sum_block_size; ++ ++ return true; ++} ++ ++static void reset_buckets(struct lm_buckets *buckets) ++{ memset(buckets, 0, sizeof(*buckets)); } ++ ++static void lm_reset_pcpu_buckets(struct latency_model *model) { ++ int cpu; ++ for_each_possible_cpu(cpu) ++ reset_buckets(per_cpu_ptr(model->pcpu_buckets, cpu)); ++} ++ ++// Update the latency model parameters and statistics ++static void latency_model_update( ++ struct adios_data *ad, struct latency_model *model) { ++ u64 now; ++ u64 small_weight, large_weight; ++ bool time_elapsed; ++ bool small_processed = false, large_processed = false; ++ struct lm_buckets *aggr = ad->aggr_buckets; ++ struct latency_bucket_small *asb; ++ struct latency_bucket_large *alb; ++ struct lm_buckets *pcpu_b; ++ unsigned long flags; ++ int cpu; ++ struct latency_model_params *old_params, *new_params; ++ ++ spin_lock_irqsave(&model->update_lock, flags); ++ ++ old_params = rcu_dereference_protected(model->params, ++ lockdep_is_held(&model->update_lock)); ++ new_params = kmemdup(old_params, sizeof(*new_params), GFP_ATOMIC); ++ if (!new_params) { ++ spin_unlock_irqrestore(&model->update_lock, flags); ++ return; ++ } ++ ++ // Aggregate data from all CPUs and reset per-cpu buckets. 
++ for_each_possible_cpu(cpu) { ++ pcpu_b = per_cpu_ptr(model->pcpu_buckets, cpu); ++ ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { ++ if (pcpu_b->small_bucket[i].sum_of_weights) { ++ asb = &aggr->small_bucket[i]; ++ asb->sum_of_weights += ++ pcpu_b->small_bucket[i].sum_of_weights; ++ asb->weighted_sum_latency += ++ pcpu_b->small_bucket[i].weighted_sum_latency; ++ } ++ if (pcpu_b->large_bucket[i].sum_of_weights) { ++ alb = &aggr->large_bucket[i]; ++ alb->sum_of_weights += ++ pcpu_b->large_bucket[i].sum_of_weights; ++ alb->weighted_sum_latency += ++ pcpu_b->large_bucket[i].weighted_sum_latency; ++ alb->weighted_sum_block_size += ++ pcpu_b->large_bucket[i].weighted_sum_block_size; ++ } ++ } ++ // Reset per-cpu buckets after aggregating ++ reset_buckets(pcpu_b); ++ } ++ ++ // Count the number of entries in aggregated buckets ++ small_weight = lm_count_small_entries(aggr->small_bucket); ++ large_weight = lm_count_large_entries(aggr->large_bucket); ++ ++ // Whether enough time has elapsed since the last update ++ now = jiffies; ++ time_elapsed = unlikely(!new_params->base) || ++ new_params->last_update_jiffies + ++ msecs_to_jiffies(LM_INTERVAL_THRESHOLD) <= now; ++ ++ // Update small buckets ++ if (small_weight && (time_elapsed || ++ LM_SAMPLES_THRESHOLD <= small_weight || !new_params->base)) { ++ small_processed = lm_update_small_buckets(model, new_params, ++ aggr->small_bucket, small_weight, !new_params->base); ++ memset(&aggr->small_bucket[0], 0, sizeof(aggr->small_bucket)); ++ } ++ // Update large buckets ++ if (large_weight && (time_elapsed || ++ LM_SAMPLES_THRESHOLD <= large_weight || !new_params->slope)) { ++ large_processed = lm_update_large_buckets(model, new_params, ++ aggr->large_bucket, large_weight, !new_params->slope); ++ memset(&aggr->large_bucket[0], 0, sizeof(aggr->large_bucket)); ++ } ++ ++ // Update the base parameter if small bucket was processed ++ if (small_processed && likely(new_params->small_count)) ++ new_params->base = div_u64(new_params->small_sum_delay, ++ new_params->small_count); ++ ++ // Update the slope parameter if large bucket was processed ++ if (large_processed && likely(new_params->large_sum_bsize)) ++ new_params->slope = div_u64(new_params->large_sum_delay, ++ DIV_ROUND_UP_ULL(new_params->large_sum_bsize, 1024)); ++ ++ // Update last updated jiffies if update happened or time has elapsed ++ if (small_processed || large_processed || time_elapsed) ++ new_params->last_update_jiffies = now; ++ ++ rcu_assign_pointer(model->params, new_params); ++ spin_unlock_irqrestore(&model->update_lock, flags); ++ ++ kfree_rcu(old_params, rcu); ++} ++ ++// Determine the bucket index for a given measured and predicted latency ++static u8 lm_input_bucket_index(u64 measured, u64 predicted) { ++ u8 bucket_index; ++ ++ if (measured < predicted * 2) ++ bucket_index = div_u64((measured * 20), predicted); ++ else if (measured < predicted * 5) ++ bucket_index = div_u64((measured * 10), predicted) + 20; ++ else ++ bucket_index = div_u64((measured * 3), predicted) + 40; ++ ++ return bucket_index; ++} ++ ++// Input latency data into the latency model ++static void latency_model_input(struct adios_data *ad, ++ struct latency_model *model, ++ u32 block_size, u64 latency, u64 pred_lat, u32 weight) { ++ unsigned long flags; ++ u8 bucket_index; ++ struct lm_buckets *buckets; ++ u64 current_base; ++ struct latency_model_params *params; ++ ++ local_irq_save(flags); ++ buckets = per_cpu_ptr(model->pcpu_buckets, __smp_processor_id()); ++ ++ rcu_read_lock(); ++ params = 
rcu_dereference(model->params); ++ current_base = params->base; ++ rcu_read_unlock(); ++ ++ if (block_size <= LM_BLOCK_SIZE_THRESHOLD) { ++ // Handle small requests ++ bucket_index = lm_input_bucket_index(latency, current_base ?: 1); ++ ++ if (bucket_index >= LM_LAT_BUCKET_COUNT) ++ bucket_index = LM_LAT_BUCKET_COUNT - 1; ++ ++ buckets->small_bucket[bucket_index].sum_of_weights += weight; ++ buckets->small_bucket[bucket_index].weighted_sum_latency += ++ latency * weight; ++ ++ local_irq_restore(flags); ++ ++ if (unlikely(!current_base)) { ++ latency_model_update(ad, model); ++ return; ++ } ++ } else { ++ // Handle large requests ++ if (!current_base || !pred_lat) { ++ local_irq_restore(flags); ++ return; ++ } ++ ++ bucket_index = lm_input_bucket_index(latency, pred_lat); ++ ++ if (bucket_index >= LM_LAT_BUCKET_COUNT) ++ bucket_index = LM_LAT_BUCKET_COUNT - 1; ++ ++ buckets->large_bucket[bucket_index].sum_of_weights += weight; ++ buckets->large_bucket[bucket_index].weighted_sum_latency += ++ latency * weight; ++ buckets->large_bucket[bucket_index].weighted_sum_block_size += ++ block_size * weight; ++ ++ local_irq_restore(flags); ++ } ++} ++ ++// Predict the latency for a given block size using the latency model ++static u64 latency_model_predict(struct latency_model *model, u32 block_size) { ++ u64 result; ++ struct latency_model_params *params; ++ ++ rcu_read_lock(); ++ params = rcu_dereference(model->params); ++ ++ result = params->base; ++ if (block_size > LM_BLOCK_SIZE_THRESHOLD) ++ result += params->slope * ++ DIV_ROUND_UP_ULL(block_size - LM_BLOCK_SIZE_THRESHOLD, 1024); ++ ++ rcu_read_unlock(); ++ ++ return result; ++} ++ ++// Determine the type of operation based on request flags ++static u8 adios_optype(struct request *rq) { ++ switch (rq->cmd_flags & REQ_OP_MASK) { ++ case REQ_OP_READ: ++ return ADIOS_READ; ++ case REQ_OP_WRITE: ++ return ADIOS_WRITE; ++ case REQ_OP_DISCARD: ++ return ADIOS_DISCARD; ++ default: ++ return ADIOS_OTHER; ++ } ++} ++ ++static inline u8 adios_optype_not_read(struct request *rq) { ++ return (rq->cmd_flags & REQ_OP_MASK) != REQ_OP_READ; ++} ++ ++// Helper function to retrieve adios_rq_data from a request ++static inline struct adios_rq_data *get_rq_data(struct request *rq) { ++ return rq->elv.priv[0]; ++} ++ ++static inline ++void set_adios_state(struct adios_data *ad, u32 shift, u32 idx, bool flag) { ++ if (flag) ++ atomic_or(1U << (idx + shift), &ad->state); ++ else ++ atomic_andnot(1U << (idx + shift), &ad->state); ++} ++ ++static inline u32 get_adios_state(struct adios_data *ad, u32 shift) ++{ return (atomic_read(&ad->state) >> shift) & 0x3; } ++ ++// Add a request to the deadline-sorted red-black tree ++static void add_to_dl_tree( ++ struct adios_data *ad, bool dl_idx, struct request *rq) { ++ struct rb_root_cached *root = &ad->dl_tree[dl_idx]; ++ struct rb_node **link = &(root->rb_root.rb_node), *parent = NULL; ++ bool leftmost = true; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ struct dl_group *dlg; ++ u64 deadline; ++ bool was_empty = RB_EMPTY_ROOT(&root->rb_root); ++ ++ /* Tier-2: Synchronous Requests ++ * - Needs to be FIFO within a same optype ++ * - Relaxed order between different optypes ++ * - basically needs to be processed in early time */ ++ rd->deadline = rq->start_time_ns; ++ ++ /* Tier-3: Aynchronous Requests ++ * - Can be reordered and delayed freely */ ++ if (!(rq->cmd_flags & REQ_SYNC)) { ++ rd->deadline += ad->latency_target[adios_optype(rq)]; ++ if (!compliant(ad, ADIOS_CF_FIXORDER)) ++ rd->deadline += rd->pred_lat; ++ } ++ 
++ // Now quantize the deadline (-> dlg->deadline == RB-Tree key) ++ deadline = rd->deadline & ~((1ULL << ADIOS_QUANTUM_SHIFT) - 1); ++ ++ while (*link) { ++ dlg = rb_entry(*link, struct dl_group, node); ++ s64 diff = deadline - dlg->deadline; ++ ++ parent = *link; ++ if (diff < 0) { ++ link = &((*link)->rb_left); ++ } else if (diff > 0) { ++ link = &((*link)->rb_right); ++ leftmost = false; ++ } else { // diff == 0 ++ goto found; ++ } ++ } ++ ++ dlg = rb_entry_safe(parent, struct dl_group, node); ++ if (!dlg || dlg->deadline != deadline) { ++ dlg = kmem_cache_zalloc(ad->dl_group_pool, GFP_ATOMIC); ++ if (!dlg) ++ return; ++ dlg->deadline = deadline; ++ INIT_LIST_HEAD(&dlg->rqs); ++ rb_link_node(&dlg->node, parent, link); ++ rb_insert_color_cached(&dlg->node, root, leftmost); ++ } ++found: ++ list_add_tail(&rd->dl_node, &dlg->rqs); ++ rd->dl_group = &dlg->rqs; ++ ++ if (was_empty) ++ set_adios_state(ad, ADIOS_STATE_DL, dl_idx, true); ++} ++ ++// Remove a request from the deadline-sorted red-black tree ++static void del_from_dl_tree( ++ struct adios_data *ad, bool dl_idx, struct request *rq) { ++ struct rb_root_cached *root = &ad->dl_tree[dl_idx]; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ struct dl_group *dlg = container_of(rd->dl_group, struct dl_group, rqs); ++ ++ list_del_init(&rd->dl_node); ++ if (list_empty(&dlg->rqs)) { ++ rb_erase_cached(&dlg->node, root); ++ kmem_cache_free(ad->dl_group_pool, dlg); ++ } ++ rd->dl_group = NULL; ++ ++ if (RB_EMPTY_ROOT(&ad->dl_tree[dl_idx].rb_root)) ++ set_adios_state(ad, ADIOS_STATE_DL, dl_idx, false); ++} ++ ++// Remove a request from the scheduler ++static void remove_request(struct adios_data *ad, struct request *rq) { ++ bool dl_idx = adios_optype_not_read(rq); ++ struct request_queue *q = rq->q; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ ++ list_del_init(&rq->queuelist); ++ ++ // We might not be on the rbtree, if we are doing an insert merge ++ if (rd->dl_group) ++ del_from_dl_tree(ad, dl_idx, rq); ++ ++ elv_rqhash_del(q, rq); ++ if (q->last_merge == rq) ++ q->last_merge = NULL; ++} ++ ++// Convert a queue depth to the corresponding word depth for shallow allocation ++static int to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) { ++ struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; ++ const unsigned int nrr = hctx->queue->nr_requests; ++ ++ return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; ++} ++ ++// We limit the depth of request allocation for asynchronous and write requests ++static void adios_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { ++ struct adios_data *ad = data->q->elevator->elevator_data; ++ ++ // Do not throttle synchronous reads ++ if (op_is_sync(opf) && !op_is_write(opf)) ++ return; ++ ++ data->shallow_depth = to_word_depth(data->hctx, ad->async_depth); ++} ++ ++// The number of requests in the queue was notified from the block layer ++static void adios_depth_updated(struct blk_mq_hw_ctx *hctx) { ++ struct request_queue *q = hctx->queue; ++ struct adios_data *ad = q->elevator->elevator_data; ++ struct blk_mq_tags *tags = hctx->sched_tags; ++ ++ ad->async_depth = q->nr_requests; ++ ++ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); ++} ++ ++// Handle request merging after a merge operation ++static void adios_request_merged(struct request_queue *q, struct request *req, ++ enum elv_merge type) { ++ bool dl_idx = adios_optype_not_read(req); ++ struct adios_data *ad = q->elevator->elevator_data; ++ ++ // Reposition request in the deadline-sorted tree ++ del_from_dl_tree(ad, 
dl_idx, req); ++ add_to_dl_tree(ad, dl_idx, req); ++} ++ ++// Handle merging of requests after one has been merged into another ++static void adios_merged_requests(struct request_queue *q, struct request *req, ++ struct request *next) { ++ struct adios_data *ad = q->elevator->elevator_data; ++ ++ lockdep_assert_held(&ad->lock); ++ ++ // kill knowledge of next, this one is a goner ++ remove_request(ad, next); ++} ++ ++// Try to merge a bio into an existing rq before associating it with an rq ++static bool adios_bio_merge(struct request_queue *q, struct bio *bio, ++ unsigned int nr_segs) { ++ unsigned long flags; ++ struct adios_data *ad = q->elevator->elevator_data; ++ struct request *free = NULL; ++ bool ret; ++ ++ if (!spin_trylock_irqsave(&ad->lock, flags)) ++ return false; ++ ++ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); ++ spin_unlock_irqrestore(&ad->lock, flags); ++ ++ if (free) ++ blk_mq_free_request(free); ++ ++ return ret; ++} ++ ++// Insert a request into the scheduler (after Read & Write models stabilized) ++static void insert_request_post_stability(struct blk_mq_hw_ctx *hctx, ++ struct request *rq, blk_insert_t insert_flags, struct list_head *free) { ++ struct request_queue *q = hctx->queue; ++ struct adios_data *ad = q->elevator->elevator_data; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ bool dl_idx; ++ u8 optype = adios_optype(rq); ++ u8 insert_pq_flags = 0; ++ ++ rd->block_size = blk_rq_bytes(rq); ++ rd->pred_lat = ++ latency_model_predict(&ad->latency_model[optype], rd->block_size); ++ ++ /* Tier-0: BLK_MQ_INSERT_AT_HEAD Requests ++ * - Needs to be processed ASAP at all costs in any case */ ++ if (insert_flags & BLK_MQ_INSERT_AT_HEAD) ++ { insert_pq_flags |= 0x2; } ++ /* Tier-1: Integrity-sensitive Requests ++ * - Needs to be FIFO across all optypes */ ++ if ((compliant(ad, ADIOS_CF_PRIO_FUA) && (rq->cmd_flags & REQ_FUA)) || ++ (compliant(ad, ADIOS_CF_PRIO_PF ) && (rq->cmd_flags & REQ_PREFLUSH))) ++ { insert_pq_flags |= 0x1; } ++ ++ if (insert_pq_flags) { ++ u8 pq_idx = !(insert_pq_flags >> 1); ++ if (rd->pred_lat) ++ atomic64_add(rd->pred_lat, &ad->total_pred_lat); ++ scoped_guard(spinlock_irqsave, &ad->pq_lock) { ++ bool was_empty = list_empty(&ad->prio_queue[pq_idx]); ++ list_add_tail(&rq->queuelist, &ad->prio_queue[pq_idx]); ++ if (was_empty) ++ set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, true); ++ } ++ return; ++ } ++ ++ if (blk_mq_sched_try_insert_merge(q, rq, free)) ++ return; ++ ++ dl_idx = adios_optype_not_read(rq); ++ add_to_dl_tree(ad, dl_idx, rq); ++ ++ if (rq_mergeable(rq)) { ++ elv_rqhash_add(q, rq); ++ if (!q->last_merge) ++ q->last_merge = rq; ++ } ++} ++ ++// Insert a request into the scheduler (before Read & Write models stabilizes) ++static void insert_request_pre_stability(struct blk_mq_hw_ctx *hctx, ++ struct request *rq, blk_insert_t insert_flags, struct list_head *free) { ++ struct adios_data *ad = hctx->queue->elevator->elevator_data; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ u8 optype = adios_optype(rq); ++ u8 pq_idx = !(insert_flags & BLK_MQ_INSERT_AT_HEAD); ++ bool models_stable = false; ++ ++ rd->block_size = blk_rq_bytes(rq); ++ rd->pred_lat = ++ latency_model_predict(&ad->latency_model[optype], rd->block_size); ++ ++ if (rd->pred_lat) ++ atomic64_add(rd->pred_lat, &ad->total_pred_lat); ++ ++ scoped_guard(spinlock_irqsave, &ad->pq_lock) { ++ bool was_empty = list_empty(&ad->prio_queue[pq_idx]); ++ list_add_tail(&rq->queuelist, &ad->prio_queue[pq_idx]); ++ if (was_empty) ++ set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, 
true); ++ } ++ ++ rcu_read_lock(); ++ if (rcu_dereference(ad->latency_model[ADIOS_READ].params)->base > 0 && ++ rcu_dereference(ad->latency_model[ADIOS_WRITE].params)->base > 0) ++ models_stable = true; ++ rcu_read_unlock(); ++ ++ if (models_stable) ++ ad->insert_request_fn = insert_request_post_stability; ++} ++ ++// Insert multiple requests into the scheduler ++static void adios_insert_requests(struct blk_mq_hw_ctx *hctx, ++ struct list_head *list, ++ blk_insert_t insert_flags) { ++ struct request_queue *q = hctx->queue; ++ struct adios_data *ad = q->elevator->elevator_data; ++ struct request *rq; ++ bool stop = false; ++ LIST_HEAD(free); ++ ++ do { ++ scoped_guard(spinlock_irqsave, &ad->lock) ++ for (int i = 0; i < ADIOS_MAX_INSERTS_PER_LOCK; i++) { ++ if (list_empty(list)) { ++ stop = true; ++ break; ++ } ++ rq = list_first_entry(list, struct request, queuelist); ++ list_del_init(&rq->queuelist); ++ ad->insert_request_fn(hctx, rq, insert_flags, &free); ++ }} while (!stop); ++ ++ blk_mq_free_requests(&free); ++} ++ ++// Prepare a request before it is inserted into the scheduler ++static void adios_prepare_request(struct request *rq) { ++ struct adios_data *ad = rq->q->elevator->elevator_data; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ ++ rq->elv.priv[0] = NULL; ++ ++ /* Allocate adios_rq_data from the memory pool */ ++ rd = kmem_cache_zalloc(ad->rq_data_pool, GFP_ATOMIC); ++ if (WARN(!rd, "adios_prepare_request: " ++ "Failed to allocate memory from rq_data_pool. rd is NULL\n")) ++ return; ++ ++ rd->rq = rq; ++ rq->elv.priv[0] = rd; ++} ++ ++static struct adios_rq_data *get_dl_first_rd(struct adios_data *ad, bool idx) { ++ struct rb_root_cached *root = &ad->dl_tree[idx]; ++ struct rb_node *first = rb_first_cached(root); ++ struct dl_group *dl_group = rb_entry(first, struct dl_group, node); ++ ++ return list_first_entry(&dl_group->rqs, struct adios_rq_data, dl_node); ++} ++ ++// Comparison function for sorting requests by block address ++static int cmp_rq_pos(void *priv, ++ const struct list_head *a, const struct list_head *b) { ++ struct request *rq_a = list_entry(a, struct request, queuelist); ++ struct request *rq_b = list_entry(b, struct request, queuelist); ++ u64 pos_a = blk_rq_pos(rq_a); ++ u64 pos_b = blk_rq_pos(rq_b); ++ ++ return (int)(pos_a > pos_b) - (int)(pos_a < pos_b); ++} ++ ++#ifndef list_last_entry_or_null ++#define list_last_entry_or_null(ptr, type, member) \ ++ (!list_empty(ptr) ? 
list_last_entry(ptr, type, member) : NULL) ++#endif ++ ++// Update the elevator direction ++static void update_elv_direction(struct adios_data *ad) { ++ if (!ad->is_rotational) ++ return; ++ ++ bool page = ad->bq_page; ++ struct list_head *q = &ad->batch_queue[page][1]; ++ if (ad->bq_batch_order[page] < ADIOS_BO_ELEVATOR || list_empty(q)) { ++ ad->elv_direction = 0; ++ return; ++ } ++ ++ // Get first and last request positions in the queue ++ struct request *rq_a = list_first_entry(q, struct request, queuelist); ++ struct request *rq_b = list_last_entry (q, struct request, queuelist); ++ u64 pos_a = blk_rq_pos(rq_a); ++ u64 pos_b = blk_rq_pos(rq_b); ++ u64 avg_rq_pos = (pos_a + pos_b) >> 1; ++ ++ ad->elv_direction = !!(ad->head_pos > avg_rq_pos); ++} ++ ++// Fill the batch queues with requests from the deadline-sorted red-black tree ++static bool fill_batch_queues(struct adios_data *ad, u64 tpl) { ++ struct adios_rq_data *rd; ++ struct request *rq; ++ struct list_head *dest_q; ++ u8 dest_idx; ++ u64 added_lat = 0; ++ u32 optype_count[ADIOS_OPTYPES] = {0}; ++ u32 count = 0; ++ u8 optype; ++ bool page = !ad->bq_page, dl_idx, bias_idx, update_bias; ++ u32 dl_queued; ++ u8 bq_batch_order; ++ bool stop = false; ++ ++ // Reset batch queue counts for the back page ++ memset(&ad->batch_count[page], 0, sizeof(ad->batch_count[page])); ++ ++ ad->bq_batch_order[page] = ++ bq_batch_order = ad->batch_order; ++ ++ do { ++ scoped_guard(spinlock_irqsave, &ad->lock) ++ for (int i = 0; i < ADIOS_MAX_DELETES_PER_LOCK; i++) { ++ bool has_base = false; ++ ++ dl_queued = get_adios_state(ad, ADIOS_STATE_DL); ++ // Check if there are any requests queued in the deadline tree ++ if (!dl_queued) { ++ stop = true; ++ break; ++ } ++ ++ // Reads if both queues have requests, otherwise pick the non-empty. ++ dl_idx = dl_queued >> 1; ++ ++ // Get the first request from the deadline-sorted tree ++ rd = get_dl_first_rd(ad, dl_idx); ++ ++ bias_idx = ad->dl_bias < 0; ++ // If read and write requests are queued, choose one based on bias ++ if (dl_queued == 0x3) { ++ struct adios_rq_data *trd[2] = {get_dl_first_rd(ad, 0), rd}; ++ rd = trd[bias_idx]; ++ ++ update_bias = (trd[bias_idx]->deadline > trd[!bias_idx]->deadline); ++ } else ++ update_bias = (bias_idx == dl_idx); ++ ++ rq = rd->rq; ++ optype = adios_optype(rq); ++ ++ rcu_read_lock(); ++ has_base = ++ !!rcu_dereference(ad->latency_model[optype].params)->base; ++ rcu_read_unlock(); ++ ++ // Check batch size and total predicted latency ++ if (count && (!has_base || ++ ad->batch_count[page][optype] >= ad->batch_limit[optype] || ++ (tpl + added_lat + rd->pred_lat) > ad->global_latency_window)) { ++ stop = true; ++ break; ++ } ++ ++ if (update_bias) { ++ s64 sign = ((s64)bias_idx << 1) - 1; ++ if (unlikely(!rd->pred_lat)) ++ ad->dl_bias = sign; ++ else ++ // Adjust the bias based on the predicted latency ++ ad->dl_bias += sign * (s64)((rd->pred_lat * ++ adios_prio_to_wmult[ad->dl_prio[bias_idx] + 20]) >> 10); ++ } ++ ++ remove_request(ad, rq); ++ ++ // Add request to the corresponding batch queue ++ dest_idx = (bq_batch_order == ADIOS_BO_OPTYPE || optype == ADIOS_OTHER)? 
++ optype : !!(rd->deadline != rq->start_time_ns); ++ dest_q = &ad->batch_queue[page][dest_idx]; ++ list_add_tail(&rq->queuelist, dest_q); ++ ad->bq_state[page] |= 1U << dest_idx; ++ ad->batch_count[page][optype]++; ++ optype_count[optype]++; ++ added_lat += rd->pred_lat; ++ count++; ++ }} while (!stop); ++ ++ if (bq_batch_order == ADIOS_BO_ELEVATOR && ad->batch_count[page][1] > 1) ++ list_sort(NULL, &ad->batch_queue[page][1], cmp_rq_pos); ++ ++ if (count) { ++ if (added_lat) ++ atomic64_add(added_lat, &ad->total_pred_lat); ++ ++ set_adios_state(ad, ADIOS_STATE_BQ, page, true); ++ ++ for (optype = 0; optype < ADIOS_OPTYPES; optype++) ++ if (ad->batch_actual_max_size[optype] < optype_count[optype]) ++ ad->batch_actual_max_size[optype] = optype_count[optype]; ++ if (ad->batch_actual_max_total < count) ++ ad->batch_actual_max_total = count; ++ } ++ return count; ++} ++ ++// Flip to the next batch queue page ++static void flip_bq_page(struct adios_data *ad) { ++ ad->bq_page = !ad->bq_page; ++ update_elv_direction(ad); ++} ++ ++// Pop a request from the specified index (optype or elevator tier) ++static inline struct request *pop_bq_request( ++ struct adios_data *ad, u8 idx, bool direction) { ++ bool page = ad->bq_page; ++ struct list_head *q = &ad->batch_queue[page][idx]; ++ struct request *rq = direction ? ++ list_last_entry_or_null (q, struct request, queuelist): ++ list_first_entry_or_null(q, struct request, queuelist); ++ if (rq) { ++ list_del_init(&rq->queuelist); ++ if (list_empty(q)) ++ ad->bq_state[page] &= ~(1U << idx); ++ } ++ return rq; ++} ++ ++static struct request *pop_next_bq_request_optype(struct adios_data *ad) { ++ u32 bq_state = ad->bq_state[ad->bq_page]; ++ if (!bq_state) return NULL; ++ ++ struct request *rq; ++ u32 bq_idx = 31 - __builtin_clz(bq_state); ++ ++ // Dispatch based on optype (FIFO within each) or single-queue elevator ++ rq = pop_bq_request(ad, bq_idx, false); ++ return rq; ++} ++ ++static struct request *pop_next_bq_request_elevator(struct adios_data *ad) { ++ u32 bq_state = ad->bq_state[ad->bq_page]; ++ if (!bq_state) return NULL; ++ ++ struct request *rq; ++ u32 bq_idx = 31 - __builtin_clz(bq_state); ++ bool direction = (bq_idx == 1) & ad->elv_direction; ++ ++ // Tier-2 (sync) is always high priority ++ // Tier-3 (async) uses the pre-calculated elevator direction ++ rq = pop_bq_request(ad, bq_idx, direction); ++ ++ /* If batch queue for the sync requests just became empty */ ++ if (bq_idx == 0 && rq && !(bq_state & 0x1)) ++ update_elv_direction(ad); ++ ++ return rq; ++} ++ ++// Returns the state of the other batch queue page ++static bool more_bq_ready(struct adios_data *ad, bool page) { ++ u32 state = get_adios_state(ad, ADIOS_STATE_BQ); ++ return state & (1U << !page); ++} ++ ++// Dispatch a request from the batch queues ++static struct request *dispatch_from_bq(struct adios_data *ad) { ++ struct request *rq; ++ ++ guard(spinlock_irqsave)(&ad->bq_lock); ++ ++ u64 tpl = atomic64_read(&ad->total_pred_lat); ++ ++ if (!more_bq_ready(ad, ad->bq_page) && (!tpl || tpl < div_u64( ++ ad->global_latency_window * ad->bq_refill_below_ratio, 100))) ++ fill_batch_queues(ad, tpl); ++ ++again: ++ // Use the per-page state to decide the dispatch logic, ensuring correctness ++ rq = (ad->bq_batch_order[ad->bq_page] == ADIOS_BO_ELEVATOR) ? 
++ pop_next_bq_request_elevator(ad): ++ pop_next_bq_request_optype(ad); ++ ++ if (rq) { ++ bool page = ad->bq_page; ++ bool is_empty = !ad->bq_state[page]; ++ if (is_empty) ++ set_adios_state(ad, ADIOS_STATE_BQ, page, false); ++ return rq; ++ } ++ ++ // If there's more batch queue page available, flip to it and retry ++ if (more_bq_ready(ad, ad->bq_page)) { ++ flip_bq_page(ad); ++ goto again; ++ } ++ ++ return NULL; ++} ++ ++// Dispatch a request from the priority queue ++static struct request *dispatch_from_pq(struct adios_data *ad) { ++ struct request *rq = NULL; ++ ++ guard(spinlock_irqsave)(&ad->pq_lock); ++ u32 pq_state = get_adios_state(ad, ADIOS_STATE_PQ); ++ u8 pq_idx = pq_state >> 1; ++ struct list_head *q = &ad->prio_queue[pq_idx]; ++ ++ if (unlikely(list_empty(q))) return NULL; ++ ++ rq = list_first_entry(q, struct request, queuelist); ++ list_del_init(&rq->queuelist); ++ if (list_empty(q)) { ++ set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, false); ++ update_elv_direction(ad); ++ } ++ return rq; ++} ++ ++// Dispatch a request to the hardware queue ++static struct request *adios_dispatch_request(struct blk_mq_hw_ctx *hctx) { ++ struct adios_data *ad = hctx->queue->elevator->elevator_data; ++ struct request *rq; ++ ++ rq = dispatch_from_pq(ad); ++ if (rq) goto found; ++ rq = dispatch_from_bq(ad); ++ if (!rq) return NULL; ++found: ++ if (ad->is_rotational) ++ ad->head_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ rq->rq_flags |= RQF_STARTED; ++ return rq; ++} ++ ++// Timer callback function to periodically update latency models ++static void update_timer_callback(struct timer_list *t) { ++ struct adios_data *ad = timer_container_of(ad, t, update_timer); ++ ++ for (u8 optype = 0; optype < ADIOS_OPTYPES; optype++) ++ latency_model_update(ad, &ad->latency_model[optype]); ++} ++ ++// Handle the completion of a request ++static void adios_completed_request(struct request *rq, u64 now) { ++ struct adios_data *ad = rq->q->elevator->elevator_data; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ ++ u64 tpl_after = atomic64_sub_return(rd->pred_lat, &ad->total_pred_lat); ++ u8 optype = adios_optype(rq); ++ ++ if (optype == ADIOS_OTHER) { ++ // Non-positional commands make the head position unpredictable. ++ // Invalidate our knowledge of the last completed position. ++ if (ad->is_rotational) ++ ad->last_completed_pos = 0; ++ return; ++ } ++ ++ u64 lct = ad->last_completed_time ?: rq->io_start_time_ns; ++ ad->last_completed_time = (tpl_after) ? now : 0; ++ ++ if (!rq->io_start_time_ns || !rd->block_size || unlikely(now < lct)) ++ return; ++ ++ u64 latency = now - lct; ++ if (latency > ad->lat_model_latency_limit) ++ return; ++ ++ u32 weight = 1; ++ if (ad->is_rotational) { ++ sector_t current_pos = blk_rq_pos(rq); ++ // Only calculate seek distance if we have a valid last position. ++ if (ad->last_completed_pos > 0) { ++ u64 seek_distance = abs( ++ (s64)current_pos - (s64)ad->last_completed_pos); ++ weight = 65 - __builtin_clzll(seek_distance); ++ } ++ // Update (or re-synchronize) our knowledge of the head position. 
++ ad->last_completed_pos = current_pos + blk_rq_sectors(rq); ++ } ++ ++ latency_model_input(ad, &ad->latency_model[optype], ++ rd->block_size, latency, rd->pred_lat, weight); ++ timer_reduce(&ad->update_timer, jiffies + msecs_to_jiffies(100)); ++} ++ ++// Clean up after a request is finished ++static void adios_finish_request(struct request *rq) { ++ struct adios_data *ad = rq->q->elevator->elevator_data; ++ ++ if (rq->elv.priv[0]) { ++ // Free adios_rq_data back to the memory pool ++ kmem_cache_free(ad->rq_data_pool, get_rq_data(rq)); ++ rq->elv.priv[0] = NULL; ++ } ++} ++ ++// Check if there are any requests available for dispatch ++static bool adios_has_work(struct blk_mq_hw_ctx *hctx) { ++ struct adios_data *ad = hctx->queue->elevator->elevator_data; ++ ++ return atomic_read(&ad->state) != 0; ++} ++ ++// Initialize the scheduler-specific data for a hardware queue ++static int adios_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { ++ adios_depth_updated(hctx); ++ return 0; ++} ++ ++// Initialize the scheduler-specific data when initializing the request queue ++static int adios_init_sched(struct request_queue *q, struct elevator_type *e) { ++ struct adios_data *ad; ++ struct elevator_queue *eq; ++ int ret = -ENOMEM; ++ u8 optype = 0; ++ ++ eq = elevator_alloc(q, e); ++ if (!eq) { ++ pr_err("adios: Failed to allocate the elevator\n"); ++ return ret; ++ } ++ ++ ad = kzalloc_node(sizeof(*ad), GFP_KERNEL, q->node); ++ if (!ad) { ++ pr_err("adios: Failed to create adios_data\n"); ++ goto put_eq; ++ } ++ ++ // Create a memory pool for adios_rq_data ++ ad->rq_data_pool = kmem_cache_create("rq_data_pool", ++ sizeof(struct adios_rq_data), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (!ad->rq_data_pool) { ++ pr_err("adios: Failed to create rq_data_pool\n"); ++ goto free_ad; ++ } ++ ++ /* Create a memory pool for dl_group */ ++ ad->dl_group_pool = kmem_cache_create("dl_group_pool", ++ sizeof(struct dl_group), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (!ad->dl_group_pool) { ++ pr_err("adios: Failed to create dl_group_pool\n"); ++ goto destroy_rq_data_pool; ++ } ++ ++ for (int i = 0; i < ADIOS_PQ_LEVELS; i++) ++ INIT_LIST_HEAD(&ad->prio_queue[i]); ++ ++ for (u8 i = 0; i < ADIOS_DL_TYPES; i++) { ++ ad->dl_tree[i] = RB_ROOT_CACHED; ++ ad->dl_prio[i] = default_dl_prio[i]; ++ } ++ ad->dl_bias = 0; ++ ++ for (u8 page = 0; page < ADIOS_BQ_PAGES; page++) ++ for (optype = 0; optype < ADIOS_OPTYPES; optype++) ++ INIT_LIST_HEAD(&ad->batch_queue[page][optype]); ++ ++ ad->aggr_buckets = kzalloc(sizeof(*ad->aggr_buckets), GFP_KERNEL); ++ if (!ad->aggr_buckets) { ++ pr_err("adios: Failed to allocate aggregation buckets\n"); ++ goto destroy_dl_group_pool; ++ } ++ ++ for (optype = 0; optype < ADIOS_OPTYPES; optype++) { ++ struct latency_model *model = &ad->latency_model[optype]; ++ struct latency_model_params *params; ++ ++ spin_lock_init(&model->update_lock); ++ params = kzalloc(sizeof(*params), GFP_KERNEL); ++ if (!params) { ++ pr_err("adios: Failed to allocate latency_model_params\n"); ++ goto free_buckets; ++ } ++ params->last_update_jiffies = jiffies; ++ RCU_INIT_POINTER(model->params, params); ++ ++ model->pcpu_buckets = alloc_percpu(struct lm_buckets); ++ if (!model->pcpu_buckets) { ++ pr_err("adios: Failed to allocate per-CPU buckets\n"); ++ kfree(params); ++ goto free_buckets; ++ } ++ ++ model->lm_shrink_at_kreqs = default_lm_shrink_at_kreqs; ++ model->lm_shrink_at_gbytes = default_lm_shrink_at_gbytes; ++ model->lm_shrink_resist = default_lm_shrink_resist; ++ } ++ ++ for (optype = 0; optype < 
ADIOS_OPTYPES; optype++) { ++ ad->latency_target[optype] = default_latency_target[optype]; ++ ad->batch_limit[optype] = default_batch_limit[optype]; ++ } ++ ++ eq->elevator_data = ad; ++ ++ ad->is_rotational = !!(q->limits.features & BLK_FEAT_ROTATIONAL); ++ ad->global_latency_window = (ad->is_rotational)? ++ default_global_latency_window_rotational: ++ default_global_latency_window; ++ ad->bq_refill_below_ratio = default_bq_refill_below_ratio; ++ ad->lat_model_latency_limit = default_lat_model_latency_limit; ++ ad->batch_order = default_batch_order; ++ ad->compliance_flags = default_compliance_flags; ++ ++ ad->insert_request_fn = insert_request_pre_stability; ++ ++ atomic_set(&ad->state, 0); ++ ++ spin_lock_init(&ad->lock); ++ spin_lock_init(&ad->pq_lock); ++ spin_lock_init(&ad->bq_lock); ++ ++ timer_setup(&ad->update_timer, update_timer_callback, 0); ++ ++ /* We dispatch from request queue wide instead of hw queue */ ++ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); ++ ++ ad->queue = q; ++ blk_stat_enable_accounting(q); ++ ++ q->elevator = eq; ++ return 0; ++ ++free_buckets: ++ pr_err("adios: Failed to allocate per-cpu buckets\n"); ++ while (optype-- > 0) { ++ struct latency_model *prev_model = &ad->latency_model[optype]; ++ kfree(rcu_access_pointer(prev_model->params)); ++ free_percpu(prev_model->pcpu_buckets); ++ } ++ kfree(ad->aggr_buckets); ++destroy_dl_group_pool: ++ kmem_cache_destroy(ad->dl_group_pool); ++destroy_rq_data_pool: ++ kmem_cache_destroy(ad->rq_data_pool); ++free_ad: ++ kfree(ad); ++put_eq: ++ kobject_put(&eq->kobj); ++ return ret; ++} ++ ++// Clean up and free resources when exiting the scheduler ++static void adios_exit_sched(struct elevator_queue *e) { ++ struct adios_data *ad = e->elevator_data; ++ ++ timer_shutdown_sync(&ad->update_timer); ++ ++ for (int i = 0; i < 2; i++) ++ WARN_ON_ONCE(!list_empty(&ad->prio_queue[i])); ++ ++ for (u8 i = 0; i < ADIOS_OPTYPES; i++) { ++ struct latency_model *model = &ad->latency_model[i]; ++ struct latency_model_params *params = rcu_access_pointer(model->params); ++ ++ RCU_INIT_POINTER(model->params, NULL); ++ kfree_rcu(params, rcu); ++ ++ free_percpu(model->pcpu_buckets); ++ } ++ ++ synchronize_rcu(); ++ ++ kfree(ad->aggr_buckets); ++ ++ if (ad->rq_data_pool) ++ kmem_cache_destroy(ad->rq_data_pool); ++ ++ if (ad->dl_group_pool) ++ kmem_cache_destroy(ad->dl_group_pool); ++ ++ blk_stat_disable_accounting(ad->queue); ++ ++ kfree(ad); ++} ++ ++static void sideload_latency_model( ++ struct latency_model *model, u64 base, u64 slope) { ++ struct latency_model_params *old_params, *new_params; ++ unsigned long flags; ++ ++ new_params = kzalloc(sizeof(*new_params), GFP_KERNEL); ++ if (!new_params) ++ return; ++ ++ spin_lock_irqsave(&model->update_lock, flags); ++ ++ old_params = rcu_dereference_protected(model->params, ++ lockdep_is_held(&model->update_lock)); ++ ++ new_params->last_update_jiffies = jiffies; ++ ++ // Initialize base and its statistics as a single sample. ++ new_params->base = base; ++ new_params->small_sum_delay = base; ++ new_params->small_count = 1; ++ ++ // Initialize slope and its statistics as a single sample. 
++ new_params->slope = slope; ++ new_params->large_sum_delay = slope; ++ new_params->large_sum_bsize = 1024; /* Corresponds to 1 KiB */ ++ ++ lm_reset_pcpu_buckets(model); ++ ++ rcu_assign_pointer(model->params, new_params); ++ spin_unlock_irqrestore(&model->update_lock, flags); ++ ++ kfree_rcu(old_params, rcu); ++} ++ ++// Define sysfs attributes for operation types ++#define SYSFS_OPTYPE_DECL(name, optype) \ ++static ssize_t adios_lat_model_##name##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ struct latency_model *model = &ad->latency_model[optype]; \ ++ struct latency_model_params *params; \ ++ ssize_t len = 0; \ ++ u64 base, slope; \ ++ rcu_read_lock(); \ ++ params = rcu_dereference(model->params); \ ++ base = params->base; \ ++ slope = params->slope; \ ++ rcu_read_unlock(); \ ++ len += sprintf(page, "base : %llu ns\n", base); \ ++ len += sprintf(page + len, "slope: %llu ns/KiB\n", slope); \ ++ return len; \ ++} \ ++static ssize_t adios_lat_model_##name##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ struct latency_model *model = &ad->latency_model[optype]; \ ++ u64 base, slope; \ ++ int ret; \ ++ ret = sscanf(page, "%llu %llu", &base, &slope); \ ++ if (ret != 2) \ ++ return -EINVAL; \ ++ sideload_latency_model(model, base, slope); \ ++ reset_buckets(ad->aggr_buckets); \ ++ return count; \ ++} \ ++static ssize_t adios_lat_target_##name##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ return sprintf(page, "%llu\n", ad->latency_target[optype]); \ ++} \ ++static ssize_t adios_lat_target_##name##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ unsigned long nsec; \ ++ int ret; \ ++ ret = kstrtoul(page, 10, &nsec); \ ++ if (ret) \ ++ return ret; \ ++ sideload_latency_model(&ad->latency_model[optype], 0, 0); \ ++ ad->latency_target[optype] = nsec; \ ++ return count; \ ++} \ ++static ssize_t adios_batch_limit_##name##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ return sprintf(page, "%u\n", ad->batch_limit[optype]); \ ++} \ ++static ssize_t adios_batch_limit_##name##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ unsigned long max_batch; \ ++ int ret; \ ++ ret = kstrtoul(page, 10, &max_batch); \ ++ if (ret || max_batch == 0) \ ++ return -EINVAL; \ ++ struct adios_data *ad = e->elevator_data; \ ++ ad->batch_limit[optype] = max_batch; \ ++ return count; \ ++} ++ ++SYSFS_OPTYPE_DECL(read, ADIOS_READ); ++SYSFS_OPTYPE_DECL(write, ADIOS_WRITE); ++SYSFS_OPTYPE_DECL(discard, ADIOS_DISCARD); ++ ++// Show the maximum batch size actually achieved for each operation type ++static ssize_t adios_batch_actual_max_show( ++ struct elevator_queue *e, char *page) { ++ struct adios_data *ad = e->elevator_data; ++ u32 total_count, read_count, write_count, discard_count; ++ ++ total_count = ad->batch_actual_max_total; ++ read_count = ad->batch_actual_max_size[ADIOS_READ]; ++ write_count = ad->batch_actual_max_size[ADIOS_WRITE]; ++ discard_count = ad->batch_actual_max_size[ADIOS_DISCARD]; ++ ++ return sprintf(page, ++ "Total : %u\nDiscard: %u\nRead : %u\nWrite : %u\n", ++ total_count, discard_count, read_count, write_count); ++} ++ ++#define SYSFS_ULL_DECL(field, min_val, max_val) \ ++static ssize_t adios_##field##_show( \ ++ struct elevator_queue *e, char *page) { 
\ ++ struct adios_data *ad = e->elevator_data; \ ++ return sprintf(page, "%llu\n", ad->field); \ ++} \ ++static ssize_t adios_##field##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ unsigned long val; \ ++ int ret; \ ++ ret = kstrtoul(page, 10, &val); \ ++ if (ret || val < (min_val) || val > (max_val)) \ ++ return -EINVAL; \ ++ ad->field = val; \ ++ return count; \ ++} ++ ++SYSFS_ULL_DECL(global_latency_window, 0, ULLONG_MAX) ++SYSFS_ULL_DECL(compliance_flags, 0, ULLONG_MAX) ++ ++#define SYSFS_INT_DECL(field, min_val, max_val) \ ++static ssize_t adios_##field##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ return sprintf(page, "%d\n", ad->field); \ ++} \ ++static ssize_t adios_##field##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ int val; \ ++ int ret; \ ++ ret = kstrtoint(page, 10, &val); \ ++ if (ret || val < (min_val) || val > (max_val)) \ ++ return -EINVAL; \ ++ ad->field = val; \ ++ return count; \ ++} ++ ++SYSFS_INT_DECL(bq_refill_below_ratio, 0, 100) ++SYSFS_INT_DECL(lat_model_latency_limit, 0, 2*NSEC_PER_SEC) ++SYSFS_INT_DECL(batch_order, ADIOS_BO_OPTYPE, !!ad->is_rotational) ++ ++// Show the read priority ++static ssize_t adios_read_priority_show( ++ struct elevator_queue *e, char *page) { ++ struct adios_data *ad = e->elevator_data; ++ return sprintf(page, "%d\n", ad->dl_prio[0]); ++} ++ ++// Set the read priority ++static ssize_t adios_read_priority_store( ++ struct elevator_queue *e, const char *page, size_t count) { ++ struct adios_data *ad = e->elevator_data; ++ int prio; ++ int ret; ++ ++ ret = kstrtoint(page, 10, &prio); ++ if (ret || prio < -20 || prio > 19) ++ return -EINVAL; ++ ++ guard(spinlock_irqsave)(&ad->lock); ++ ad->dl_prio[0] = prio; ++ ad->dl_bias = 0; ++ ++ return count; ++} ++ ++// Reset batch queue statistics ++static ssize_t adios_reset_bq_stats_store( ++ struct elevator_queue *e, const char *page, size_t count) { ++ struct adios_data *ad = e->elevator_data; ++ unsigned long val; ++ int ret; ++ ++ ret = kstrtoul(page, 10, &val); ++ if (ret || val != 1) ++ return -EINVAL; ++ ++ for (u8 i = 0; i < ADIOS_OPTYPES; i++) ++ ad->batch_actual_max_size[i] = 0; ++ ++ ad->batch_actual_max_total = 0; ++ ++ return count; ++} ++ ++// Reset the latency model parameters or load them from user input ++static ssize_t adios_reset_lat_model_store( ++ struct elevator_queue *e, const char *page, size_t count) ++{ ++ struct adios_data *ad = e->elevator_data; ++ struct latency_model *model; ++ int ret; ++ ++ /* ++ * Differentiate between two modes based on input format: ++ * 1. "1": Fully reset the model (backward compatibility). ++ * 2. "R_base R_slope W_base W_slope D_base D_slope": Load values. ++ */ ++ if (!strchr(page, ' ')) { ++ // Mode 1: Full reset. ++ unsigned long val; ++ ++ ret = kstrtoul(page, 10, &val); ++ if (ret || val != 1) ++ return -EINVAL; ++ ++ for (u8 i = 0; i < ADIOS_OPTYPES; i++) { ++ model = &ad->latency_model[i]; ++ sideload_latency_model(model, 0, 0); ++ } ++ } else { ++ // Mode 2: Load initial values for all latency models. 
++ u64 params[3][2]; /* 0:base, 1:slope for R, W, D */ ++ ++ ret = sscanf(page, "%llu %llu %llu %llu %llu %llu", ++ &params[ADIOS_READ ][0], &params[ADIOS_READ ][1], ++ &params[ADIOS_WRITE ][0], &params[ADIOS_WRITE ][1], ++ &params[ADIOS_DISCARD][0], &params[ADIOS_DISCARD][1]); ++ ++ if (ret != 6) ++ return -EINVAL; ++ ++ for (u8 i = ADIOS_READ; i <= ADIOS_DISCARD; i++) { ++ model = &ad->latency_model[i]; ++ sideload_latency_model(model, params[i][0], params[i][1]); ++ } ++ } ++ reset_buckets(ad->aggr_buckets); ++ ++ return count; ++} ++ ++// Show the ADIOS version ++static ssize_t adios_version_show(struct elevator_queue *e, char *page) { ++ return sprintf(page, "%s\n", ADIOS_VERSION); ++} ++ ++// Define sysfs attributes for dynamic thresholds ++#define SHRINK_THRESHOLD_ATTR_RW(name, model_field, min_value, max_value) \ ++static ssize_t adios_shrink_##name##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ unsigned long val; \ ++ int ret; \ ++ ret = kstrtoul(page, 10, &val); \ ++ if (ret || val < min_value || val > max_value) \ ++ return -EINVAL; \ ++ for (u8 i = 0; i < ADIOS_OPTYPES; i++) { \ ++ struct latency_model *model = &ad->latency_model[i]; \ ++ unsigned long flags; \ ++ spin_lock_irqsave(&model->update_lock, flags); \ ++ model->model_field = val; \ ++ spin_unlock_irqrestore(&model->update_lock, flags); \ ++ } \ ++ return count; \ ++} \ ++static ssize_t adios_shrink_##name##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ u32 val = 0; \ ++ unsigned long flags; \ ++ struct latency_model *model = &ad->latency_model[0]; \ ++ spin_lock_irqsave(&model->update_lock, flags); \ ++ val = model->model_field; \ ++ spin_unlock_irqrestore(&model->update_lock, flags); \ ++ return sprintf(page, "%u\n", val); \ ++} ++ ++SHRINK_THRESHOLD_ATTR_RW(at_kreqs, lm_shrink_at_kreqs, 1, 100000) ++SHRINK_THRESHOLD_ATTR_RW(at_gbytes, lm_shrink_at_gbytes, 1, 1000) ++SHRINK_THRESHOLD_ATTR_RW(resist, lm_shrink_resist, 1, 3) ++ ++// Define sysfs attributes ++#define AD_ATTR(name, show_func, store_func) \ ++ __ATTR(name, 0644, show_func, store_func) ++#define AD_ATTR_RW(name) \ ++ __ATTR(name, 0644, adios_##name##_show, adios_##name##_store) ++#define AD_ATTR_RO(name) \ ++ __ATTR(name, 0444, adios_##name##_show, NULL) ++#define AD_ATTR_WO(name) \ ++ __ATTR(name, 0200, NULL, adios_##name##_store) ++ ++// Define sysfs attributes for ADIOS scheduler ++static struct elv_fs_entry adios_sched_attrs[] = { ++ AD_ATTR_RO(batch_actual_max), ++ AD_ATTR_RW(bq_refill_below_ratio), ++ AD_ATTR_RW(global_latency_window), ++ AD_ATTR_RW(lat_model_latency_limit), ++ AD_ATTR_RW(batch_order), ++ AD_ATTR_RW(compliance_flags), ++ ++ AD_ATTR_RW(batch_limit_read), ++ AD_ATTR_RW(batch_limit_write), ++ AD_ATTR_RW(batch_limit_discard), ++ ++ AD_ATTR_RW(lat_model_read), ++ AD_ATTR_RW(lat_model_write), ++ AD_ATTR_RW(lat_model_discard), ++ ++ AD_ATTR_RW(lat_target_read), ++ AD_ATTR_RW(lat_target_write), ++ AD_ATTR_RW(lat_target_discard), ++ ++ AD_ATTR_RW(shrink_at_kreqs), ++ AD_ATTR_RW(shrink_at_gbytes), ++ AD_ATTR_RW(shrink_resist), ++ ++ AD_ATTR_RW(read_priority), ++ ++ AD_ATTR_WO(reset_bq_stats), ++ AD_ATTR_WO(reset_lat_model), ++ AD_ATTR(adios_version, adios_version_show, NULL), ++ ++ __ATTR_NULL ++}; ++ ++// Define the ADIOS scheduler type ++static struct elevator_type mq_adios = { ++ .ops = { ++ .next_request = elv_rb_latter_request, ++ .former_request = elv_rb_former_request, ++ .limit_depth = adios_limit_depth, ++ .depth_updated =
adios_depth_updated, ++ .request_merged = adios_request_merged, ++ .requests_merged = adios_merged_requests, ++ .bio_merge = adios_bio_merge, ++ .insert_requests = adios_insert_requests, ++ .prepare_request = adios_prepare_request, ++ .dispatch_request = adios_dispatch_request, ++ .completed_request = adios_completed_request, ++ .finish_request = adios_finish_request, ++ .has_work = adios_has_work, ++ .init_hctx = adios_init_hctx, ++ .init_sched = adios_init_sched, ++ .exit_sched = adios_exit_sched, ++ }, ++ .elevator_attrs = adios_sched_attrs, ++ .elevator_name = "adios", ++ .elevator_owner = THIS_MODULE, ++}; ++MODULE_ALIAS("mq-adios-iosched"); ++ ++#define ADIOS_PROGNAME "Adaptive Deadline I/O Scheduler" ++#define ADIOS_AUTHOR "Masahito Suzuki" ++ ++// Initialize the ADIOS scheduler module ++static int __init adios_init(void) { ++ printk(KERN_INFO "%s %s by %s\n", ++ ADIOS_PROGNAME, ADIOS_VERSION, ADIOS_AUTHOR); ++ return elv_register(&mq_adios); ++} ++ ++// Exit the ADIOS scheduler module ++static void __exit adios_exit(void) { ++ elv_unregister(&mq_adios); ++} ++ ++module_init(adios_init); ++module_exit(adios_exit); ++ ++MODULE_AUTHOR(ADIOS_AUTHOR); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION(ADIOS_PROGNAME); +\ No newline at end of file +diff --git a/block/elevator.c b/block/elevator.c +index fe96c6f4753c..7b4f2913841f 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -752,6 +752,21 @@ void elevator_set_default(struct request_queue *q) + if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return; + ++#ifdef CONFIG_MQ_IOSCHED_DEFAULT_ADIOS ++ ctx.name = "adios"; ++#else // !CONFIG_MQ_IOSCHED_DEFAULT_ADIOS ++ bool is_sq = q->nr_hw_queues == 1 || blk_mq_is_shared_tags(q->tag_set->flags); ++#ifdef CONFIG_CACHY ++#ifdef CONFIG_IOSCHED_BFQ ++ if (is_sq) ++ ctx.name = "bfq"; ++#endif /* CONFIG_IOSCHED_BFQ */ ++#else ++ if (!is_sq) ++ return; ++#endif /* CONFIG_CACHY */ ++#endif /* CONFIG_MQ_IOSCHED_DEFAULT_ADIOS */ ++ + /* + * For single queue devices, default to using mq-deadline. 
If we + * have multiple queues or mq-deadline is not available, default +@@ -761,13 +776,10 @@ void elevator_set_default(struct request_queue *q) + if (!e) + return; + +- if ((q->nr_hw_queues == 1 || +- blk_mq_is_shared_tags(q->tag_set->flags))) { +- err = elevator_change(q, &ctx); +- if (err < 0) +- pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", +- ctx.name, err); +- } ++ err = elevator_change(q, &ctx); ++ if (err < 0) ++ pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", ++ ctx.name, err); + elevator_put(e); + } + +diff --git a/drivers/Makefile b/drivers/Makefile +index b5749cf67044..5beba9f57254 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -64,14 +64,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb depends on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_LIBNVDIMM) += nvdimm/ +@@ -83,6 +77,13 @@ obj-y += macintosh/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb depends on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c +index 7a7f88b3fa2b..cb26ab099da2 100644 +--- a/drivers/ata/ahci.c ++++ b/drivers/ata/ahci.c +@@ -1672,7 +1672,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) + } + #endif + +-static void ahci_remap_check(struct pci_dev *pdev, int bar, ++static int ahci_remap_check(struct pci_dev *pdev, int bar, + struct ahci_host_priv *hpriv) + { + int i; +@@ -1685,7 +1685,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, + pci_resource_len(pdev, bar) < SZ_512K || + bar != AHCI_PCI_BAR_STANDARD || + !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) +- return; ++ return 0; + + cap = readq(hpriv->mmio + AHCI_REMAP_CAP); + for (i = 0; i < AHCI_MAX_REMAP; i++) { +@@ -1700,18 +1700,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, + } + + if (!hpriv->remapped_nvme) +- return; +- +- dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n", +- hpriv->remapped_nvme); +- dev_warn(&pdev->dev, +- "Switch your BIOS from RAID to AHCI mode to use them.\n"); ++ return 0; + +- /* +- * Don't rely on the msi-x capability in the remap case, +- * share the legacy interrupt across ahci and remapped devices. 
+- */ +- hpriv->flags |= AHCI_HFLAG_NO_MSI; ++ /* Abort probe, allowing intel-nvme-remap to step in when available */ ++ dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n"); ++ return -ENODEV; + } + + static int ahci_get_irq_vector(struct ata_host *host, int port) +@@ -1975,7 +1968,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) + return -ENOMEM; + + /* detect remapped nvme devices */ +- ahci_remap_check(pdev, ahci_pci_bar, hpriv); ++ rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv); ++ if (rc) ++ return rc; + + sysfs_add_file_to_group(&pdev->dev.kobj, + &dev_attr_remapped_nvme.attr, +diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 +index 2c5c228408bf..918e2bebfe78 100644 +--- a/drivers/cpufreq/Kconfig.x86 ++++ b/drivers/cpufreq/Kconfig.x86 +@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE + select ACPI_PROCESSOR if ACPI + select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO + select CPU_FREQ_GOV_PERFORMANCE +- select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver provides a P state for Intel core processors. + The driver implements an internal governor and will become +@@ -39,7 +38,6 @@ config X86_AMD_PSTATE + depends on X86 && ACPI + select ACPI_PROCESSOR + select ACPI_CPPC_LIB if X86_64 +- select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver adds a CPUFreq driver which utilizes a fine grain + processor performance frequency control range instead of legacy +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index f366d35c5840..a04b6bfeb1c2 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -3950,6 +3950,8 @@ static int __init intel_pstate_setup(char *str) + + if (!strcmp(str, "disable")) + no_load = 1; ++ else if (!strcmp(str, "enable")) ++ no_load = 0; + else if (!strcmp(str, "active")) + default_driver = &intel_pstate; + else if (!strcmp(str, "passive")) +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index ef3af170dda4..cf918b18db53 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -163,6 +163,7 @@ struct amdgpu_watchdog_timer { + */ + extern int amdgpu_modeset; + extern unsigned int amdgpu_vram_limit; ++extern int amdgpu_ignore_min_pcap; + extern int amdgpu_vis_vram_limit; + extern int amdgpu_gart_size; + extern int amdgpu_gtt_size; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +index 395c6be901ce..fb1607b2805a 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -148,6 +148,7 @@ enum AMDGPU_DEBUG_MASK { + }; + + unsigned int amdgpu_vram_limit = UINT_MAX; ++int amdgpu_ignore_min_pcap = 0; /* do not ignore by default */ + int amdgpu_vis_vram_limit; + int amdgpu_gart_size = -1; /* auto */ + int amdgpu_gtt_size = -1; /* auto */ +@@ -269,6 +270,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { + .period = 0x0, /* default to 0x0 (timeout disable) */ + }; + ++/** ++ * DOC: ignore_min_pcap (int) ++ * Ignore the minimum power cap. ++ * Useful on graphics cards where the minimum power cap is very high. ++ * The default is 0 (Do not ignore). ++ */ ++MODULE_PARM_DESC(ignore_min_pcap, "Ignore the minimum power cap"); ++module_param_named(ignore_min_pcap, amdgpu_ignore_min_pcap, int, 0600); ++ + /** + * DOC: vramlimit (int) + * Restrict the total amount of VRAM in MiB for testing. The default is 0 (Use full VRAM). 
+diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig +index abd3b6564373..46937e6fa78d 100644 +--- a/drivers/gpu/drm/amd/display/Kconfig ++++ b/drivers/gpu/drm/amd/display/Kconfig +@@ -56,4 +56,10 @@ config DRM_AMD_SECURE_DISPLAY + This option enables the calculation of crc of specific region via + debugfs. Cooperate with specific DMCU FW. + ++config AMD_PRIVATE_COLOR ++ bool "Enable KMS color management by AMD for AMD" ++ default n ++ help ++ This option extends the KMS color management API with AMD driver-specific properties to enhance the color management support on AMD Steam Deck. ++ + endmenu +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index a0ca3b2c6bd8..c4ea09496f95 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -4675,7 +4675,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) + return r; + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + if (amdgpu_dm_create_color_properties(adev)) { + dc_state_release(state->context); + kfree(state); +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +index ebabfe3a512f..4d3ebcaacca1 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +@@ -97,7 +97,7 @@ static inline struct fixed31_32 amdgpu_dm_fixpt_from_s3132(__u64 x) + return val; + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + /* Pre-defined Transfer Functions (TF) + * + * AMD driver supports pre-defined mathematical functions for transferring +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +index 45feb404b097..ee8672919a05 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +@@ -491,7 +491,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) + } + #endif + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + /** + * dm_crtc_additional_color_mgmt - enable additional color properties + * @crtc: DRM CRTC +@@ -573,7 +573,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { + #if defined(CONFIG_DEBUG_FS) + .late_register = amdgpu_dm_crtc_late_register, + #endif +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + .atomic_set_property = amdgpu_dm_atomic_crtc_set_property, + .atomic_get_property = amdgpu_dm_atomic_crtc_get_property, + #endif +@@ -770,7 +770,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, + + drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + dm_crtc_additional_color_mgmt(&acrtc->base); + #endif + return 0; +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +index eef51652ca35..d5c932c191b2 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +@@ -1601,7 +1601,7 @@ static void amdgpu_dm_plane_drm_plane_destroy_state(struct drm_plane *plane, + drm_atomic_helper_plane_destroy_state(plane, state); + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + static void + dm_atomic_plane_attach_color_mgmt_properties(struct 
amdgpu_display_manager *dm, + struct drm_plane *plane) +@@ -1792,7 +1792,7 @@ static const struct drm_plane_funcs dm_plane_funcs = { + .atomic_duplicate_state = amdgpu_dm_plane_drm_plane_duplicate_state, + .atomic_destroy_state = amdgpu_dm_plane_drm_plane_destroy_state, + .format_mod_supported = amdgpu_dm_plane_format_mod_supported, +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + .atomic_set_property = dm_atomic_plane_set_property, + .atomic_get_property = dm_atomic_plane_get_property, + #endif +@@ -1888,7 +1888,7 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, + else + drm_plane_helper_add(plane, &dm_plane_helper_funcs); + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + dm_atomic_plane_attach_color_mgmt_properties(dm, plane); + #endif + /* Create (reset) the plane state */ +diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c +index 5fbfe7333b54..9e81953043be 100644 +--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c ++++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c +@@ -3073,6 +3073,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev, + struct device_attribute *attr, + char *buf) + { ++ if (amdgpu_ignore_min_pcap) ++ return sysfs_emit(buf, "%i\n", 0); ++ + return amdgpu_hwmon_show_power_cap_generic(dev, attr, buf, PP_PWR_LIMIT_MIN); + } + +diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +index b47cb4a5f488..f9f6b0d96f97 100644 +--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +@@ -2921,7 +2921,10 @@ int smu_get_power_limit(void *handle, + *limit = smu->max_power_limit; + break; + case SMU_PPT_LIMIT_MIN: +- *limit = smu->min_power_limit; ++ if (amdgpu_ignore_min_pcap) ++ *limit = 0; ++ else ++ *limit = smu->min_power_limit; + break; + default: + return -EINVAL; +@@ -2945,7 +2948,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) + if (smu->ppt_funcs->set_power_limit) + return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); + +- if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { ++ if (amdgpu_ignore_min_pcap) { ++ if ((limit > smu->max_power_limit)) { ++ dev_err(smu->adev->dev, ++ "New power limit (%d) is over the max allowed %d\n", ++ limit, smu->max_power_limit); ++ return -EINVAL; ++ } ++ } else if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { + dev_err(smu->adev->dev, + "New power limit (%d) is out of range [%d,%d]\n", + limit, smu->min_power_limit, smu->max_power_limit); +diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c +index 90ff6be85cf4..15159c1cf6e1 100644 +--- a/drivers/input/evdev.c ++++ b/drivers/input/evdev.c +@@ -46,6 +46,7 @@ struct evdev_client { + struct fasync_struct *fasync; + struct evdev *evdev; + struct list_head node; ++ struct rcu_head rcu; + enum input_clock_type clk_type; + bool revoked; + unsigned long *evmasks[EV_CNT]; +@@ -368,13 +369,22 @@ static void evdev_attach_client(struct evdev *evdev, + spin_unlock(&evdev->client_lock); + } + ++static void evdev_reclaim_client(struct rcu_head *rp) ++{ ++ struct evdev_client *client = container_of(rp, struct evdev_client, rcu); ++ unsigned int i; ++ for (i = 0; i < EV_CNT; ++i) ++ bitmap_free(client->evmasks[i]); ++ kvfree(client); ++} ++ + static void evdev_detach_client(struct evdev *evdev, + struct evdev_client *client) + { + spin_lock(&evdev->client_lock); + list_del_rcu(&client->node); + spin_unlock(&evdev->client_lock); +- synchronize_rcu(); ++ 
call_rcu(&client->rcu, evdev_reclaim_client); + } + + static int evdev_open_device(struct evdev *evdev) +@@ -427,7 +437,6 @@ static int evdev_release(struct inode *inode, struct file *file) + { + struct evdev_client *client = file->private_data; + struct evdev *evdev = client->evdev; +- unsigned int i; + + mutex_lock(&evdev->mutex); + +@@ -439,11 +448,6 @@ static int evdev_release(struct inode *inode, struct file *file) + + evdev_detach_client(evdev, client); + +- for (i = 0; i < EV_CNT; ++i) +- bitmap_free(client->evmasks[i]); +- +- kvfree(client); +- + evdev_close_device(evdev); + + return 0; +@@ -486,7 +490,6 @@ static int evdev_open(struct inode *inode, struct file *file) + + err_free_client: + evdev_detach_client(evdev, client); +- kvfree(client); + return error; + } + +diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c +index 5ef43231fe77..5d754058c023 100644 +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -3305,6 +3305,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) + goto bad; + } + ++#ifdef CONFIG_CACHY ++ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); ++ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); ++#endif ++ + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; +diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig +index 331b8e535e5b..80dabeebf580 100644 +--- a/drivers/media/v4l2-core/Kconfig ++++ b/drivers/media/v4l2-core/Kconfig +@@ -40,6 +40,11 @@ config VIDEO_TUNER + config V4L2_JPEG_HELPER + tristate + ++config V4L2_LOOPBACK ++ tristate "V4L2 loopback device" ++ help ++ V4L2 loopback device ++ + # Used by drivers that need v4l2-h264.ko + config V4L2_H264 + tristate +diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile +index 2177b9d63a8f..c179507cedc4 100644 +--- a/drivers/media/v4l2-core/Makefile ++++ b/drivers/media/v4l2-core/Makefile +@@ -33,5 +33,7 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o + obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o + obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o + ++obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o ++ + obj-$(CONFIG_VIDEO_TUNER) += tuner.o + obj-$(CONFIG_VIDEO_DEV) += v4l2-dv-timings.o videodev.o +diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c +new file mode 100644 +index 000000000000..3be7c4abc1e7 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.c +@@ -0,0 +1,3316 @@ ++/* -*- c-file-style: "linux" -*- */ ++/* ++ * v4l2loopback.c -- video4linux2 loopback driver ++ * ++ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com) ++ * Copyright (C) 2010-2023 IOhannes m zmoelnig (zmoelnig@iem.at) ++ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de) ++ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com) ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "v4l2loopback.h" ++ ++#define V4L2LOOPBACK_CTL_ADD_legacy 0x4C80 ++#define V4L2LOOPBACK_CTL_REMOVE_legacy 0x4C81 ++#define V4L2LOOPBACK_CTL_QUERY_legacy 0x4C82 ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) ++#error This module is not supported on kernels before 4.0.0. ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) ++#define strscpy strlcpy ++#endif ++ ++#if defined(timer_setup) ++#define HAVE_TIMER_SETUP ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) ++#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 2, 0) ++#define timer_delete_sync del_timer_sync ++#endif ++ ++#define V4L2LOOPBACK_VERSION_CODE \ ++ KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \ ++ V4L2LOOPBACK_VERSION_BUGFIX) ++ ++MODULE_DESCRIPTION("V4L2 loopback video device"); ++MODULE_AUTHOR("Vasily Levin, " ++ "IOhannes m zmoelnig ," ++ "Stefan Diewald," ++ "Anton Novikov" ++ "et al."); ++#ifdef SNAPSHOT_VERSION ++MODULE_VERSION(__stringify(SNAPSHOT_VERSION)); ++#else ++MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify( ++ V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX)); ++#endif ++MODULE_LICENSE("GPL"); ++ ++/* ++ * helpers ++ */ ++#define dprintk(fmt, args...) \ ++ do { \ ++ if (debug > 0) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++#define MARK() \ ++ do { \ ++ if (debug > 1) { \ ++ printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \ ++ __LINE__, __func__, task_pid_nr(current)); \ ++ } \ ++ } while (0) ++ ++#define dprintkrw(fmt, args...) 
\ ++ do { \ ++ if (debug > 2) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++static inline void v4l2l_get_timestamp(struct v4l2_buffer *b) ++{ ++ struct timespec64 ts; ++ ktime_get_ts64(&ts); ++ ++ b->timestamp.tv_sec = ts.tv_sec; ++ b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC); ++ b->flags |= V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; ++ b->flags &= ~V4L2_BUF_FLAG_TIMESTAMP_COPY; ++} ++ ++#if BITS_PER_LONG == 32 ++#include /* do_div() for 64bit division */ ++static inline int v4l2l_mod64(const s64 A, const u32 B) ++{ ++ u64 a = (u64)A; ++ u32 b = B; ++ ++ if (A > 0) ++ return do_div(a, b); ++ a = -A; ++ return -do_div(a, b); ++} ++#else ++static inline int v4l2l_mod64(const s64 A, const u32 B) ++{ ++ return A % B; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) ++typedef unsigned __poll_t; ++#endif ++ ++/* module constants ++ * can be overridden during he build process using something like ++ * make KCPPFLAGS="-DMAX_DEVICES=100" ++ */ ++ ++/* maximum number of v4l2loopback devices that can be created */ ++#ifndef MAX_DEVICES ++#define MAX_DEVICES 8 ++#endif ++ ++/* whether the default is to announce capabilities exclusively or not */ ++#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0 ++#endif ++ ++/* when a producer is considered to have gone stale */ ++#ifndef MAX_TIMEOUT ++#define MAX_TIMEOUT (100 * 1000) /* in msecs */ ++#endif ++ ++/* max buffers that can be mapped, actually they ++ * are all mapped to max_buffers buffers */ ++#ifndef MAX_BUFFERS ++#define MAX_BUFFERS 32 ++#endif ++ ++/* module parameters */ ++static int debug = 0; ++module_param(debug, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)"); ++ ++#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2 ++static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS; ++module_param(max_buffers, int, S_IRUGO); ++MODULE_PARM_DESC(max_buffers, ++ "how many buffers should be allocated [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]"); ++ ++/* how many times a device can be opened ++ * the per-module default value can be overridden on a per-device basis using ++ * the /sys/devices interface ++ * ++ * note that max_openers should be at least 2 in order to get a working system: ++ * one opener for the producer and one opener for the consumer ++ * however, we leave that to the user ++ */ ++#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10 ++static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS; ++module_param(max_openers, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC( ++ max_openers, ++ "how many users can open the loopback device [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]"); ++ ++static int devices = -1; ++module_param(devices, int, 0); ++MODULE_PARM_DESC(devices, "how many devices should be created"); ++ ++static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 }; ++module_param_array(video_nr, int, NULL, 0444); ++MODULE_PARM_DESC(video_nr, ++ "video device numbers (-1=auto, 0=/dev/video0, etc.)"); ++ ++static char *card_label[MAX_DEVICES]; ++module_param_array(card_label, charp, NULL, 0000); ++MODULE_PARM_DESC(card_label, "card labels for each device"); ++ ++static bool exclusive_caps[MAX_DEVICES] = { ++ [0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++}; ++module_param_array(exclusive_caps, bool, NULL, 0444); ++/* FIXXME: wording */ ++MODULE_PARM_DESC( ++ exclusive_caps, 
++ "whether to announce OUTPUT/CAPTURE capabilities exclusively or not [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]"); ++ ++/* format specifications */ ++#define V4L2LOOPBACK_SIZE_MIN_WIDTH 2 ++#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 1 ++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192 ++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192 ++ ++#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640 ++#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480 ++ ++static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++module_param(max_width, int, S_IRUGO); ++MODULE_PARM_DESC(max_width, ++ "maximum allowed frame width [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]"); ++static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++module_param(max_height, int, S_IRUGO); ++MODULE_PARM_DESC(max_height, ++ "maximum allowed frame height [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]"); ++ ++static DEFINE_IDR(v4l2loopback_index_idr); ++static DEFINE_MUTEX(v4l2loopback_ctl_mutex); ++ ++/* frame intervals */ ++#define V4L2LOOPBACK_FRAME_INTERVAL_MAX __UINT32_MAX__ ++#define V4L2LOOPBACK_FPS_DEFAULT 30 ++#define V4L2LOOPBACK_FPS_MAX 1000 ++ ++/* control IDs */ ++#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000) ++#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0) ++#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1) ++#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2) ++#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3) ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl); ++static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = { ++ .s_ctrl = v4l2loopback_s_ctrl, ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_KEEP_FORMAT, ++ .name = "keep_format", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_SUSTAIN_FRAMERATE, ++ .name = "sustain_framerate", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT, ++ .name = "timeout", ++ .type = V4L2_CTRL_TYPE_INTEGER, ++ .min = 0, ++ .max = MAX_TIMEOUT, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT_IMAGE_IO, ++ .name = "timeout_image_io", ++ .type = V4L2_CTRL_TYPE_BUTTON, ++ .min = 0, ++ .max = 0, ++ .step = 0, ++ .def = 0, ++ // clang-format on ++}; ++ ++/* module structures */ ++struct v4l2loopback_private { ++ int device_nr; ++}; ++ ++/* TODO(vasaka) use typenames which are common to kernel, but first find out if ++ * it is needed */ ++/* struct keeping state and settings of loopback device */ ++ ++struct v4l2l_buffer { ++ struct v4l2_buffer buffer; ++ struct list_head list_head; ++ atomic_t use_count; ++}; ++ ++struct v4l2_loopback_device { ++ struct v4l2_device v4l2_dev; ++ struct v4l2_ctrl_handler ctrl_handler; ++ struct video_device *vdev; ++ ++ /* loopback device-specific parameters */ ++ char card_label[32]; ++ bool announce_all_caps; /* announce both OUTPUT and CAPTURE capabilities ++ * when true; else 
announce OUTPUT when no ++ * writer is streaming, otherwise CAPTURE. */ ++ int max_openers; /* how many times can this device be opened */ ++ int min_width, max_width; ++ int min_height, max_height; ++ ++ /* pixel and stream format */ ++ struct v4l2_pix_format pix_format; ++ bool pix_format_has_valid_sizeimage; ++ struct v4l2_captureparm capture_param; ++ unsigned long frame_jiffies; ++ ++ /* ctrls */ ++ int keep_format; /* CID_KEEP_FORMAT; lock the format, do not free ++ * on close(), and when `!announce_all_caps` do NOT ++ * fall back to OUTPUT when no writers attached (clear ++ * `keep_format` to attach a new writer) */ ++ int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain ++ (close to) nominal framerate */ ++ unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */ ++ int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will ++ * queue/dequeue the timeout image buffer */ ++ ++ /* buffers for OUTPUT and CAPTURE */ ++ u8 *image; /* pointer to actual buffers data */ ++ unsigned long image_size; /* number of bytes alloc'd for all buffers */ ++ struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */ ++ u32 buffer_count; /* should not be big, 4 is a good choice */ ++ u32 buffer_size; /* number of bytes alloc'd per buffer */ ++ u32 used_buffer_count; /* number of buffers allocated to openers */ ++ struct list_head outbufs_list; /* FIFO queue for OUTPUT buffers */ ++ u32 bufpos2index[MAX_BUFFERS]; /* mapping of `(position % used_buffers)` ++ * to `buffers[index]` */ ++ s64 write_position; /* sequence number of last 'displayed' buffer plus ++ * one */ ++ ++ /* synchronization between openers */ ++ atomic_t open_count; ++ struct mutex image_mutex; /* mutex for allocating image(s) and ++ * exchanging format tokens */ ++ spinlock_t lock; /* lock for the timeout and framerate timers */ ++ spinlock_t list_lock; /* lock for the OUTPUT buffer queue */ ++ wait_queue_head_t read_event; ++ u32 format_tokens; /* tokens to 'set format' for OUTPUT, CAPTURE, or ++ * timeout buffers */ ++ u32 stream_tokens; /* tokens to 'start' OUTPUT, CAPTURE, or timeout ++ * stream */ ++ ++ /* sustain framerate */ ++ struct timer_list sustain_timer; ++ unsigned int reread_count; ++ ++ /* timeout */ ++ u8 *timeout_image; /* copied to outgoing buffers when timeout passes */ ++ struct v4l2l_buffer timeout_buffer; ++ u32 timeout_buffer_size; /* number bytes alloc'd for timeout buffer */ ++ struct timer_list timeout_timer; ++ int timeout_happened; ++}; ++ ++enum v4l2l_io_method { ++ V4L2L_IO_NONE = 0, ++ V4L2L_IO_MMAP = 1, ++ V4L2L_IO_FILE = 2, ++ V4L2L_IO_TIMEOUT = 3, ++}; ++ ++/* struct keeping state and type of opener */ ++struct v4l2_loopback_opener { ++ u32 format_token; /* token (if any) for type used in call to S_FMT or ++ * REQBUFS */ ++ u32 stream_token; /* token (if any) for type used in call to STREAMON */ ++ u32 buffer_count; /* number of buffers (if any) that opener acquired via ++ * REQBUFS */ ++ s64 read_position; /* sequence number of the next 'captured' frame */ ++ unsigned int reread_count; ++ enum v4l2l_io_method io_method; ++ ++ struct v4l2_fh fh; ++}; ++ ++#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh) ++ ++/* this is heavily inspired by the bttv driver found in the linux kernel */ ++struct v4l2l_format { ++ char *name; ++ int fourcc; /* video4linux 2 */ ++ int depth; /* bit/pixel */ ++ int flags; ++}; ++/* set the v4l2l_format.flags to PLANAR for non-packed formats */ ++#define FORMAT_FLAGS_PLANAR 0x01 ++#define 
FORMAT_FLAGS_COMPRESSED 0x02 ++ ++#include "v4l2loopback_formats.h" ++ ++#ifndef V4L2_TYPE_IS_CAPTURE ++#define V4L2_TYPE_IS_CAPTURE(type) \ ++ ((type) == V4L2_BUF_TYPE_VIDEO_CAPTURE || \ ++ (type) == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) ++#endif /* V4L2_TYPE_IS_CAPTURE */ ++#ifndef V4L2_TYPE_IS_OUTPUT ++#define V4L2_TYPE_IS_OUTPUT(type) \ ++ ((type) == V4L2_BUF_TYPE_VIDEO_OUTPUT || \ ++ (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) ++#endif /* V4L2_TYPE_IS_OUTPUT */ ++ ++/* token values for privilege to set format or start/stop stream */ ++#define V4L2L_TOKEN_CAPTURE 0x01 ++#define V4L2L_TOKEN_OUTPUT 0x02 ++#define V4L2L_TOKEN_TIMEOUT 0x04 ++#define V4L2L_TOKEN_MASK \ ++ (V4L2L_TOKEN_CAPTURE | V4L2L_TOKEN_OUTPUT | V4L2L_TOKEN_TIMEOUT) ++ ++/* helpers for token exchange and token status */ ++#define token_from_type(type) \ ++ (V4L2_TYPE_IS_CAPTURE(type) ? V4L2L_TOKEN_CAPTURE : V4L2L_TOKEN_OUTPUT) ++#define acquire_token(dev, opener, label, token) \ ++ do { \ ++ (opener)->label##_token = token; \ ++ (dev)->label##_tokens &= ~token; \ ++ } while (0) ++#define release_token(dev, opener, label) \ ++ do { \ ++ (dev)->label##_tokens |= (opener)->label##_token; \ ++ (opener)->label##_token = 0; \ ++ } while (0) ++#define has_output_token(token) (token & V4L2L_TOKEN_OUTPUT) ++#define has_capture_token(token) (token & V4L2L_TOKEN_CAPTURE) ++#define has_no_owners(dev) ((~((dev)->format_tokens) & V4L2L_TOKEN_MASK) == 0) ++#define has_other_owners(opener, dev) \ ++ (~((dev)->format_tokens ^ (opener)->format_token) & V4L2L_TOKEN_MASK) ++#define need_timeout_buffer(dev, token) \ ++ ((dev)->timeout_jiffies > 0 || (token) & V4L2L_TOKEN_TIMEOUT) ++ ++static const unsigned int FORMATS = ARRAY_SIZE(formats); ++ ++static char *fourcc2str(unsigned int fourcc, char buf[5]) ++{ ++ buf[0] = (fourcc >> 0) & 0xFF; ++ buf[1] = (fourcc >> 8) & 0xFF; ++ buf[2] = (fourcc >> 16) & 0xFF; ++ buf[3] = (fourcc >> 24) & 0xFF; ++ buf[4] = 0; ++ ++ return buf; ++} ++ ++static const struct v4l2l_format *format_by_fourcc(int fourcc) ++{ ++ unsigned int i; ++ char buf[5]; ++ ++ for (i = 0; i < FORMATS; i++) { ++ if (formats[i].fourcc == fourcc) ++ return formats + i; ++ } ++ ++ dprintk("unsupported format '%4s'\n", fourcc2str(fourcc, buf)); ++ return NULL; ++} ++ ++static void pix_format_set_size(struct v4l2_pix_format *f, ++ const struct v4l2l_format *fmt, ++ unsigned int width, unsigned int height) ++{ ++ f->width = width; ++ f->height = height; ++ ++ if (fmt->flags & FORMAT_FLAGS_PLANAR) { ++ f->bytesperline = width; /* Y plane */ ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) { ++ /* doesn't make sense for compressed formats */ ++ f->bytesperline = 0; ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else { ++ f->bytesperline = (width * fmt->depth) >> 3; ++ f->sizeimage = height * f->bytesperline; ++ } ++} ++ ++static int v4l2l_fill_format(struct v4l2_format *fmt, const u32 minwidth, ++ const u32 maxwidth, const u32 minheight, ++ const u32 maxheight) ++{ ++ u32 width = fmt->fmt.pix.width, height = fmt->fmt.pix.height; ++ u32 pixelformat = fmt->fmt.pix.pixelformat; ++ struct v4l2_format fmt0 = *fmt; ++ u32 bytesperline = 0, sizeimage = 0; ++ ++ if (!width) ++ width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; ++ if (!height) ++ height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; ++ width = clamp_val(width, minwidth, maxwidth); ++ height = clamp_val(height, minheight, maxheight); ++ ++ /* sets: width,height,pixelformat,bytesperline,sizeimage */ ++ if 
(!(V4L2_TYPE_IS_MULTIPLANAR(fmt0.type))) { ++ fmt0.fmt.pix.bytesperline = 0; ++ fmt0.fmt.pix.sizeimage = 0; ++ } ++ ++ if (0) { ++ ; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) ++ } else if (!v4l2_fill_pixfmt(&fmt0.fmt.pix, pixelformat, width, ++ height)) { ++ ; ++ } else if (!v4l2_fill_pixfmt_mp(&fmt0.fmt.pix_mp, pixelformat, width, ++ height)) { ++ ; ++#endif ++ } else { ++ const struct v4l2l_format *format = ++ format_by_fourcc(pixelformat); ++ if (!format) ++ return -EINVAL; ++ pix_format_set_size(&fmt0.fmt.pix, format, width, height); ++ fmt0.fmt.pix.pixelformat = format->fourcc; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt0.type)) { ++ *fmt = fmt0; ++ ++ if ((fmt->fmt.pix_mp.colorspace == V4L2_COLORSPACE_DEFAULT) || ++ (fmt->fmt.pix_mp.colorspace > V4L2_COLORSPACE_DCI_P3)) ++ fmt->fmt.pix_mp.colorspace = V4L2_COLORSPACE_SRGB; ++ if (V4L2_FIELD_ANY == fmt->fmt.pix_mp.field) ++ fmt->fmt.pix_mp.field = V4L2_FIELD_NONE; ++ } else { ++ bytesperline = fmt->fmt.pix.bytesperline; ++ sizeimage = fmt->fmt.pix.sizeimage; ++ ++ *fmt = fmt0; ++ ++ if (!fmt->fmt.pix.bytesperline) ++ fmt->fmt.pix.bytesperline = bytesperline; ++ if (!fmt->fmt.pix.sizeimage) ++ fmt->fmt.pix.sizeimage = sizeimage; ++ ++ if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) || ++ (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3)) ++ fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; ++ if (V4L2_FIELD_ANY == fmt->fmt.pix.field) ++ fmt->fmt.pix.field = V4L2_FIELD_NONE; ++ } ++ ++ return 0; ++} ++ ++/* Checks if v4l2l_fill_format() has set a valid, fixed sizeimage val. */ ++static bool v4l2l_pix_format_has_valid_sizeimage(struct v4l2_format *fmt) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) ++ const struct v4l2_format_info *info; ++ ++ info = v4l2_format_info(fmt->fmt.pix.pixelformat); ++ if (info && info->mem_planes == 1) ++ return true; ++#endif ++ ++ return false; ++} ++ ++static int pix_format_eq(const struct v4l2_pix_format *ref, ++ const struct v4l2_pix_format *tgt, int strict) ++{ ++ /* check if the two formats are equivalent. 
++ * ANY fields are handled gracefully ++ */ ++#define _pix_format_eq0(x) \ ++ if (ref->x != tgt->x) \ ++ result = 0 ++#define _pix_format_eq1(x, def) \ ++ do { \ ++ if ((def != tgt->x) && (ref->x != tgt->x)) { \ ++ printk(KERN_INFO #x " failed"); \ ++ result = 0; \ ++ } \ ++ } while (0) ++ int result = 1; ++ _pix_format_eq0(width); ++ _pix_format_eq0(height); ++ _pix_format_eq0(pixelformat); ++ if (!strict) ++ return result; ++ _pix_format_eq1(field, V4L2_FIELD_ANY); ++ _pix_format_eq0(bytesperline); ++ _pix_format_eq0(sizeimage); ++ _pix_format_eq1(colorspace, V4L2_COLORSPACE_DEFAULT); ++ return result; ++} ++ ++static void set_timeperframe(struct v4l2_loopback_device *dev, ++ struct v4l2_fract *tpf) ++{ ++ if (!tpf->denominator && !tpf->numerator) { ++ tpf->numerator = 1; ++ tpf->denominator = V4L2LOOPBACK_FPS_DEFAULT; ++ } else if (tpf->numerator > ++ V4L2LOOPBACK_FRAME_INTERVAL_MAX * tpf->denominator) { ++ /* divide-by-zero or greater than maximum interval => min FPS */ ++ tpf->numerator = V4L2LOOPBACK_FRAME_INTERVAL_MAX; ++ tpf->denominator = 1; ++ } else if (tpf->numerator * V4L2LOOPBACK_FPS_MAX < tpf->denominator) { ++ /* zero or lower than minimum interval => max FPS */ ++ tpf->numerator = 1; ++ tpf->denominator = V4L2LOOPBACK_FPS_MAX; ++ } ++ ++ dev->capture_param.timeperframe = *tpf; ++ dev->frame_jiffies = ++ max(1UL, (msecs_to_jiffies(1000) * tpf->numerator) / ++ tpf->denominator); ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd); ++ ++/* device attributes */ ++/* available via sysfs: /sys/devices/virtual/video4linux/video* */ ++ ++static ssize_t attr_show_format(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ /* gets the current format as "FOURCC:WxH@f/s", e.g. "YUYV:320x240@1000/30" */ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ const struct v4l2_fract *tpf; ++ char buf4cc[5], buf_fps[32]; ++ ++ if (!dev || (has_no_owners(dev) && !dev->keep_format)) ++ return 0; ++ tpf = &dev->capture_param.timeperframe; ++ ++ fourcc2str(dev->pix_format.pixelformat, buf4cc); ++ if (tpf->numerator == 1) ++ snprintf(buf_fps, sizeof(buf_fps), "%u", tpf->denominator); ++ else ++ snprintf(buf_fps, sizeof(buf_fps), "%u/%u", tpf->denominator, ++ tpf->numerator); ++ return sprintf(buf, "%4s:%ux%u@%s\n", buf4cc, dev->pix_format.width, ++ dev->pix_format.height, buf_fps); ++} ++ ++static ssize_t attr_store_format(struct device *cd, ++ struct device_attribute *attr, const char *buf, ++ size_t len) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ int fps_num = 0, fps_den = 1; ++ ++ if (!dev) ++ return -ENODEV; ++ ++ /* only fps changing is supported */ ++ if (sscanf(buf, "@%u/%u", &fps_num, &fps_den) > 0) { ++ struct v4l2_fract f = { .numerator = fps_den, ++ .denominator = fps_num }; ++ set_timeperframe(dev, &f); ++ return len; ++ } ++ return -EINVAL; ++} ++ ++static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format, ++ attr_store_format); ++ ++static ssize_t attr_show_buffers(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, "%u\n", dev->used_buffer_count); ++} ++ ++static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL); ++ ++static ssize_t attr_show_maxopeners(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, 
"%d\n", dev->max_openers); ++} ++ ++static ssize_t attr_store_maxopeners(struct device *cd, ++ struct device_attribute *attr, ++ const char *buf, size_t len) ++{ ++ struct v4l2_loopback_device *dev = NULL; ++ unsigned long curr = 0; ++ ++ if (kstrtoul(buf, 0, &curr)) ++ return -EINVAL; ++ ++ dev = v4l2loopback_cd2dev(cd); ++ if (!dev) ++ return -ENODEV; ++ ++ if (dev->max_openers == curr) ++ return len; ++ ++ if (curr > __INT_MAX__ || dev->open_count.counter > curr) { ++ /* request to limit to less openers as are currently attached to us */ ++ return -EINVAL; ++ } ++ ++ dev->max_openers = (int)curr; ++ ++ return len; ++} ++ ++static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners, ++ attr_store_maxopeners); ++ ++static ssize_t attr_show_state(struct device *cd, struct device_attribute *attr, ++ char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ if (!has_output_token(dev->stream_tokens) || dev->keep_format) { ++ return sprintf(buf, "capture\n"); ++ } else ++ return sprintf(buf, "output\n"); ++ ++ return -EAGAIN; ++} ++ ++static DEVICE_ATTR(state, S_IRUGO, attr_show_state, NULL); ++ ++static void v4l2loopback_remove_sysfs(struct video_device *vdev) ++{ ++#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x) ++ ++ if (vdev) { ++ V4L2_SYSFS_DESTROY(format); ++ V4L2_SYSFS_DESTROY(buffers); ++ V4L2_SYSFS_DESTROY(max_openers); ++ V4L2_SYSFS_DESTROY(state); ++ /* ... */ ++ } ++} ++ ++static void v4l2loopback_create_sysfs(struct video_device *vdev) ++{ ++ int res = 0; ++ ++#define V4L2_SYSFS_CREATE(x) \ ++ res = device_create_file(&vdev->dev, &dev_attr_##x); \ ++ if (res < 0) \ ++ break ++ if (!vdev) ++ return; ++ do { ++ V4L2_SYSFS_CREATE(format); ++ V4L2_SYSFS_CREATE(buffers); ++ V4L2_SYSFS_CREATE(max_openers); ++ V4L2_SYSFS_CREATE(state); ++ /* ... */ ++ } while (0); ++ ++ if (res >= 0) ++ return; ++ dev_err(&vdev->dev, "%s error: %d\n", __func__, res); ++} ++ ++/* Event APIs */ ++ ++#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START) ++#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000 ++#define V4L2_EVENT_PRI_CLIENT_USAGE \ ++ (V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1) ++ ++struct v4l2_event_client_usage { ++ __u32 count; ++}; ++ ++/* global module data */ ++/* find a device based on it's device-number (e.g. 
'3' for /dev/video3) */ ++struct v4l2loopback_lookup_cb_data { ++ int device_nr; ++ struct v4l2_loopback_device *device; ++}; ++static int v4l2loopback_lookup_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *device = ptr; ++ struct v4l2loopback_lookup_cb_data *cbdata = data; ++ if (cbdata && device && device->vdev) { ++ if (device->vdev->num == cbdata->device_nr) { ++ cbdata->device = device; ++ cbdata->device_nr = id; ++ return 1; ++ } ++ } ++ return 0; ++} ++static int v4l2loopback_lookup(int device_nr, ++ struct v4l2_loopback_device **device) ++{ ++ struct v4l2loopback_lookup_cb_data data = { ++ .device_nr = device_nr, ++ .device = NULL, ++ }; ++ int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb, ++ &data); ++ if (1 == err) { ++ if (device) ++ *device = data.device; ++ return data.device_nr; ++ } ++ return -ENODEV; ++} ++#define v4l2loopback_get_vdev_nr(vdev) \ ++ ((struct v4l2loopback_private *)video_get_drvdata(vdev))->device_nr ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd) ++{ ++ struct video_device *loopdev = to_video_device(cd); ++ int device_nr = v4l2loopback_get_vdev_nr(loopdev); ++ ++ return idr_find(&v4l2loopback_index_idr, device_nr); ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f) ++{ ++ struct v4l2loopback_private *ptr = video_drvdata(f); ++ int nr = ptr->device_nr; ++ ++ return idr_find(&v4l2loopback_index_idr, nr); ++} ++ ++/* forward declarations */ ++static void client_usage_queue_event(struct video_device *vdev); ++static bool any_buffers_mapped(struct v4l2_loopback_device *dev); ++static int allocate_buffers(struct v4l2_loopback_device *dev, ++ struct v4l2_pix_format *pix_format); ++static void init_buffers(struct v4l2_loopback_device *dev, u32 bytes_used, ++ u32 buffer_size); ++static void free_buffers(struct v4l2_loopback_device *dev); ++static int allocate_timeout_buffer(struct v4l2_loopback_device *dev); ++static void free_timeout_buffer(struct v4l2_loopback_device *dev); ++static void check_timers(struct v4l2_loopback_device *dev); ++static const struct v4l2_file_operations v4l2_loopback_fops; ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops; ++ ++/* V4L2 ioctl caps and params calls */ ++/* returns device capabilities ++ * called on VIDIOC_QUERYCAP ++ */ ++static int vidioc_querycap(struct file *file, void *fh, ++ struct v4l2_capability *cap) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ int device_nr = v4l2loopback_get_vdev_nr(dev->vdev); ++ __u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; ++ ++ strscpy(cap->driver, "v4l2 loopback", sizeof(cap->driver)); ++ snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label); ++ snprintf(cap->bus_info, sizeof(cap->bus_info), ++ "platform:v4l2loopback-%03d", device_nr); ++ ++ if (dev->announce_all_caps) { ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT; ++ } else { ++ if (opener->io_method == V4L2L_IO_TIMEOUT || ++ (has_output_token(dev->stream_tokens) && ++ !dev->keep_format)) { ++ capabilities |= V4L2_CAP_VIDEO_OUTPUT; ++ } else ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE; ++ } ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ dev->vdev->device_caps = ++#endif /* >=linux-4.7.0 */ ++ cap->device_caps = cap->capabilities = capabilities; ++ ++ cap->capabilities |= V4L2_CAP_DEVICE_CAPS; ++ ++ memset(cap->reserved, 0, sizeof(cap->reserved)); ++ return 0; ++} ++ ++static int 
vidioc_enum_framesizes(struct file *file, void *fh, ++ struct v4l2_frmsizeenum *argp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ /* there can be only one... */ ++ if (argp->index) ++ return -EINVAL; ++ ++ if (dev->keep_format || has_other_owners(opener, dev)) { ++ /* only current frame size supported */ ++ if (argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; ++ ++ argp->discrete.width = dev->pix_format.width; ++ argp->discrete.height = dev->pix_format.height; ++ } else { ++ /* return continuous sizes if pixel format is supported */ ++ if (NULL == format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ if (dev->min_width == dev->max_width && ++ dev->min_height == dev->max_height) { ++ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; ++ ++ argp->discrete.width = dev->min_width; ++ argp->discrete.height = dev->min_height; ++ } else { ++ argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS; ++ ++ argp->stepwise.min_width = dev->min_width; ++ argp->stepwise.min_height = dev->min_height; ++ ++ argp->stepwise.max_width = dev->max_width; ++ argp->stepwise.max_height = dev->max_height; ++ ++ argp->stepwise.step_width = 1; ++ argp->stepwise.step_height = 1; ++ } ++ } ++ return 0; ++} ++ ++/* Test if the device is currently 'capable' of the buffer (stream) type when ++ * the `exclusive_caps` parameter is set. `keep_format` should lock the format ++ * and prevent free of buffers */ ++static int check_buffer_capability(struct v4l2_loopback_device *dev, ++ struct v4l2_loopback_opener *opener, ++ enum v4l2_buf_type type) ++{ ++ /* short-circuit for (non-compliant) timeout image mode */ ++ if (opener->io_method == V4L2L_IO_TIMEOUT) ++ return 0; ++ if (dev->announce_all_caps) ++ return (type == V4L2_BUF_TYPE_VIDEO_CAPTURE || ++ type == V4L2_BUF_TYPE_VIDEO_OUTPUT) ? ++ 0 : ++ -EINVAL; ++ /* CAPTURE if opener has a capture format or a writer is streaming; ++ * else OUTPUT. */ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (!(has_capture_token(opener->format_token) || ++ !has_output_token(dev->stream_tokens))) ++ return -EINVAL; ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (!(has_output_token(opener->format_token) || ++ has_output_token(dev->stream_tokens))) ++ return -EINVAL; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++/* returns frameinterval (fps) for the set resolution ++ * called on VIDIOC_ENUM_FRAMEINTERVALS ++ */ ++static int vidioc_enum_frameintervals(struct file *file, void *fh, ++ struct v4l2_frmivalenum *argp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ /* there can be only one... 
*/ ++ if (argp->index) ++ return -EINVAL; ++ ++ if (dev->keep_format || has_other_owners(opener, dev)) { ++ /* keep_format also locks the frame rate */ ++ if (argp->width != dev->pix_format.width || ++ argp->height != dev->pix_format.height || ++ argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_DISCRETE; ++ argp->discrete = dev->capture_param.timeperframe; ++ } else { ++ if (argp->width < dev->min_width || ++ argp->width > dev->max_width || ++ argp->height < dev->min_height || ++ argp->height > dev->max_height || ++ !format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS; ++ argp->stepwise.min.numerator = 1; ++ argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX; ++ argp->stepwise.max.numerator = V4L2LOOPBACK_FRAME_INTERVAL_MAX; ++ argp->stepwise.max.denominator = 1; ++ argp->stepwise.step.numerator = 1; ++ argp->stepwise.step.denominator = 1; ++ } ++ ++ return 0; ++} ++ ++/* Enumerate device formats ++ * Returns: ++ * - EINVAL the index is out of bounds; or if non-zero when format is fixed ++ * - EFAULT unexpected null pointer */ ++static int vidioc_enum_fmt_vid(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ int fixed = dev->keep_format || has_other_owners(opener, dev); ++ const struct v4l2l_format *fmt; ++ ++ if (check_buffer_capability(dev, opener, f->type) < 0) ++ return -EINVAL; ++ ++ if (!(f->index < FORMATS)) ++ return -EINVAL; ++ /* TODO: Support 6.14 V4L2_FMTDESC_FLAG_ENUM_ALL */ ++ if (fixed && f->index) ++ return -EINVAL; ++ ++ fmt = fixed ? format_by_fourcc(dev->pix_format.pixelformat) : ++ &formats[f->index]; ++ if (!fmt) ++ return -EFAULT; ++ ++ f->flags = 0; ++ if (fmt->flags & FORMAT_FLAGS_COMPRESSED) ++ f->flags |= V4L2_FMT_FLAG_COMPRESSED; ++ snprintf(f->description, sizeof(f->description), fmt->name); ++ f->pixelformat = fmt->fourcc; ++ return 0; ++} ++ ++/* Tests (or tries) the format. ++ * Returns: ++ * - EINVAL if the buffer type or format is not supported ++ */ ++static int vidioc_try_fmt_vid(struct file *file, void *fh, ++ struct v4l2_format *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ if (check_buffer_capability(dev, opener, f->type) < 0) ++ return -EINVAL; ++ if (v4l2l_fill_format(f, dev->min_width, dev->max_width, ++ dev->min_height, dev->max_height) != 0) ++ return -EINVAL; ++ if (dev->keep_format || has_other_owners(opener, dev)) ++ /* use existing format - including colorspace info */ ++ f->fmt.pix = dev->pix_format; ++ ++ return 0; ++} ++ ++/* Sets new format. Fills 'f' argument with the requested or existing format. ++ * Side-effect: buffers are allocated for the (returned) format. ++ * Returns: ++ * - EINVAL if the type is not supported ++ * - EBUSY if buffers are already allocated ++ * TODO: (vasaka) set subregions of input ++ */ ++static int vidioc_s_fmt_vid(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 token = opener->io_method == V4L2L_IO_TIMEOUT ? 
++ V4L2L_TOKEN_TIMEOUT : ++ token_from_type(f->type); ++ int changed, result; ++ char buf[5]; ++ ++ result = vidioc_try_fmt_vid(file, fh, f); ++ if (result < 0) ++ return result; ++ ++ if (opener->buffer_count > 0) ++ /* must free buffers before format can be set */ ++ return -EBUSY; ++ ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; ++ ++ if (opener->format_token) ++ release_token(dev, opener, format); ++ if (!(dev->format_tokens & token)) { ++ result = -EBUSY; ++ goto exit_s_fmt_unlock; ++ } ++ ++ dprintk("S_FMT[%s] %4s:%ux%u size=%u\n", ++ V4L2_TYPE_IS_CAPTURE(f->type) ? "CAPTURE" : "OUTPUT", ++ fourcc2str(f->fmt.pix.pixelformat, buf), f->fmt.pix.width, ++ f->fmt.pix.height, f->fmt.pix.sizeimage); ++ changed = !pix_format_eq(&dev->pix_format, &f->fmt.pix, 0); ++ if (changed || has_no_owners(dev)) { ++ result = allocate_buffers(dev, &f->fmt.pix); ++ if (result < 0) ++ goto exit_s_fmt_unlock; ++ } ++ if ((dev->timeout_image && changed) || ++ (!dev->timeout_image && need_timeout_buffer(dev, token))) { ++ result = allocate_timeout_buffer(dev); ++ if (result < 0) ++ goto exit_s_fmt_free; ++ } ++ if (changed) { ++ dev->pix_format = f->fmt.pix; ++ dev->pix_format_has_valid_sizeimage = ++ v4l2l_pix_format_has_valid_sizeimage(f); ++ } ++ acquire_token(dev, opener, format, token); ++ if (opener->io_method == V4L2L_IO_TIMEOUT) ++ dev->timeout_image_io = 0; ++ goto exit_s_fmt_unlock; ++exit_s_fmt_free: ++ free_buffers(dev); ++exit_s_fmt_unlock: ++ mutex_unlock(&dev->image_mutex); ++ return result; ++} ++ ++/* ------------------ CAPTURE ----------------------- */ ++/* ioctl for VIDIOC_ENUM_FMT, _G_FMT, _S_FMT, and _TRY_FMT when buffer type ++ * is V4L2_BUF_TYPE_VIDEO_CAPTURE */ ++ ++static int vidioc_enum_fmt_cap(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ return vidioc_enum_fmt_vid(file, fh, f); ++} ++ ++static int vidioc_g_fmt_cap(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, f->type) < 0) ++ return -EINVAL; ++ f->fmt.pix = dev->pix_format; ++ return 0; ++} ++ ++static int vidioc_try_fmt_cap(struct file *file, void *fh, ++ struct v4l2_format *f) ++{ ++ return vidioc_try_fmt_vid(file, fh, f); ++} ++ ++static int vidioc_s_fmt_cap(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ return vidioc_s_fmt_vid(file, fh, f); ++} ++ ++/* ------------------ OUTPUT ----------------------- */ ++/* ioctl for VIDIOC_ENUM_FMT, _G_FMT, _S_FMT, and _TRY_FMT when buffer type ++ * is V4L2_BUF_TYPE_VIDEO_OUTPUT */ ++ ++static int vidioc_enum_fmt_out(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ return vidioc_enum_fmt_vid(file, fh, f); ++} ++ ++static int vidioc_g_fmt_out(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, f->type) < 0) ++ return -EINVAL; ++ /* ++ * LATER: this should return the currently valid format ++ * gstreamer doesn't like it, if this returns -EINVAL, as it ++ * then concludes that there is _no_ valid format ++ * CHECK whether this assumption is wrong, ++ * or whether we have to always provide a valid format ++ */ ++ f->fmt.pix = dev->pix_format; ++ return 0; ++} ++ ++static int vidioc_try_fmt_out(struct file *file, void *fh, ++ struct v4l2_format *f) ++{ ++ return 
vidioc_try_fmt_vid(file, fh, f); ++} ++ ++static int vidioc_s_fmt_out(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ return vidioc_s_fmt_vid(file, fh, f); ++} ++ ++// #define V4L2L_OVERLAY ++#ifdef V4L2L_OVERLAY ++/* ------------------ OVERLAY ----------------------- */ ++/* currently unsupported */ ++/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work ++ * while it should only require it, if overlay is requested ++ * once the gstreamer element is fixed, remove the overlay dummies ++ */ ++#warning OVERLAY dummies ++static int vidioc_g_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++ ++static int vidioc_s_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++#endif /* V4L2L_OVERLAY */ ++ ++/* ------------------ PARAMs ----------------------- */ ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_G_PARM ++ */ ++static int vidioc_g_parm(struct file *file, void *fh, ++ struct v4l2_streamparm *parm) ++{ ++ /* do not care about type of opener, hope these enums would always be ++ * compatible */ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, parm->type) < 0) ++ return -EINVAL; ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_S_PARM ++ */ ++static int vidioc_s_parm(struct file *file, void *fh, ++ struct v4l2_streamparm *parm) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ dprintk("S_PARM(frame-time=%u/%u)\n", ++ parm->parm.capture.timeperframe.numerator, ++ parm->parm.capture.timeperframe.denominator); ++ if (check_buffer_capability(dev, opener, parm->type) < 0) ++ return -EINVAL; ++ ++ switch (parm->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ set_timeperframe(dev, &parm->parm.capture.timeperframe); ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ set_timeperframe(dev, &parm->parm.output.timeperframe); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++/* sets a tv standard, actually we do not need to handle this any special way ++ * added to support effecttv ++ * called on VIDIOC_S_STD ++ */ ++static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std) ++{ ++ v4l2_std_id req_std = 0, supported_std = 0; ++ const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0; ++ ++ if (_std) { ++ req_std = *_std; ++ *_std = all_std; ++ } ++ ++ /* we support everything in V4L2_STD_ALL, but not more... 
*/ ++ supported_std = (all_std & req_std); ++ if (no_std == supported_std) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* gets a fake video standard ++ * called on VIDIOC_G_STD ++ */ ++static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++/* gets a fake video standard ++ * called on VIDIOC_QUERYSTD ++ */ ++static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id, ++ s64 val) ++{ ++ int result = 0; ++ switch (id) { ++ case CID_KEEP_FORMAT: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ dev->keep_format = val; ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; ++ if (!dev->keep_format) { ++ if (has_no_owners(dev) && !any_buffers_mapped(dev)) ++ free_buffers(dev); ++ } ++ mutex_unlock(&dev->image_mutex); ++ break; ++ case CID_SUSTAIN_FRAMERATE: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ spin_lock_bh(&dev->lock); ++ dev->sustain_framerate = val; ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ break; ++ case CID_TIMEOUT: ++ if (val < 0 || val > MAX_TIMEOUT) ++ return -EINVAL; ++ if (val > 0) { ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; ++ /* on-the-fly allocate if device is owned; else ++ * allocate occurs on next S_FMT or REQBUFS */ ++ if (!has_no_owners(dev)) ++ result = allocate_timeout_buffer(dev); ++ mutex_unlock(&dev->image_mutex); ++ if (result < 0) { ++ /* disable timeout as buffer not alloc'd */ ++ spin_lock_bh(&dev->lock); ++ dev->timeout_jiffies = 0; ++ spin_unlock_bh(&dev->lock); ++ return result; ++ } ++ } ++ spin_lock_bh(&dev->lock); ++ dev->timeout_jiffies = msecs_to_jiffies(val); ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ break; ++ case CID_TIMEOUT_IMAGE_IO: ++ dev->timeout_image_io = 1; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl) ++{ ++ struct v4l2_loopback_device *dev = container_of( ++ ctrl->handler, struct v4l2_loopback_device, ctrl_handler); ++ return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val); ++} ++ ++/* returns set of device outputs, in our case there is only one ++ * called on VIDIOC_ENUMOUTPUT ++ */ ++static int vidioc_enum_output(struct file *file, void *fh, ++ struct v4l2_output *outp) ++{ ++ __u32 index = outp->index; ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_OUTPUT)) ++ return -ENOTTY; ++ if (index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(outp, 0, sizeof(*outp)); ++ ++ outp->index = index; ++ strscpy(outp->name, "loopback in", sizeof(outp->name)); ++ outp->type = V4L2_OUTPUT_TYPE_ANALOG; ++ outp->audioset = 0; ++ outp->modulator = 0; ++#ifdef V4L2LOOPBACK_WITH_STD ++ outp->std = V4L2_STD_ALL; ++#ifdef V4L2_OUT_CAP_STD ++ outp->capabilities |= V4L2_OUT_CAP_STD; ++#endif /* V4L2_OUT_CAP_STD */ ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ return 0; ++} ++ ++/* which output is currently active, ++ * called on VIDIOC_G_OUTPUT ++ */ ++static int vidioc_g_output(struct file *file, void *fh, unsigned int *index) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener 
*opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_OUTPUT)) ++ return -ENOTTY; ++ if (index) ++ *index = 0; ++ return 0; ++} ++ ++/* set output, can make sense if we have more than one video src, ++ * called on VIDIOC_S_OUTPUT ++ */ ++static int vidioc_s_output(struct file *file, void *fh, unsigned int index) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_OUTPUT)) ++ return -ENOTTY; ++ return index == 0 ? index : -EINVAL; ++} ++ ++/* returns set of device inputs, in our case there is only one, ++ * but later I may add more ++ * called on VIDIOC_ENUMINPUT ++ */ ++static int vidioc_enum_input(struct file *file, void *fh, ++ struct v4l2_input *inp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ __u32 index = inp->index; ++ ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_CAPTURE)) ++ return -ENOTTY; ++ if (index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(inp, 0, sizeof(*inp)); ++ ++ inp->index = index; ++ strscpy(inp->name, "loopback", sizeof(inp->name)); ++ inp->type = V4L2_INPUT_TYPE_CAMERA; ++ inp->audioset = 0; ++ inp->tuner = 0; ++ inp->status = 0; ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ inp->std = V4L2_STD_ALL; ++#ifdef V4L2_IN_CAP_STD ++ inp->capabilities |= V4L2_IN_CAP_STD; ++#endif ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ if (has_output_token(dev->stream_tokens) && !dev->keep_format) ++ /* if no outputs attached; pretend device is powered off */ ++ inp->status |= V4L2_IN_ST_NO_SIGNAL; ++ ++ return 0; ++} ++ ++/* which input is currently active, ++ * called on VIDIOC_G_INPUT ++ */ ++static int vidioc_g_input(struct file *file, void *fh, unsigned int *index) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_CAPTURE)) ++ return -ENOTTY; /* NOTE: -EAGAIN might be more informative */ ++ if (index) ++ *index = 0; ++ return 0; ++} ++ ++/* set input, can make sense if we have more than one video src, ++ * called on VIDIOC_S_INPUT ++ */ ++static int vidioc_s_input(struct file *file, void *fh, unsigned int index) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (index != 0) ++ return -EINVAL; ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_CAPTURE)) ++ return -ENOTTY; /* NOTE: -EAGAIN might be more informative */ ++ return 0; ++} ++ ++/* --------------- V4L2 ioctl buffer related calls ----------------- */ ++ ++#define is_allocated(opener, type, index) \ ++ (opener->format_token & (opener->io_method == V4L2L_IO_TIMEOUT ? 
\ ++ V4L2L_TOKEN_TIMEOUT : \ ++ token_from_type(type)) && \ ++ (index) < (opener)->buffer_count) ++#define BUFFER_DEBUG_FMT_STR \ ++ "buffer#%u @ %p type=%u bytesused=%u length=%u flags=%x " \ ++ "field=%u timestamp= %lld.%06lldsequence=%u\n" ++#define BUFFER_DEBUG_FMT_ARGS(buf) \ ++ (buf)->index, (buf), (buf)->type, (buf)->bytesused, (buf)->length, \ ++ (buf)->flags, (buf)->field, \ ++ (long long)(buf)->timestamp.tv_sec, \ ++ (long long)(buf)->timestamp.tv_usec, (buf)->sequence ++/* Buffer flag helpers */ ++#define unset_flags(flags) \ ++ do { \ ++ flags &= ~V4L2_BUF_FLAG_QUEUED; \ ++ flags &= ~V4L2_BUF_FLAG_DONE; \ ++ } while (0) ++#define set_queued(flags) \ ++ do { \ ++ flags |= V4L2_BUF_FLAG_QUEUED; \ ++ flags &= ~V4L2_BUF_FLAG_DONE; \ ++ } while (0) ++#define set_done(flags) \ ++ do { \ ++ flags &= ~V4L2_BUF_FLAG_QUEUED; \ ++ flags |= V4L2_BUF_FLAG_DONE; \ ++ } while (0) ++ ++static bool any_buffers_mapped(struct v4l2_loopback_device *dev) ++{ ++ u32 index; ++ for (index = 0; index < dev->buffer_count; ++index) ++ if (dev->buffers[index].buffer.flags & V4L2_BUF_FLAG_MAPPED) ++ return true; ++ return false; ++} ++ ++static void prepare_buffer_queue(struct v4l2_loopback_device *dev, int count) ++{ ++ struct v4l2l_buffer *bufd, *n; ++ u32 pos; ++ ++ spin_lock_bh(&dev->list_lock); ++ ++ /* ensure sufficient number of buffers in queue */ ++ for (pos = 0; pos < count; ++pos) { ++ bufd = &dev->buffers[pos]; ++ if (list_empty(&bufd->list_head)) ++ list_add_tail(&bufd->list_head, &dev->outbufs_list); ++ } ++ if (list_empty(&dev->outbufs_list)) ++ goto exit_prepare_queue_unlock; ++ ++ /* remove any excess buffers */ ++ list_for_each_entry_safe(bufd, n, &dev->outbufs_list, list_head) { ++ if (bufd->buffer.index >= count) ++ list_del_init(&bufd->list_head); ++ } ++ ++ /* buffers are no longer queued; and `write_position` will correspond ++ * to the first item of `outbufs_list`. */ ++ pos = v4l2l_mod64(dev->write_position, count); ++ list_for_each_entry(bufd, &dev->outbufs_list, list_head) { ++ unset_flags(bufd->buffer.flags); ++ dev->bufpos2index[pos % count] = bufd->buffer.index; ++ ++pos; ++ } ++exit_prepare_queue_unlock: ++ spin_unlock_bh(&dev->list_lock); ++} ++ ++/* forward declaration */ ++static int vidioc_streamoff(struct file *file, void *fh, ++ enum v4l2_buf_type type); ++/* negotiate buffer type ++ * only mmap streaming supported ++ * called on VIDIOC_REQBUFS ++ */ ++static int vidioc_reqbufs(struct file *file, void *fh, ++ struct v4l2_requestbuffers *reqbuf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 token = opener->io_method == V4L2L_IO_TIMEOUT ? 
++ V4L2L_TOKEN_TIMEOUT : ++ token_from_type(reqbuf->type); ++ u32 req_count = reqbuf->count; ++ int result = 0; ++ ++ dprintk("REQBUFS(memory=%u, req_count=%u) and device-bufs=%u/%u " ++ "[used/max]\n", ++ reqbuf->memory, req_count, dev->used_buffer_count, ++ dev->buffer_count); ++ ++ switch (reqbuf->memory) { ++ case V4L2_MEMORY_MMAP: ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) ++ reqbuf->capabilities = 0; /* only guarantee MMAP support */ ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ reqbuf->flags = 0; /* no memory consistency support */ ++#endif ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (opener->format_token & ~token) ++ /* different (buffer) type already assigned to descriptor by ++ * S_FMT or REQBUFS */ ++ return -EINVAL; ++ ++ MARK(); ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; /* -EINTR */ ++ ++ /* CASE queue/dequeue timeout-buffer only: */ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) { ++ opener->buffer_count = req_count; ++ if (req_count == 0) ++ release_token(dev, opener, format); ++ goto exit_reqbufs_unlock; ++ } ++ ++ MARK(); ++ /* CASE count is zero: streamoff, free buffers, release their token */ ++ if (req_count == 0) { ++ if (dev->format_tokens & token) { ++ acquire_token(dev, opener, format, token); ++ opener->io_method = V4L2L_IO_MMAP; ++ } ++ result = vidioc_streamoff(file, fh, reqbuf->type); ++ opener->buffer_count = 0; ++ /* undocumented requirement - REQBUFS with count zero should ++ * ALSO release lock on logical stream */ ++ if (opener->format_token) ++ release_token(dev, opener, format); ++ if (has_no_owners(dev)) ++ dev->used_buffer_count = 0; ++ goto exit_reqbufs_unlock; ++ } ++ ++ /* CASE count non-zero: allocate buffers and acquire token for them */ ++ MARK(); ++ switch (reqbuf->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (!(dev->format_tokens & token || ++ opener->format_token & token)) ++ /* only exclusive ownership for each stream */ ++ result = -EBUSY; ++ break; ++ default: ++ result = -EINVAL; ++ } ++ if (result < 0) ++ goto exit_reqbufs_unlock; ++ ++ if (has_other_owners(opener, dev) && dev->used_buffer_count > 0) { ++ /* allow 'allocation' of existing number of buffers */ ++ req_count = dev->used_buffer_count; ++ } else if (any_buffers_mapped(dev)) { ++ /* do not allow re-allocation if buffers are mapped */ ++ result = -EBUSY; ++ goto exit_reqbufs_unlock; ++ } ++ ++ MARK(); ++ opener->buffer_count = 0; ++ ++ if (req_count > dev->buffer_count) ++ req_count = dev->buffer_count; ++ ++ if (has_no_owners(dev)) { ++ result = allocate_buffers(dev, &dev->pix_format); ++ if (result < 0) ++ goto exit_reqbufs_unlock; ++ } ++ if (!dev->timeout_image && need_timeout_buffer(dev, token)) { ++ result = allocate_timeout_buffer(dev); ++ if (result < 0) ++ goto exit_reqbufs_unlock; ++ } ++ acquire_token(dev, opener, format, token); ++ ++ MARK(); ++ switch (opener->io_method) { ++ case V4L2L_IO_TIMEOUT: ++ dev->timeout_image_io = 0; ++ opener->buffer_count = req_count; ++ break; ++ default: ++ opener->io_method = V4L2L_IO_MMAP; ++ prepare_buffer_queue(dev, req_count); ++ dev->used_buffer_count = opener->buffer_count = req_count; ++ } ++exit_reqbufs_unlock: ++ mutex_unlock(&dev->image_mutex); ++ reqbuf->count = opener->buffer_count; ++ return result; ++} ++ ++/* returns buffer asked for; ++ * give app as many buffers as it wants, if it less than MAX, ++ * but map them in our inner buffers ++ * called on VIDIOC_QUERYBUF ++ */ ++static int 
vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 type = buf->type; ++ u32 index = buf->index; ++ ++ if ((type != V4L2_BUF_TYPE_VIDEO_CAPTURE) && ++ (type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) ++ return -EINVAL; ++ if (!is_allocated(opener, type, index)) ++ return -EINVAL; ++ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) { ++ *buf = dev->timeout_buffer.buffer; ++ buf->index = index; ++ } else ++ *buf = dev->buffers[index].buffer; ++ ++ buf->type = type; ++ ++ if (!(buf->flags & (V4L2_BUF_FLAG_DONE | V4L2_BUF_FLAG_QUEUED))) { ++ /* v4l2-compliance requires these to be zero */ ++ buf->sequence = 0; ++ buf->timestamp.tv_sec = buf->timestamp.tv_usec = 0; ++ } else if (V4L2_TYPE_IS_CAPTURE(type)) { ++ /* guess flags based on sequence values */ ++ if (buf->sequence >= opener->read_position) { ++ set_done(buf->flags); ++ } else if (buf->flags & V4L2_BUF_FLAG_DONE) { ++ set_queued(buf->flags); ++ } ++ } ++ dprintkrw("QUERYBUF(%s, index=%u) -> " BUFFER_DEBUG_FMT_STR, ++ V4L2_TYPE_IS_CAPTURE(type) ? "CAPTURE" : "OUTPUT", index, ++ BUFFER_DEBUG_FMT_ARGS(buf)); ++ return 0; ++} ++ ++static void buffer_written(struct v4l2_loopback_device *dev, ++ struct v4l2l_buffer *buf) ++{ ++ timer_delete_sync(&dev->sustain_timer); ++ timer_delete_sync(&dev->timeout_timer); ++ ++ spin_lock_bh(&dev->list_lock); ++ list_move_tail(&buf->list_head, &dev->outbufs_list); ++ spin_unlock_bh(&dev->list_lock); ++ ++ spin_lock_bh(&dev->lock); ++ dev->bufpos2index[v4l2l_mod64(dev->write_position, ++ dev->used_buffer_count)] = ++ buf->buffer.index; ++ ++dev->write_position; ++ dev->reread_count = 0; ++ ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++} ++ ++/* put buffer to queue ++ * called on VIDIOC_QBUF ++ */ ++static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ struct v4l2l_buffer *bufd; ++ u32 index = buf->index; ++ u32 type = buf->type; ++ ++ if (!is_allocated(opener, type, index)) ++ return -EINVAL; ++ bufd = &dev->buffers[index]; ++ ++ switch (buf->memory) { ++ case V4L2_MEMORY_MMAP: ++ if (!(bufd->buffer.flags & V4L2_BUF_FLAG_MAPPED)) ++ dprintkrw("QBUF() unmapped buffer [index=%u]\n", index); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) { ++ set_queued(buf->flags); ++ return 0; ++ } ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ dprintkrw("QBUF(CAPTURE, index=%u) -> " BUFFER_DEBUG_FMT_STR, ++ index, BUFFER_DEBUG_FMT_ARGS(buf)); ++ set_queued(buf->flags); ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ dprintkrw("QBUF(OUTPUT, index=%u) -> " BUFFER_DEBUG_FMT_STR, ++ index, BUFFER_DEBUG_FMT_ARGS(buf)); ++ if (!(bufd->buffer.flags & V4L2_BUF_FLAG_TIMESTAMP_COPY) && ++ (buf->timestamp.tv_sec == 0 && ++ buf->timestamp.tv_usec == 0)) { ++ v4l2l_get_timestamp(&bufd->buffer); ++ } else { ++ bufd->buffer.timestamp = buf->timestamp; ++ bufd->buffer.flags |= V4L2_BUF_FLAG_TIMESTAMP_COPY; ++ bufd->buffer.flags &= ++ ~V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; ++ } ++ if (dev->pix_format_has_valid_sizeimage) { ++ if (buf->bytesused >= dev->pix_format.sizeimage) { ++ bufd->buffer.bytesused = ++ dev->pix_format.sizeimage; ++ } else { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) ++ dev_warn_ratelimited( ++ &dev->vdev->dev, ++#else ++ dprintkrw( 
++#endif ++ "warning queued output buffer bytesused too small %u < %u\n", ++ buf->bytesused, ++ dev->pix_format.sizeimage); ++ bufd->buffer.bytesused = buf->bytesused; ++ } ++ } else { ++ bufd->buffer.bytesused = buf->bytesused; ++ } ++ bufd->buffer.sequence = dev->write_position; ++ set_queued(bufd->buffer.flags); ++ *buf = bufd->buffer; ++ buffer_written(dev, bufd); ++ set_done(bufd->buffer.flags); ++ wake_up_all(&dev->read_event); ++ break; ++ default: ++ return -EINVAL; ++ } ++ buf->type = type; ++ return 0; ++} ++ ++static int can_read(struct v4l2_loopback_device *dev, ++ struct v4l2_loopback_opener *opener) ++{ ++ int ret; ++ ++ spin_lock_bh(&dev->lock); ++ check_timers(dev); ++ ret = dev->write_position > opener->read_position || ++ dev->reread_count > opener->reread_count || dev->timeout_happened; ++ spin_unlock_bh(&dev->lock); ++ return ret; ++} ++ ++static int get_capture_buffer(struct file *file) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ int pos, timeout_happened; ++ u32 index; ++ ++ if ((file->f_flags & O_NONBLOCK) && ++ (dev->write_position <= opener->read_position && ++ dev->reread_count <= opener->reread_count && ++ !dev->timeout_happened)) ++ return -EAGAIN; ++ wait_event_interruptible(dev->read_event, can_read(dev, opener)); ++ ++ spin_lock_bh(&dev->lock); ++ if (dev->write_position == opener->read_position) { ++ if (dev->reread_count > opener->reread_count + 2) ++ opener->reread_count = dev->reread_count - 1; ++ ++opener->reread_count; ++ pos = v4l2l_mod64(opener->read_position + ++ dev->used_buffer_count - 1, ++ dev->used_buffer_count); ++ } else { ++ opener->reread_count = 0; ++ if (dev->write_position > ++ opener->read_position + dev->used_buffer_count) ++ opener->read_position = dev->write_position - 1; ++ pos = v4l2l_mod64(opener->read_position, ++ dev->used_buffer_count); ++ ++opener->read_position; ++ } ++ timeout_happened = dev->timeout_happened && (dev->timeout_jiffies > 0); ++ dev->timeout_happened = 0; ++ spin_unlock_bh(&dev->lock); ++ ++ index = dev->bufpos2index[pos]; ++ if (timeout_happened) { ++ if (index >= dev->used_buffer_count) { ++ dprintkrw("get_capture_buffer() read position is at " ++ "an unallocated buffer [index=%u]\n", ++ index); ++ return -EFAULT; ++ } ++ /* although allocated on-demand, timeout_image is freed only ++ * in free_buffers(), so we don't need to worry about it being ++ * deallocated suddenly */ ++ memcpy(dev->image + dev->buffers[index].buffer.m.offset, ++ dev->timeout_image, dev->buffer_size); ++ } ++ return (int)index; ++} ++ ++/* put buffer to dequeue ++ * called on VIDIOC_DQBUF ++ */ ++static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 type = buf->type; ++ int index; ++ struct v4l2l_buffer *bufd; ++ ++ if (buf->memory != V4L2_MEMORY_MMAP) ++ return -EINVAL; ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) { ++ *buf = dev->timeout_buffer.buffer; ++ buf->type = type; ++ unset_flags(buf->flags); ++ return 0; ++ } ++ if ((opener->buffer_count == 0) || ++ !(opener->format_token & token_from_type(type))) ++ return -EINVAL; ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ index = get_capture_buffer(file); ++ if (index < 0) ++ return index; ++ *buf = dev->buffers[index].buffer; ++ unset_flags(buf->flags); ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ 
spin_lock_bh(&dev->list_lock); ++ ++ bufd = list_first_entry_or_null(&dev->outbufs_list, ++ struct v4l2l_buffer, list_head); ++ if (bufd) ++ list_move_tail(&bufd->list_head, &dev->outbufs_list); ++ ++ spin_unlock_bh(&dev->list_lock); ++ if (!bufd) ++ return -EFAULT; ++ unset_flags(bufd->buffer.flags); ++ *buf = bufd->buffer; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ buf->type = type; ++ dprintkrw("DQBUF(%s, index=%u) -> " BUFFER_DEBUG_FMT_STR, ++ V4L2_TYPE_IS_CAPTURE(type) ? "CAPTURE" : "OUTPUT", index, ++ BUFFER_DEBUG_FMT_ARGS(buf)); ++ return 0; ++} ++ ++/* ------------- STREAMING ------------------- */ ++ ++/* start streaming ++ * called on VIDIOC_STREAMON ++ */ ++static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 token = token_from_type(type); ++ ++ /* short-circuit when using timeout buffer set */ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) ++ return 0; ++ /* opener must have claimed (same) buffer set via REQBUFS */ ++ if (!opener->buffer_count || !(opener->format_token & token)) ++ return -EINVAL; ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (has_output_token(dev->stream_tokens) && !dev->keep_format) ++ return -EIO; ++ if (dev->stream_tokens & token) { ++ acquire_token(dev, opener, stream, token); ++ client_usage_queue_event(dev->vdev); ++ } ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (dev->stream_tokens & token) ++ acquire_token(dev, opener, stream, token); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++/* stop streaming ++ * called on VIDIOC_STREAMOFF ++ */ ++static int vidioc_streamoff(struct file *file, void *fh, ++ enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 token = token_from_type(type); ++ ++ /* short-circuit when using timeout buffer set */ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) ++ return 0; ++ /* short-circuit when buffer set has no owner */ ++ if (dev->format_tokens & token) ++ return 0; ++ /* opener needs a claim to buffer set */ ++ if (!opener->format_token) ++ return -EBUSY; ++ if (opener->format_token & ~token) ++ return -EINVAL; ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (opener->stream_token & token) ++ release_token(dev, opener, stream); ++ /* reset output queue */ ++ if (dev->used_buffer_count > 0) ++ prepare_buffer_queue(dev, dev->used_buffer_count); ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (opener->stream_token & token) { ++ release_token(dev, opener, stream); ++ client_usage_queue_event(dev->vdev); ++ } ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ p->frames = dev->buffer_count; ++ p->offsets[0] = 0; ++ p->offsets[1] = 0; ++ p->size = dev->buffer_size; ++ return 0; ++} ++#endif ++ ++static void client_usage_queue_event(struct video_device *vdev) ++{ ++ struct v4l2_event ev; ++ struct v4l2_loopback_device *dev; ++ ++ dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device, ++ v4l2_dev); ++ ++ memset(&ev, 0, sizeof(ev)); ++ ev.type = V4L2_EVENT_PRI_CLIENT_USAGE; ++ ((struct v4l2_event_client_usage *)&ev.u)->count = ++ !has_capture_token(dev->stream_tokens); 
++ ++ v4l2_event_queue(vdev, &ev); ++} ++ ++static int client_usage_ops_add(struct v4l2_subscribed_event *sev, ++ unsigned elems) ++{ ++ if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL)) ++ return 0; ++ ++ client_usage_queue_event(sev->fh->vdev); ++ return 0; ++} ++ ++static void client_usage_ops_replace(struct v4l2_event *old, ++ const struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&old->u) = ++ *((struct v4l2_event_client_usage *)&new->u); ++} ++ ++static void client_usage_ops_merge(const struct v4l2_event *old, ++ struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&new->u) = ++ *((struct v4l2_event_client_usage *)&old->u); ++} ++ ++const struct v4l2_subscribed_event_ops client_usage_ops = { ++ .add = client_usage_ops_add, ++ .replace = client_usage_ops_replace, ++ .merge = client_usage_ops_merge, ++}; ++ ++static int vidioc_subscribe_event(struct v4l2_fh *fh, ++ const struct v4l2_event_subscription *sub) ++{ ++ switch (sub->type) { ++ case V4L2_EVENT_CTRL: ++ return v4l2_ctrl_subscribe_event(fh, sub); ++ case V4L2_EVENT_PRI_CLIENT_USAGE: ++ return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops); ++ } ++ ++ return -EINVAL; ++} ++ ++/* file operations */ ++static void vm_open(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ atomic_inc(&buf->use_count); ++ buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED; ++} ++ ++static void vm_close(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ if (atomic_dec_and_test(&buf->use_count)) ++ buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED; ++} ++ ++static struct vm_operations_struct vm_ops = { ++ .open = vm_open, ++ .close = vm_close, ++}; ++ ++static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ u8 *addr; ++ unsigned long start, size, offset; ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ struct v4l2l_buffer *buffer = NULL; ++ int result = 0; ++ MARK(); ++ ++ offset = (unsigned long)vma->vm_pgoff << PAGE_SHIFT; ++ start = (unsigned long)vma->vm_start; ++ size = (unsigned long)(vma->vm_end - vma->vm_start); /* always != 0 */ ++ ++ /* ensure buffer size, count, and allocated image(s) are not altered by ++ * other file descriptors */ ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; ++ ++ if (size > dev->buffer_size) { ++ dprintk("mmap() attempt to map %lubytes when %ubytes are " ++ "allocated to buffers\n", ++ size, dev->buffer_size); ++ result = -EINVAL; ++ goto exit_mmap_unlock; ++ } ++ if (offset % dev->buffer_size != 0) { ++ dprintk("mmap() offset does not match start of any buffer\n"); ++ result = -EINVAL; ++ goto exit_mmap_unlock; ++ } ++ switch (opener->format_token) { ++ case V4L2L_TOKEN_TIMEOUT: ++ if (offset != (unsigned long)dev->buffer_size * MAX_BUFFERS) { ++ dprintk("mmap() incorrect offset for timeout image\n"); ++ result = -EINVAL; ++ goto exit_mmap_unlock; ++ } ++ buffer = &dev->timeout_buffer; ++ addr = dev->timeout_image; ++ break; ++ default: ++ if (offset >= dev->image_size) { ++ dprintk("mmap() attempt to map beyond all buffers\n"); ++ result = -EINVAL; ++ goto exit_mmap_unlock; ++ } ++ u32 index = offset / dev->buffer_size; ++ buffer = &dev->buffers[index]; ++ addr = dev->image + offset; ++ break; ++ } ++ ++ while (size > 0) { ++ struct page *page = vmalloc_to_page(addr); ++ ++ result = vm_insert_page(vma, start, page); 
++ if (result < 0) ++ goto exit_mmap_unlock; ++ ++ start += PAGE_SIZE; ++ addr += PAGE_SIZE; ++ size -= PAGE_SIZE; ++ } ++ ++ vma->vm_ops = &vm_ops; ++ vma->vm_private_data = buffer; ++ ++ vm_open(vma); ++exit_mmap_unlock: ++ mutex_unlock(&dev->image_mutex); ++ return result; ++} ++ ++static unsigned int v4l2_loopback_poll(struct file *file, ++ struct poll_table_struct *pts) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ __poll_t req_events = poll_requested_events(pts); ++ int ret_mask = 0; ++ ++ /* call poll_wait in first call, regardless, to ensure that the ++ * wait-queue is not null */ ++ poll_wait(file, &dev->read_event, pts); ++ poll_wait(file, &opener->fh.wait, pts); ++ ++ if (req_events & POLLPRI) { ++ if (v4l2_event_pending(&opener->fh)) { ++ ret_mask |= POLLPRI; ++ if (!(req_events & DEFAULT_POLLMASK)) ++ return ret_mask; ++ } ++ } ++ ++ switch (opener->format_token) { ++ case V4L2L_TOKEN_OUTPUT: ++ if (opener->stream_token != 0 || ++ opener->io_method == V4L2L_IO_NONE) ++ ret_mask |= POLLOUT | POLLWRNORM; ++ break; ++ case V4L2L_TOKEN_CAPTURE: ++ if ((opener->io_method == V4L2L_IO_NONE || ++ opener->stream_token != 0) && ++ can_read(dev, opener)) ++ ret_mask |= POLLIN | POLLWRNORM; ++ break; ++ case V4L2L_TOKEN_TIMEOUT: ++ ret_mask |= POLLOUT | POLLWRNORM; ++ break; ++ default: ++ break; ++ } ++ ++ return ret_mask; ++} ++ ++/* do not want to limit device opens, it can be as many readers as user want, ++ * writers are limited by means of setting writer field */ ++static int v4l2_loopback_open(struct file *file) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ ++ dev = v4l2loopback_getdevice(file); ++ if (dev->open_count.counter >= dev->max_openers) ++ return -EBUSY; ++ /* kfree on close */ ++ opener = kzalloc(sizeof(*opener), GFP_KERNEL); ++ if (opener == NULL) ++ return -ENOMEM; ++ ++ atomic_inc(&dev->open_count); ++ if (dev->timeout_image_io && dev->format_tokens & V4L2L_TOKEN_TIMEOUT) ++ /* will clear timeout_image_io once buffer set acquired */ ++ opener->io_method = V4L2L_IO_TIMEOUT; ++ ++ v4l2_fh_init(&opener->fh, video_devdata(file)); ++ file->private_data = &opener->fh; ++ ++ v4l2_fh_add(&opener->fh); ++ dprintk("open() -> dev@%p with image@%p\n", dev, ++ dev ? dev->image : NULL); ++ return 0; ++} ++ ++static int v4l2_loopback_close(struct file *file) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ int result = 0; ++ dprintk("close() -> dev@%p with image@%p\n", dev, ++ dev ? 
dev->image : NULL); ++ ++ if (opener->format_token) { ++ struct v4l2_requestbuffers reqbuf = { ++ .count = 0, .memory = V4L2_MEMORY_MMAP, .type = 0 ++ }; ++ switch (opener->format_token) { ++ case V4L2L_TOKEN_CAPTURE: ++ reqbuf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ break; ++ case V4L2L_TOKEN_OUTPUT: ++ case V4L2L_TOKEN_TIMEOUT: ++ reqbuf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ break; ++ } ++ if (reqbuf.type) ++ result = vidioc_reqbufs(file, file->private_data, ++ &reqbuf); ++ if (result < 0) ++ dprintk("failed to free buffers REQBUFS(count=0) " ++ " returned %d\n", ++ result); ++ mutex_lock(&dev->image_mutex); ++ release_token(dev, opener, format); ++ mutex_unlock(&dev->image_mutex); ++ } ++ ++ if (atomic_dec_and_test(&dev->open_count)) { ++ timer_delete_sync(&dev->sustain_timer); ++ timer_delete_sync(&dev->timeout_timer); ++ if (!dev->keep_format) { ++ mutex_lock(&dev->image_mutex); ++ free_buffers(dev); ++ mutex_unlock(&dev->image_mutex); ++ } ++ } ++ ++ v4l2_fh_del(&opener->fh); ++ v4l2_fh_exit(&opener->fh); ++ ++ kfree(opener); ++ return 0; ++} ++ ++static int start_fileio(struct file *file, void *fh, enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ struct v4l2_requestbuffers reqbuf = { .count = dev->buffer_count, ++ .memory = V4L2_MEMORY_MMAP, ++ .type = type }; ++ int token = token_from_type(type); ++ int result; ++ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT || ++ opener->format_token & ~token) ++ return -EBUSY; /* NOTE: -EBADF might be more informative */ ++ ++ /* short-circuit if already have stream token */ ++ if (opener->stream_token && opener->io_method == V4L2L_IO_FILE) ++ return 0; ++ ++ /* otherwise attempt to acquire stream token and assign IO method */ ++ if (!(dev->stream_tokens & token) || opener->io_method != V4L2L_IO_NONE) ++ return -EBUSY; ++ ++ result = vidioc_reqbufs(file, fh, &reqbuf); ++ if (result < 0) ++ return result; ++ result = vidioc_streamon(file, fh, type); ++ if (result < 0) ++ return result; ++ ++ opener->io_method = V4L2L_IO_FILE; ++ return 0; ++} ++ ++static ssize_t v4l2_loopback_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_buffer *b; ++ int index, result; ++ ++ dprintkrw("read() %zu bytes\n", count); ++ result = start_fileio(file, file->private_data, ++ V4L2_BUF_TYPE_VIDEO_CAPTURE); ++ if (result < 0) ++ return result; ++ ++ index = get_capture_buffer(file); ++ if (index < 0) ++ return index; ++ b = &dev->buffers[index].buffer; ++ if (count > b->bytesused) ++ count = b->bytesused; ++ if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset), ++ count)) { ++ printk(KERN_ERR "v4l2-loopback read() failed copy_to_user()\n"); ++ return -EFAULT; ++ } ++ return count; ++} ++ ++static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_buffer *b; ++ int index, result; ++ ++ dprintkrw("write() %zu bytes\n", count); ++ result = start_fileio(file, file->private_data, ++ V4L2_BUF_TYPE_VIDEO_OUTPUT); ++ if (result < 0) ++ return result; ++ ++ if (count > dev->buffer_size) ++ count = dev->buffer_size; ++ index = v4l2l_mod64(dev->write_position, dev->used_buffer_count); ++ b = &dev->buffers[index].buffer; ++ ++ if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf, ++ count)) { ++ 
printk(KERN_ERR ++ "v4l2-loopback write() failed copy_from_user()\n"); ++ return -EFAULT; ++ } ++ b->bytesused = count; ++ ++ v4l2l_get_timestamp(b); ++ b->sequence = dev->write_position; ++ set_queued(b->flags); ++ buffer_written(dev, &dev->buffers[index]); ++ set_done(b->flags); ++ wake_up_all(&dev->read_event); ++ ++ return count; ++} ++ ++/* init functions */ ++/* frees buffers, if allocated */ ++static void free_buffers(struct v4l2_loopback_device *dev) ++{ ++ dprintk("free_buffers() with image@%p\n", dev->image); ++ if (!dev->image) ++ return; ++ if (!has_no_owners(dev) || any_buffers_mapped(dev)) ++ /* maybe an opener snuck in before image_mutex was acquired */ ++ printk(KERN_WARNING ++ "v4l2-loopback free_buffers() buffers of video device " ++ "#%u freed while still mapped to userspace\n", ++ dev->vdev->num); ++ vfree(dev->image); ++ dev->image = NULL; ++ dev->image_size = 0; ++ dev->buffer_size = 0; ++} ++ ++static void free_timeout_buffer(struct v4l2_loopback_device *dev) ++{ ++ dprintk("free_timeout_buffer() with timeout_image@%p\n", ++ dev->timeout_image); ++ if (!dev->timeout_image) ++ return; ++ ++ if ((dev->timeout_jiffies > 0 && !has_no_owners(dev)) || ++ dev->timeout_buffer.buffer.flags & V4L2_BUF_FLAG_MAPPED) ++ printk(KERN_WARNING ++ "v4l2-loopback free_timeout_buffer() timeout image " ++ "of device #%u freed while still mapped to userspace\n", ++ dev->vdev->num); ++ ++ vfree(dev->timeout_image); ++ dev->timeout_image = NULL; ++ dev->timeout_buffer_size = 0; ++} ++/* allocates buffers if no (other) openers are already using them */ ++static int allocate_buffers(struct v4l2_loopback_device *dev, ++ struct v4l2_pix_format *pix_format) ++{ ++ u32 buffer_size = PAGE_ALIGN(pix_format->sizeimage); ++ unsigned long image_size = ++ (unsigned long)buffer_size * (unsigned long)dev->buffer_count; ++ /* vfree on close file operation in case no open handles left */ ++ ++ if (buffer_size == 0 || dev->buffer_count == 0 || ++ buffer_size < pix_format->sizeimage) ++ return -EINVAL; ++ ++ if ((__LONG_MAX__ / buffer_size) < dev->buffer_count) ++ return -ENOSPC; ++ ++ dprintk("allocate_buffers() size %lubytes = %ubytes x %ubuffers\n", ++ image_size, buffer_size, dev->buffer_count); ++ if (dev->image) { ++ /* check that no buffers are expected in user-space */ ++ if (!has_no_owners(dev) || any_buffers_mapped(dev)) ++ return -EBUSY; ++ dprintk("allocate_buffers() existing size=%lubytes\n", ++ dev->image_size); ++ /* FIXME: prevent double allocation more intelligently! 
*/ ++ if (image_size == dev->image_size) { ++ dprintk("allocate_buffers() keep existing\n"); ++ return 0; ++ } ++ free_buffers(dev); ++ } ++ ++ /* FIXME: set buffers to 0 */ ++ dev->image = vmalloc(image_size); ++ if (dev->image == NULL) { ++ dev->buffer_size = dev->image_size = 0; ++ return -ENOMEM; ++ } ++ init_buffers(dev, pix_format->sizeimage, buffer_size); ++ dev->buffer_size = buffer_size; ++ dev->image_size = image_size; ++ dprintk("allocate_buffers() -> vmalloc'd %lubytes\n", dev->image_size); ++ return 0; ++} ++static int allocate_timeout_buffer(struct v4l2_loopback_device *dev) ++{ ++ /* device's `buffer_size` and `buffers` must be initialised in ++ * allocate_buffers() */ ++ ++ dprintk("allocate_timeout_buffer() size %ubytes\n", dev->buffer_size); ++ if (dev->buffer_size == 0) ++ return -EINVAL; ++ ++ if (dev->timeout_image) { ++ if (dev->timeout_buffer.buffer.flags & V4L2_BUF_FLAG_MAPPED) ++ return -EBUSY; ++ if (dev->buffer_size == dev->timeout_buffer_size) ++ return 0; ++ free_timeout_buffer(dev); ++ } ++ ++ dev->timeout_image = vzalloc(dev->buffer_size); ++ if (!dev->timeout_image) { ++ dev->timeout_buffer_size = 0; ++ return -ENOMEM; ++ } ++ dev->timeout_buffer_size = dev->buffer_size; ++ return 0; ++} ++/* init inner buffers, they are capture mode and flags are set as for capture ++ * mode buffers */ ++static void init_buffers(struct v4l2_loopback_device *dev, u32 bytes_used, ++ u32 buffer_size) ++{ ++ u32 i; ++ ++ for (i = 0; i < dev->buffer_count; ++i) { ++ struct v4l2_buffer *b = &dev->buffers[i].buffer; ++ b->index = i; ++ b->bytesused = bytes_used; ++ b->length = buffer_size; ++ b->field = V4L2_FIELD_NONE; ++ b->flags = 0; ++ b->m.offset = i * buffer_size; ++ b->memory = V4L2_MEMORY_MMAP; ++ b->sequence = 0; ++ b->timestamp.tv_sec = 0; ++ b->timestamp.tv_usec = 0; ++ b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ++ v4l2l_get_timestamp(b); ++ } ++ dev->timeout_buffer = dev->buffers[0]; ++ dev->timeout_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size; ++} ++ ++/* fills and register video device */ ++static void init_vdev(struct video_device *vdev, int nr) ++{ ++#ifdef V4L2LOOPBACK_WITH_STD ++ vdev->tvnorms = V4L2_STD_ALL; ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ vdev->vfl_type = VFL_TYPE_VIDEO; ++ vdev->fops = &v4l2_loopback_fops; ++ vdev->ioctl_ops = &v4l2_loopback_ioctl_ops; ++ vdev->release = &video_device_release; ++ vdev->minor = -1; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE | ++ V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE | ++ V4L2_CAP_STREAMING; ++#endif ++ ++ if (debug > 1) ++ vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL | ++ V4L2_DEV_DEBUG_IOCTL_ARG; ++ ++ vdev->vfl_dir = VFL_DIR_M2M; ++} ++ ++/* init default capture parameters, only fps may be changed in future */ ++static void init_capture_param(struct v4l2_captureparm *capture_param) ++{ ++ capture_param->capability = V4L2_CAP_TIMEPERFRAME; /* since 2.16 */ ++ capture_param->capturemode = 0; ++ capture_param->extendedmode = 0; ++ capture_param->readbuffers = max_buffers; ++ capture_param->timeperframe.numerator = 1; ++ capture_param->timeperframe.denominator = V4L2LOOPBACK_FPS_DEFAULT; ++} ++ ++static void check_timers(struct v4l2_loopback_device *dev) ++{ ++ if (has_output_token(dev->stream_tokens)) ++ return; ++ ++ if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer)) ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer)) ++ 
mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies * 3 / 2); ++} ++#ifdef HAVE_TIMER_SETUP ++static void sustain_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = ++ container_of(t, struct v4l2_loopback_device, sustain_timer); ++#else ++static void sustain_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->sustain_framerate) { ++ dev->reread_count++; ++ dprintkrw("sustain_timer_clb() write_pos=%lld reread=%u\n", ++ (long long)dev->write_position, dev->reread_count); ++ if (dev->reread_count == 1) ++ mod_timer(&dev->sustain_timer, ++ jiffies + max(1UL, dev->frame_jiffies / 2)); ++ else ++ mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++#ifdef HAVE_TIMER_SETUP ++static void timeout_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = ++ container_of(t, struct v4l2_loopback_device, timeout_timer); ++#else ++static void timeout_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->timeout_jiffies > 0) { ++ dev->timeout_happened = 1; ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++ ++/* init loopback main structure */ ++#define DEFAULT_FROM_CONF(confmember, default_condition, default_value) \ ++ ((conf) ? \ ++ ((conf->confmember default_condition) ? (default_value) : \ ++ (conf->confmember)) : \ ++ default_value) ++ ++static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_ctrl_handler *hdl; ++ struct v4l2loopback_private *vdev_priv = NULL; ++ int err; ++ ++ u32 _width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; ++ u32 _height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; ++ ++ u32 _min_width = DEFAULT_FROM_CONF(min_width, ++ < V4L2LOOPBACK_SIZE_MIN_WIDTH, ++ V4L2LOOPBACK_SIZE_MIN_WIDTH); ++ u32 _min_height = DEFAULT_FROM_CONF(min_height, ++ < V4L2LOOPBACK_SIZE_MIN_HEIGHT, ++ V4L2LOOPBACK_SIZE_MIN_HEIGHT); ++ u32 _max_width = DEFAULT_FROM_CONF(max_width, < _min_width, max_width); ++ u32 _max_height = ++ DEFAULT_FROM_CONF(max_height, < _min_height, max_height); ++ bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ? 
++ (bool)(conf->announce_all_caps) : ++ !(V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS); ++ int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers); ++ int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers); ++ struct v4l2_format _fmt; ++ ++ int nr = -1; ++ ++ if (conf) { ++ const int output_nr = conf->output_nr; ++#ifdef SPLIT_DEVICES ++ const int capture_nr = conf->capture_nr; ++#else ++ const int capture_nr = output_nr; ++#endif ++ if (capture_nr >= 0 && output_nr == capture_nr) { ++ nr = output_nr; ++ } else if (capture_nr < 0 && output_nr < 0) { ++ nr = -1; ++ } else if (capture_nr < 0) { ++ nr = output_nr; ++ } else if (output_nr < 0) { ++ nr = capture_nr; ++ } else { ++ printk(KERN_ERR ++ "v4l2-loopback add() split OUTPUT and CAPTURE " ++ "devices not yet supported.\n"); ++ printk(KERN_INFO ++ "v4l2-loopback add() both devices must have the " ++ "same number (%d != %d).\n", ++ output_nr, capture_nr); ++ return -EINVAL; ++ } ++ } ++ ++ if (idr_find(&v4l2loopback_index_idr, nr)) ++ return -EEXIST; ++ ++ /* initialisation of a new device */ ++ dprintk("add() creating device #%d\n", nr); ++ dev = kzalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) ++ return -ENOMEM; ++ ++ /* allocate id, if @id >= 0, we're requesting that specific id */ ++ if (nr >= 0) { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1, ++ GFP_KERNEL); ++ if (err == -ENOSPC) ++ err = -EEXIST; ++ } else { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL); ++ } ++ if (err < 0) ++ goto out_free_dev; ++ ++ /* register new device */ ++ MARK(); ++ nr = err; ++ ++ if (conf && conf->card_label[0]) { ++ snprintf(dev->card_label, sizeof(dev->card_label), "%s", ++ conf->card_label); ++ } else { ++ snprintf(dev->card_label, sizeof(dev->card_label), ++ "Dummy video device (0x%04X)", nr); ++ } ++ snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), ++ "v4l2loopback-%03d", nr); ++ ++ err = v4l2_device_register(NULL, &dev->v4l2_dev); ++ if (err) ++ goto out_free_idr; ++ ++ /* initialise the _video_ device */ ++ MARK(); ++ err = -ENOMEM; ++ dev->vdev = video_device_alloc(); ++ if (dev->vdev == NULL) ++ goto out_unregister; ++ ++ vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL); ++ if (vdev_priv == NULL) ++ goto out_unregister; ++ ++ video_set_drvdata(dev->vdev, vdev_priv); ++ if (video_get_drvdata(dev->vdev) == NULL) ++ goto out_unregister; ++ ++ snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s", ++ dev->card_label); ++ vdev_priv->device_nr = nr; ++ init_vdev(dev->vdev, nr); ++ dev->vdev->v4l2_dev = &dev->v4l2_dev; ++ ++ /* initialise v4l2-loopback specific parameters */ ++ MARK(); ++ dev->announce_all_caps = _announce_all_caps; ++ dev->min_width = _min_width; ++ dev->min_height = _min_height; ++ dev->max_width = _max_width; ++ dev->max_height = _max_height; ++ dev->max_openers = _max_openers; ++ ++ /* set (initial) pixel and stream format */ ++ _width = clamp_val(_width, _min_width, _max_width); ++ _height = clamp_val(_height, _min_height, _max_height); ++ _fmt = (struct v4l2_format){ ++ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, ++ .fmt.pix = { .width = _width, ++ .height = _height, ++ .pixelformat = formats[0].fourcc, ++ .colorspace = V4L2_COLORSPACE_DEFAULT, ++ .field = V4L2_FIELD_NONE } ++ }; ++ ++ err = v4l2l_fill_format(&_fmt, _min_width, _max_width, _min_height, ++ _max_height); ++ if (err) ++ /* highly unexpected failure to assign default format */ ++ goto out_unregister; ++ dev->pix_format = _fmt.fmt.pix; ++ init_capture_param(&dev->capture_param); ++ 
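As a readability aid (illustrative only, not part of the patch), the DEFAULT_FROM_CONF() helper used above expands, for the max_openers call site, to roughly the following: a per-device value from the passed-in config wins unless it fails the "default_condition", in which case the module parameter is used.

/* Illustrative expansion of DEFAULT_FROM_CONF(max_openers, <= 0, max_openers): */
_max_openers = conf ? (conf->max_openers <= 0 ? max_openers
                                              : conf->max_openers)
                    : max_openers;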
set_timeperframe(dev, &dev->capture_param.timeperframe); ++ ++ /* ctrls parameters */ ++ dev->keep_format = 0; ++ dev->sustain_framerate = 0; ++ dev->timeout_jiffies = 0; ++ dev->timeout_image_io = 0; ++ ++ /* initialise OUTPUT and CAPTURE buffer values */ ++ dev->image = NULL; ++ dev->image_size = 0; ++ dev->buffer_count = _max_buffers; ++ dev->buffer_size = 0; ++ dev->used_buffer_count = 0; ++ INIT_LIST_HEAD(&dev->outbufs_list); ++ do { ++ u32 index; ++ for (index = 0; index < dev->buffer_count; ++index) ++ INIT_LIST_HEAD(&dev->buffers[index].list_head); ++ ++ } while (0); ++ memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index)); ++ dev->write_position = 0; ++ ++ /* initialise synchronisation data */ ++ atomic_set(&dev->open_count, 0); ++ mutex_init(&dev->image_mutex); ++ spin_lock_init(&dev->lock); ++ spin_lock_init(&dev->list_lock); ++ init_waitqueue_head(&dev->read_event); ++ dev->format_tokens = V4L2L_TOKEN_MASK; ++ dev->stream_tokens = V4L2L_TOKEN_MASK; ++ ++ /* initialise sustain frame rate and timeout parameters, and timers */ ++ dev->reread_count = 0; ++ dev->timeout_image = NULL; ++ dev->timeout_happened = 0; ++#ifdef HAVE_TIMER_SETUP ++ timer_setup(&dev->sustain_timer, sustain_timer_clb, 0); ++ timer_setup(&dev->timeout_timer, timeout_timer_clb, 0); ++#else ++ setup_timer(&dev->sustain_timer, sustain_timer_clb, nr); ++ setup_timer(&dev->timeout_timer, timeout_timer_clb, nr); ++#endif ++ ++ /* initialise the control handler and add controls */ ++ MARK(); ++ hdl = &dev->ctrl_handler; ++ err = v4l2_ctrl_handler_init(hdl, 4); ++ if (err) ++ goto out_unregister; ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL); ++ if (hdl->error) { ++ err = hdl->error; ++ goto out_free_handler; ++ } ++ dev->v4l2_dev.ctrl_handler = hdl; ++ ++ err = v4l2_ctrl_handler_setup(hdl); ++ if (err) ++ goto out_free_handler; ++ ++ /* register the device (creates /dev/video*) */ ++ MARK(); ++ if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) { ++ printk(KERN_ERR ++ "v4l2-loopback add() failed video_register_device()\n"); ++ err = -EFAULT; ++ goto out_free_device; ++ } ++ v4l2loopback_create_sysfs(dev->vdev); ++ /* NOTE: ambivalent if sysfs entries fail */ ++ ++ if (ret_nr) ++ *ret_nr = dev->vdev->num; ++ return 0; ++ ++out_free_device: ++ video_device_release(dev->vdev); ++out_free_handler: ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++out_unregister: ++ video_set_drvdata(dev->vdev, NULL); ++ if (vdev_priv != NULL) ++ kfree(vdev_priv); ++ v4l2_device_unregister(&dev->v4l2_dev); ++out_free_idr: ++ idr_remove(&v4l2loopback_index_idr, nr); ++out_free_dev: ++ kfree(dev); ++ return err; ++} ++ ++static void v4l2_loopback_remove(struct v4l2_loopback_device *dev) ++{ ++ int device_nr = v4l2loopback_get_vdev_nr(dev->vdev); ++ mutex_lock(&dev->image_mutex); ++ free_buffers(dev); ++ free_timeout_buffer(dev); ++ mutex_unlock(&dev->image_mutex); ++ v4l2loopback_remove_sysfs(dev->vdev); ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++ kfree(video_get_drvdata(dev->vdev)); ++ video_unregister_device(dev->vdev); ++ v4l2_device_unregister(&dev->v4l2_dev); ++ idr_remove(&v4l2loopback_index_idr, device_nr); ++ kfree(dev); ++} ++ ++static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd, ++ unsigned long parm) ++{ ++ struct v4l2_loopback_device *dev; ++ struct 
v4l2_loopback_config conf; ++ struct v4l2_loopback_config *confptr = &conf; ++ int device_nr, capture_nr, output_nr; ++ int ret; ++ const __u32 version = V4L2LOOPBACK_VERSION_CODE; ++ ++ ret = mutex_lock_killable(&v4l2loopback_ctl_mutex); ++ if (ret) ++ return ret; ++ ++ ret = -EINVAL; ++ switch (cmd) { ++ default: ++ ret = -ENOSYS; ++ break; ++ /* add a v4l2loopback device (pair), based on the user-provided specs */ ++ case V4L2LOOPBACK_CTL_ADD: ++ case V4L2LOOPBACK_CTL_ADD_legacy: ++ if (parm) { ++ if ((ret = copy_from_user(&conf, (void *)parm, ++ sizeof(conf))) < 0) ++ break; ++ } else ++ confptr = NULL; ++ ret = v4l2_loopback_add(confptr, &device_nr); ++ if (ret >= 0) ++ ret = device_nr; ++ break; ++ /* remove a v4l2loopback device (both capture and output) */ ++ case V4L2LOOPBACK_CTL_REMOVE: ++ case V4L2LOOPBACK_CTL_REMOVE_legacy: ++ ret = v4l2loopback_lookup((__u32)parm, &dev); ++ if (ret >= 0 && dev) { ++ ret = -EBUSY; ++ if (dev->open_count.counter > 0) ++ break; ++ v4l2_loopback_remove(dev); ++ ret = 0; ++ }; ++ break; ++ /* get information for a loopback device. ++ * this is mostly about limits (which cannot be queried directly with VIDIOC_G_FMT and friends ++ */ ++ case V4L2LOOPBACK_CTL_QUERY: ++ case V4L2LOOPBACK_CTL_QUERY_legacy: ++ if (!parm) ++ break; ++ if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) < ++ 0) ++ break; ++ capture_nr = output_nr = conf.output_nr; ++#ifdef SPLIT_DEVICES ++ capture_nr = conf.capture_nr; ++#endif ++ device_nr = (output_nr < 0) ? capture_nr : output_nr; ++ MARK(); ++ /* get the device from either capture_nr or output_nr (whatever is valid) */ ++ if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0) ++ break; ++ MARK(); ++ /* if we got the device from output_nr and there is a valid capture_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != capture_nr) && (capture_nr >= 0) && ++ ((ret = v4l2loopback_lookup(capture_nr, 0)) < 0)) ++ break; ++ MARK(); ++ /* if otoh, we got the device from capture_nr and there is a valid output_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != output_nr) && (output_nr >= 0) && ++ ((ret = v4l2loopback_lookup(output_nr, 0)) < 0)) ++ break; ++ ++ /* v4l2_loopback_config identified a single device, so fetch the data */ ++ snprintf(conf.card_label, sizeof(conf.card_label), "%s", ++ dev->card_label); ++ ++ conf.output_nr = dev->vdev->num; ++#ifdef SPLIT_DEVICES ++ conf.capture_nr = dev->vdev->num; ++#endif ++ conf.min_width = dev->min_width; ++ conf.min_height = dev->min_height; ++ conf.max_width = dev->max_width; ++ conf.max_height = dev->max_height; ++ conf.announce_all_caps = dev->announce_all_caps; ++ conf.max_buffers = dev->buffer_count; ++ conf.max_openers = dev->max_openers; ++ conf.debug = debug; ++ MARK(); ++ if (copy_to_user((void *)parm, &conf, sizeof(conf))) { ++ ret = -EFAULT; ++ break; ++ } ++ ret = 0; ++ break; ++ case V4L2LOOPBACK_CTL_VERSION: ++ if (!parm) ++ break; ++ if (copy_to_user((void *)parm, &version, sizeof(version))) { ++ ret = -EFAULT; ++ break; ++ } ++ ret = 0; ++ break; ++ } ++ ++ mutex_unlock(&v4l2loopback_ctl_mutex); ++ MARK(); ++ return ret; ++} ++ ++/* LINUX KERNEL */ ++ ++static const struct file_operations v4l2loopback_ctl_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = nonseekable_open, ++ .unlocked_ioctl = v4l2loopback_control_ioctl, ++ .compat_ioctl = v4l2loopback_control_ioctl, ++ .llseek = noop_llseek, ++ // clang-format on ++}; ++ ++static struct miscdevice 
v4l2loopback_misc = { ++ // clang-format off ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "v4l2loopback", ++ .fops = &v4l2loopback_ctl_fops, ++ // clang-format on ++}; ++ ++static const struct v4l2_file_operations v4l2_loopback_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = v4l2_loopback_open, ++ .release = v4l2_loopback_close, ++ .read = v4l2_loopback_read, ++ .write = v4l2_loopback_write, ++ .poll = v4l2_loopback_poll, ++ .mmap = v4l2_loopback_mmap, ++ .unlocked_ioctl = video_ioctl2, ++ // clang-format on ++}; ++ ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = { ++ // clang-format off ++ .vidioc_querycap = &vidioc_querycap, ++ .vidioc_enum_framesizes = &vidioc_enum_framesizes, ++ .vidioc_enum_frameintervals = &vidioc_enum_frameintervals, ++ ++ .vidioc_enum_output = &vidioc_enum_output, ++ .vidioc_g_output = &vidioc_g_output, ++ .vidioc_s_output = &vidioc_s_output, ++ ++ .vidioc_enum_input = &vidioc_enum_input, ++ .vidioc_g_input = &vidioc_g_input, ++ .vidioc_s_input = &vidioc_s_input, ++ ++ .vidioc_enum_fmt_vid_cap = &vidioc_enum_fmt_cap, ++ .vidioc_g_fmt_vid_cap = &vidioc_g_fmt_cap, ++ .vidioc_s_fmt_vid_cap = &vidioc_s_fmt_cap, ++ .vidioc_try_fmt_vid_cap = &vidioc_try_fmt_cap, ++ ++ .vidioc_enum_fmt_vid_out = &vidioc_enum_fmt_out, ++ .vidioc_s_fmt_vid_out = &vidioc_s_fmt_out, ++ .vidioc_g_fmt_vid_out = &vidioc_g_fmt_out, ++ .vidioc_try_fmt_vid_out = &vidioc_try_fmt_out, ++ ++#ifdef V4L2L_OVERLAY ++ .vidioc_s_fmt_vid_overlay = &vidioc_s_fmt_overlay, ++ .vidioc_g_fmt_vid_overlay = &vidioc_g_fmt_overlay, ++#endif ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ .vidioc_s_std = &vidioc_s_std, ++ .vidioc_g_std = &vidioc_g_std, ++ .vidioc_querystd = &vidioc_querystd, ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ .vidioc_g_parm = &vidioc_g_parm, ++ .vidioc_s_parm = &vidioc_s_parm, ++ ++ .vidioc_reqbufs = &vidioc_reqbufs, ++ .vidioc_querybuf = &vidioc_querybuf, ++ .vidioc_qbuf = &vidioc_qbuf, ++ .vidioc_dqbuf = &vidioc_dqbuf, ++ ++ .vidioc_streamon = &vidioc_streamon, ++ .vidioc_streamoff = &vidioc_streamoff, ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++ .vidiocgmbuf = &vidiocgmbuf, ++#endif ++ ++ .vidioc_subscribe_event = &vidioc_subscribe_event, ++ .vidioc_unsubscribe_event = &v4l2_event_unsubscribe, ++ // clang-format on ++}; ++ ++static int free_device_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *dev = ptr; ++ v4l2_loopback_remove(dev); ++ return 0; ++} ++static void free_devices(void) ++{ ++ idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL); ++ idr_destroy(&v4l2loopback_index_idr); ++} ++ ++static int __init v4l2loopback_init_module(void) ++{ ++ const u32 min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH; ++ const u32 min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT; ++ int err; ++ int i; ++ MARK(); ++ ++ err = misc_register(&v4l2loopback_misc); ++ if (err < 0) ++ return err; ++ ++ if (devices < 0) { ++ devices = 1; ++ ++ /* try guessing the devices from the "video_nr" parameter */ ++ for (i = MAX_DEVICES - 1; i >= 0; i--) { ++ if (video_nr[i] >= 0) { ++ devices = i + 1; ++ break; ++ } ++ } ++ } ++ ++ if (devices > MAX_DEVICES) { ++ devices = MAX_DEVICES; ++ printk(KERN_INFO ++ "v4l2-loopback init() number of initial devices is " ++ "limited to: %d\n", ++ MAX_DEVICES); ++ } ++ ++ if (max_buffers > MAX_BUFFERS) { ++ max_buffers = MAX_BUFFERS; ++ printk(KERN_INFO ++ "v4l2-loopback init() number of buffers is limited " ++ "to: %d\n", ++ MAX_BUFFERS); ++ } ++ ++ if (max_openers < 0) { ++ printk(KERN_INFO ++ "v4l2-loopback init() allowing %d openers rather " ++ 
"than %d\n", ++ 2, max_openers); ++ max_openers = 2; ++ } ++ ++ if (max_width < min_width) { ++ max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++ printk(KERN_INFO "v4l2-loopback init() using max_width %d\n", ++ max_width); ++ } ++ if (max_height < min_height) { ++ max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++ printk(KERN_INFO "v4l2-loopback init() using max_height %d\n", ++ max_height); ++ } ++ ++ for (i = 0; i < devices; i++) { ++ struct v4l2_loopback_config cfg = { ++ // clang-format off ++ .output_nr = video_nr[i], ++#ifdef SPLIT_DEVICES ++ .capture_nr = video_nr[i], ++#endif ++ .min_width = min_width, ++ .min_height = min_height, ++ .max_width = max_width, ++ .max_height = max_height, ++ .announce_all_caps = (!exclusive_caps[i]), ++ .max_buffers = max_buffers, ++ .max_openers = max_openers, ++ .debug = debug, ++ // clang-format on ++ }; ++ cfg.card_label[0] = 0; ++ if (card_label[i]) ++ snprintf(cfg.card_label, sizeof(cfg.card_label), "%s", ++ card_label[i]); ++ err = v4l2_loopback_add(&cfg, 0); ++ if (err) { ++ free_devices(); ++ goto error; ++ } ++ } ++ ++ dprintk("module installed\n"); ++ ++ printk(KERN_INFO "v4l2-loopback driver version %d.%d.%d%s loaded\n", ++ // clang-format off ++ (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE >> 8) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE ) & 0xff, ++#ifdef SNAPSHOT_VERSION ++ " (" __stringify(SNAPSHOT_VERSION) ")" ++#else ++ "" ++#endif ++ ); ++ // clang-format on ++ ++ return 0; ++error: ++ misc_deregister(&v4l2loopback_misc); ++ return err; ++} ++ ++static void v4l2loopback_cleanup_module(void) ++{ ++ MARK(); ++ /* unregister the device -> it deletes /dev/video* */ ++ free_devices(); ++ /* and get rid of /dev/v4l2loopback */ ++ misc_deregister(&v4l2loopback_misc); ++ dprintk("module removed\n"); ++} ++ ++MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR); ++ ++module_init(v4l2loopback_init_module); ++module_exit(v4l2loopback_cleanup_module); +diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h +new file mode 100644 +index 000000000000..e48e0ce5949d +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.h +@@ -0,0 +1,108 @@ ++/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ ++/* ++ * v4l2loopback.h ++ * ++ * Written by IOhannes m zmölnig, 7/1/20. ++ * ++ * Copyright 2020 by IOhannes m zmölnig. Redistribution of this file is ++ * permitted under the GNU General Public License. ++ */ ++#ifndef _V4L2LOOPBACK_H ++#define _V4L2LOOPBACK_H ++ ++#define V4L2LOOPBACK_VERSION_MAJOR 0 ++#define V4L2LOOPBACK_VERSION_MINOR 15 ++#define V4L2LOOPBACK_VERSION_BUGFIX 0 ++ ++/* /dev/v4l2loopback interface */ ++ ++struct v4l2_loopback_config { ++ /** ++ * the device-number (/dev/video) ++ * V4L2LOOPBACK_CTL_ADD: ++ * setting this to a value<0, will allocate an available one ++ * if nr>=0 and the device already exists, the ioctl will EEXIST ++ * if output_nr and capture_nr are the same, only a single device will be created ++ * NOTE: currently split-devices (where output_nr and capture_nr differ) ++ * are not implemented yet. ++ * until then, requesting different device-IDs will result in EINVAL. 
++ * ++ * V4L2LOOPBACK_CTL_QUERY: ++ * either both output_nr and capture_nr must refer to the same loopback, ++ * or one (and only one) of them must be -1 ++ * ++ */ ++ __s32 output_nr; ++ __s32 unused; /*capture_nr;*/ ++ ++ /** ++ * a nice name for your device ++ * if (*card_label)==0, an automatic name is assigned ++ */ ++ char card_label[32]; ++ ++ /** ++ * allowed frame size ++ * if too low, default values are used ++ */ ++ __u32 min_width; ++ __u32 max_width; ++ __u32 min_height; ++ __u32 max_height; ++ ++ /** ++ * number of buffers to allocate for the queue ++ * if set to <=0, default values are used ++ */ ++ __s32 max_buffers; ++ ++ /** ++ * how many consumers are allowed to open this device concurrently ++ * if set to <=0, default values are used ++ */ ++ __s32 max_openers; ++ ++ /** ++ * set the debugging level for this device ++ */ ++ __s32 debug; ++ ++ /** ++ * whether to announce OUTPUT/CAPTURE capabilities exclusively ++ * for this device or not ++ * (!exclusive_caps) ++ * NOTE: this is going to be removed once separate output/capture ++ * devices are implemented ++ */ ++ __s32 announce_all_caps; ++}; ++ ++#define V4L2LOOPBACK_CTL_IOCTLMAGIC '~' ++ ++/* a pointer to an (unsigned int) that - on success - will hold ++ * the version code of the v4l2loopback module ++ * as returned by KERNEL_VERSION(MAJOR, MINOR, BUGFIX) ++ */ ++#define V4L2LOOPBACK_CTL_VERSION _IOR(V4L2LOOPBACK_CTL_IOCTLMAGIC, 0, __u32) ++ ++/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the ++ * to-be-created device set. ++ * if the ptr is NULL, a new device is created with default values at the driver's discretion. ++ * ++ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY, ++ * to get more information on the device) ++ */ ++#define V4L2LOOPBACK_CTL_ADD \ ++ _IOW(V4L2LOOPBACK_CTL_IOCTLMAGIC, 1, struct v4l2_loopback_config) ++ ++/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */ ++#define V4L2LOOPBACK_CTL_REMOVE _IOW(V4L2LOOPBACK_CTL_IOCTLMAGIC, 2, __u32) ++ ++/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set ++ * (the two values must either refer to video-devices associated with the same loopback device ++ * or exactly one of them must be <0 ++ */ ++#define V4L2LOOPBACK_CTL_QUERY \ ++ _IOWR(V4L2LOOPBACK_CTL_IOCTLMAGIC, 3, struct v4l2_loopback_config) ++ ++#endif /* _V4L2LOOPBACK_H */ +diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h +new file mode 100644 +index 000000000000..d855a3796554 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback_formats.h +@@ -0,0 +1,445 @@ ++static const struct v4l2l_format formats[] = { ++#ifndef V4L2_PIX_FMT_VP9 ++#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0') ++#endif ++#ifndef V4L2_PIX_FMT_HEVC ++#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C') ++#endif ++ ++ /* here come the packed formats */ ++ { ++ .name = "32 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "32 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR24, ++ .depth = 24, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB24, ++ .depth = 24, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_ABGR32 ++ { ++ .name = "32 bpp RGBA, le", ++ .fourcc = V4L2_PIX_FMT_ABGR32, ++ .depth = 32, ++ .flags = 0, ++ }, 
++#endif ++#ifdef V4L2_PIX_FMT_RGBA32 ++ { ++ .name = "32 bpp RGBA", ++ .fourcc = V4L2_PIX_FMT_RGBA32, ++ .depth = 32, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_RGB332 ++ { ++ .name = "8 bpp RGB-3-3-2", ++ .fourcc = V4L2_PIX_FMT_RGB332, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB332 */ ++#ifdef V4L2_PIX_FMT_RGB444 ++ { ++ .name = "16 bpp RGB (xxxxrrrr ggggbbbb)", ++ .fourcc = V4L2_PIX_FMT_RGB444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB444 */ ++#ifdef V4L2_PIX_FMT_RGB555 ++ { ++ .name = "16 bpp RGB-5-5-5", ++ .fourcc = V4L2_PIX_FMT_RGB555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555 */ ++#ifdef V4L2_PIX_FMT_RGB565 ++ { ++ .name = "16 bpp RGB-5-6-5", ++ .fourcc = V4L2_PIX_FMT_RGB565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565 */ ++#ifdef V4L2_PIX_FMT_RGB555X ++ { ++ .name = "16 bpp RGB-5-5-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB555X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555X */ ++#ifdef V4L2_PIX_FMT_RGB565X ++ { ++ .name = "16 bpp RGB-5-6-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB565X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565X */ ++#ifdef V4L2_PIX_FMT_BGR666 ++ { ++ .name = "18 bpp BGR-6-6-6", ++ .fourcc = V4L2_PIX_FMT_BGR666, ++ .depth = 18, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_BGR666 */ ++ { ++ .name = "4:2:2, packed, YUYV", ++ .fourcc = V4L2_PIX_FMT_YUYV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "4:2:2, packed, UYVY", ++ .fourcc = V4L2_PIX_FMT_UYVY, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YVYU ++ { ++ .name = "4:2:2, packed YVYU", ++ .fourcc = V4L2_PIX_FMT_YVYU, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_VYUY ++ { ++ .name = "4:2:2, packed VYUY", ++ .fourcc = V4L2_PIX_FMT_VYUY, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif ++ { ++ .name = "4:2:2, packed YYUV", ++ .fourcc = V4L2_PIX_FMT_YYUV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "YUV-8-8-8-8", ++ .fourcc = V4L2_PIX_FMT_YUV32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "8 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_GREY, ++ .depth = 8, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_Y4 ++ { ++ .name = "4 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y4, ++ .depth = 4, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y4 */ ++#ifdef V4L2_PIX_FMT_Y6 ++ { ++ .name = "6 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y6, ++ .depth = 6, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y6 */ ++#ifdef V4L2_PIX_FMT_Y10 ++ { ++ .name = "10 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y10, ++ .depth = 10, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y10 */ ++#ifdef V4L2_PIX_FMT_Y12 ++ { ++ .name = "12 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y12, ++ .depth = 12, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y12 */ ++ { ++ .name = "16 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y16, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YUV444 ++ { ++ .name = "16 bpp xxxxyyyy uuuuvvvv", ++ .fourcc = V4L2_PIX_FMT_YUV444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV444 */ ++#ifdef V4L2_PIX_FMT_YUV555 ++ { ++ .name = "16 bpp YUV-5-5-5", ++ .fourcc = V4L2_PIX_FMT_YUV555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV555 */ ++#ifdef V4L2_PIX_FMT_YUV565 ++ { ++ .name = "16 bpp YUV-5-6-5", ++ .fourcc = V4L2_PIX_FMT_YUV565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV565 */ ++ ++/* bayer formats */ ++#ifdef V4L2_PIX_FMT_SRGGB8 ++ { ++ .name = "Bayer RGGB 8bit", ++ .fourcc = 
V4L2_PIX_FMT_SRGGB8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SRGGB8 */ ++#ifdef V4L2_PIX_FMT_SGRBG8 ++ { ++ .name = "Bayer GRBG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGRBG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGRBG8 */ ++#ifdef V4L2_PIX_FMT_SGBRG8 ++ { ++ .name = "Bayer GBRG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGBRG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGBRG8 */ ++#ifdef V4L2_PIX_FMT_SBGGR8 ++ { ++ .name = "Bayer BA81 8bit", ++ .fourcc = V4L2_PIX_FMT_SBGGR8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SBGGR8 */ ++ ++ /* here come the planar formats */ ++ { ++ .name = "4:1:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:1:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#ifdef V4L2_PIX_FMT_YUV422P ++ { ++ .name = "16 bpp YVU422 planar", ++ .fourcc = V4L2_PIX_FMT_YUV422P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV422P */ ++#ifdef V4L2_PIX_FMT_YUV411P ++ { ++ .name = "16 bpp YVU411 planar", ++ .fourcc = V4L2_PIX_FMT_YUV411P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV411P */ ++#ifdef V4L2_PIX_FMT_Y41P ++ { ++ .name = "12 bpp YUV 4:1:1", ++ .fourcc = V4L2_PIX_FMT_Y41P, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_Y41P */ ++#ifdef V4L2_PIX_FMT_NV12 ++ { ++ .name = "12 bpp Y/CbCr 4:2:0 ", ++ .fourcc = V4L2_PIX_FMT_NV12, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_NV12 */ ++ ++/* here come the compressed formats */ ++ ++#ifdef V4L2_PIX_FMT_MJPEG ++ { ++ .name = "Motion-JPEG", ++ .fourcc = V4L2_PIX_FMT_MJPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MJPEG */ ++#ifdef V4L2_PIX_FMT_JPEG ++ { ++ .name = "JFIF JPEG", ++ .fourcc = V4L2_PIX_FMT_JPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_JPEG */ ++#ifdef V4L2_PIX_FMT_DV ++ { ++ .name = "DV1394", ++ .fourcc = V4L2_PIX_FMT_DV, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_DV */ ++#ifdef V4L2_PIX_FMT_MPEG ++ { ++ .name = "MPEG-1/2/4 Multiplexed", ++ .fourcc = V4L2_PIX_FMT_MPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG */ ++#ifdef V4L2_PIX_FMT_H264 ++ { ++ .name = "H264 with start codes", ++ .fourcc = V4L2_PIX_FMT_H264, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264 */ ++#ifdef V4L2_PIX_FMT_H264_NO_SC ++ { ++ .name = "H264 without start codes", ++ .fourcc = V4L2_PIX_FMT_H264_NO_SC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_NO_SC */ ++#ifdef V4L2_PIX_FMT_H264_MVC ++ { ++ .name = "H264 MVC", ++ .fourcc = V4L2_PIX_FMT_H264_MVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_MVC */ ++#ifdef V4L2_PIX_FMT_H263 ++ { ++ .name = "H263", ++ .fourcc = V4L2_PIX_FMT_H263, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H263 */ ++#ifdef V4L2_PIX_FMT_MPEG1 ++ { ++ .name = "MPEG-1 ES", ++ .fourcc = 
V4L2_PIX_FMT_MPEG1, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG1 */ ++#ifdef V4L2_PIX_FMT_MPEG2 ++ { ++ .name = "MPEG-2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG2, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG2 */ ++#ifdef V4L2_PIX_FMT_MPEG4 ++ { ++ .name = "MPEG-4 part 2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG4, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG4 */ ++#ifdef V4L2_PIX_FMT_XVID ++ { ++ .name = "Xvid", ++ .fourcc = V4L2_PIX_FMT_XVID, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_XVID */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_G ++ { ++ .name = "SMPTE 421M Annex G compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_L ++ { ++ .name = "SMPTE 421M Annex L compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */ ++#ifdef V4L2_PIX_FMT_VP8 ++ { ++ .name = "VP8", ++ .fourcc = V4L2_PIX_FMT_VP8, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP8 */ ++#ifdef V4L2_PIX_FMT_VP9 ++ { ++ .name = "VP9", ++ .fourcc = V4L2_PIX_FMT_VP9, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP9 */ ++#ifdef V4L2_PIX_FMT_HEVC ++ { ++ .name = "HEVC", ++ .fourcc = V4L2_PIX_FMT_HEVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_HEVC */ ++}; +diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile +index 038ccbd9e3ba..de5e4f5145af 100644 +--- a/drivers/pci/controller/Makefile ++++ b/drivers/pci/controller/Makefile +@@ -1,4 +1,10 @@ + # SPDX-License-Identifier: GPL-2.0 ++ifdef CONFIG_X86_64 ++ifdef CONFIG_SATA_AHCI ++obj-y += intel-nvme-remap.o ++endif ++endif ++ + obj-$(CONFIG_PCIE_CADENCE) += cadence/ + obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o + obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o +diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c +new file mode 100644 +index 000000000000..e105e6f5cc91 +--- /dev/null ++++ b/drivers/pci/controller/intel-nvme-remap.c +@@ -0,0 +1,462 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Intel remapped NVMe device support. ++ * ++ * Copyright (c) 2019 Endless Mobile, Inc. ++ * Author: Daniel Drake ++ * ++ * Some products ship by default with the SATA controller in "RAID" or ++ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this ++ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe ++ * devices disappear from the PCI bus, and instead their I/O memory becomes ++ * available within the AHCI device BARs. ++ * ++ * This scheme is understood to be a way of avoiding usage of the standard ++ * Windows NVMe driver under that OS, instead mandating usage of Intel's ++ * driver instead, which has better power management, and presumably offers ++ * some RAID/disk-caching solutions too. ++ * ++ * Here in this driver, we support the remapped NVMe mode by claiming the ++ * AHCI device and creating a fake PCIe root port. On the new bus, the ++ * original AHCI device is exposed with only minor tweaks. Then, fake PCI ++ * devices corresponding to the remapped NVMe devices are created. The usual ++ * ahci and nvme drivers are then expected to bind to these devices and ++ * operate as normal. 
++ * ++ * The PCI configuration space for the NVMe devices is completely ++ * unavailable, so we fake a minimal one and hope for the best. ++ * ++ * Interrupts are shared between the AHCI and NVMe devices. For simplicity, ++ * we only support the legacy interrupt here, although MSI support ++ * could potentially be added later. ++ */ ++ ++#define MODULE_NAME "intel-nvme-remap" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define AHCI_PCI_BAR_STANDARD 5 ++ ++struct nvme_remap_dev { ++ struct pci_dev *dev; /* AHCI device */ ++ struct pci_bus *bus; /* our fake PCI bus */ ++ struct pci_sysdata sysdata; ++ int irq_base; /* our fake interrupts */ ++ ++ /* ++ * When we detect an all-ones write to a BAR register, this flag ++ * is set, so that we return the BAR size on the next read (a ++ * standard PCI behaviour). ++ * This includes the assumption that an all-ones BAR write is ++ * immediately followed by a read of the same register. ++ */ ++ bool bar_sizing; ++ ++ /* ++ * Resources copied from the AHCI device, to be regarded as ++ * resources on our fake bus. ++ */ ++ struct resource ahci_resources[PCI_NUM_RESOURCES]; ++ ++ /* Resources corresponding to the NVMe devices. */ ++ struct resource remapped_dev_mem[AHCI_MAX_REMAP]; ++ ++ /* Number of remapped NVMe devices found. */ ++ int num_remapped_devices; ++}; ++ ++static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus) ++{ ++ return container_of(bus->sysdata, struct nvme_remap_dev, sysdata); ++} ++ ++ ++/******** PCI configuration space **********/ ++ ++/* ++ * Helper macros for tweaking returned contents of PCI configuration space. ++ * ++ * value contains len bytes of data read from reg. ++ * If fixup_reg is included in that range, fix up the contents of that ++ * register to fixed_value. ++ */ ++#define NR_FIX8(fixup_reg, fixed_value) do { \ ++ if (reg <= fixup_reg && fixup_reg < reg + len) \ ++ ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \ ++ } while (0) ++ ++#define NR_FIX16(fixup_reg, fixed_value) do { \ ++ NR_FIX8(fixup_reg, fixed_value); \ ++ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ ++ } while (0) ++ ++#define NR_FIX24(fixup_reg, fixed_value) do { \ ++ NR_FIX8(fixup_reg, fixed_value); \ ++ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ ++ NR_FIX8(fixup_reg + 2, fixed_value >> 16); \ ++ } while (0) ++ ++#define NR_FIX32(fixup_reg, fixed_value) do { \ ++ NR_FIX16(fixup_reg, (u16) fixed_value); \ ++ NR_FIX16(fixup_reg + 2, fixed_value >> 16); \ ++ } while (0) ++ ++/* ++ * Read PCI config space of the slot 0 (AHCI) device. ++ * We pass through the read request to the underlying device, but ++ * tweak the results in some cases. ++ */ ++static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg, ++ int len, u32 *value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct pci_bus *ahci_dev_bus = nrdev->dev->bus; ++ int ret; ++ ++ ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn, ++ reg, len, value); ++ if (ret) ++ return ret; ++ ++ /* ++ * Adjust the device class, to prevent this driver from attempting to ++ * additionally probe the device we're simulating here. ++ */ ++ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI); ++ ++ /* ++ * Unset interrupt pin, otherwise ACPI tries to find routing ++ * info for our virtual IRQ, fails, and complains. ++ */ ++ NR_FIX8(PCI_INTERRUPT_PIN, 0); ++ ++ /* ++ * Truncate the AHCI BAR to not include the region that covers the ++ * hidden devices. 
This will cause the ahci driver to successfully ++ * probe th new device (instead of handing it over to this driver). ++ */ ++ if (nrdev->bar_sizing) { ++ NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1)); ++ nrdev->bar_sizing = false; ++ } ++ ++ return PCIBIOS_SUCCESSFUL; ++} ++ ++/* ++ * Read PCI config space of a remapped device. ++ * Since the original PCI config space is inaccessible, we provide a minimal, ++ * fake config space instead. ++ */ ++static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port, ++ int reg, int len, u32 *value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct resource *remapped_mem; ++ ++ if (port > nrdev->num_remapped_devices) ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ *value = 0; ++ remapped_mem = &nrdev->remapped_dev_mem[port - 1]; ++ ++ /* Set a Vendor ID, otherwise Linux assumes no device is present */ ++ NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL); ++ ++ /* Always appear on & bus mastering */ ++ NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); ++ ++ /* Set class so that nvme driver probes us */ ++ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS); ++ ++ if (nrdev->bar_sizing) { ++ NR_FIX32(PCI_BASE_ADDRESS_0, ++ ~(resource_size(remapped_mem) - 1)); ++ nrdev->bar_sizing = false; ++ } else { ++ resource_size_t mem_start = remapped_mem->start; ++ ++ mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64; ++ NR_FIX32(PCI_BASE_ADDRESS_0, mem_start); ++ mem_start >>= 32; ++ NR_FIX32(PCI_BASE_ADDRESS_1, mem_start); ++ } ++ ++ return PCIBIOS_SUCCESSFUL; ++} ++ ++/* Read PCI configuration space. */ ++static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn, ++ int reg, int len, u32 *value) ++{ ++ if (PCI_SLOT(devfn) == 0) ++ return nvme_remap_pci_read_slot0(bus, reg, len, value); ++ else ++ return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn), ++ reg, len, value); ++} ++ ++/* ++ * Write PCI config space of the slot 0 (AHCI) device. ++ * Apart from the special case of BAR sizing, we disable all writes. ++ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master) ++ * that would affect the operation of the NVMe devices. ++ */ ++static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg, ++ int len, u32 value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct pci_bus *ahci_dev_bus = nrdev->dev->bus; ++ ++ if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) { ++ /* ++ * Writing all-ones to a BAR means that the size of the ++ * memory region is being checked. Flag this so that we can ++ * reply with an appropriate size on the next read. ++ */ ++ if (value == ~0) ++ nrdev->bar_sizing = true; ++ ++ return ahci_dev_bus->ops->write(ahci_dev_bus, ++ nrdev->dev->devfn, ++ reg, len, value); ++ } ++ ++ return PCIBIOS_SET_FAILED; ++} ++ ++/* ++ * Write PCI config space of a remapped device. ++ * Since the original PCI config space is inaccessible, we reject all ++ * writes, except for the special case of BAR probing. ++ */ ++static int nvme_remap_pci_write_remapped(struct pci_bus *bus, ++ unsigned int port, ++ int reg, int len, u32 value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ ++ if (port > nrdev->num_remapped_devices) ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ /* ++ * Writing all-ones to a BAR means that the size of the memory ++ * region is being checked. Flag this so that we can reply with ++ * an appropriate size on the next read. 
++ */ ++ if (value == ~0 && reg >= PCI_BASE_ADDRESS_0 ++ && reg <= PCI_BASE_ADDRESS_5) { ++ nrdev->bar_sizing = true; ++ return PCIBIOS_SUCCESSFUL; ++ } ++ ++ return PCIBIOS_SET_FAILED; ++} ++ ++/* Write PCI configuration space. */ ++static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn, ++ int reg, int len, u32 value) ++{ ++ if (PCI_SLOT(devfn) == 0) ++ return nvme_remap_pci_write_slot0(bus, reg, len, value); ++ else ++ return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn), ++ reg, len, value); ++} ++ ++static struct pci_ops nvme_remap_pci_ops = { ++ .read = nvme_remap_pci_read, ++ .write = nvme_remap_pci_write, ++}; ++ ++ ++/******** Initialization & exit **********/ ++ ++/* ++ * Find a PCI domain ID to use for our fake bus. ++ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits). ++ */ ++static int find_free_domain(void) ++{ ++ int domain = 0xffff; ++ struct pci_bus *bus = NULL; ++ ++ while ((bus = pci_find_next_bus(bus)) != NULL) ++ domain = max_t(int, domain, pci_domain_nr(bus)); ++ ++ return domain + 1; ++} ++ ++static int find_remapped_devices(struct nvme_remap_dev *nrdev, ++ struct list_head *resources) ++{ ++ void __iomem *mmio; ++ int i, count = 0; ++ u32 cap; ++ ++ mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD, ++ pci_resource_len(nrdev->dev, ++ AHCI_PCI_BAR_STANDARD)); ++ if (!mmio) ++ return -ENODEV; ++ ++ /* Check if this device might have remapped nvme devices. */ ++ if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K || ++ !(readl(mmio + AHCI_VSCAP) & 1)) ++ return -ENODEV; ++ ++ cap = readq(mmio + AHCI_REMAP_CAP); ++ for (i = AHCI_MAX_REMAP-1; i >= 0; i--) { ++ struct resource *remapped_mem; ++ ++ if ((cap & (1 << i)) == 0) ++ continue; ++ if (readl(mmio + ahci_remap_dcc(i)) ++ != PCI_CLASS_STORAGE_EXPRESS) ++ continue; ++ ++ /* We've found a remapped device */ ++ remapped_mem = &nrdev->remapped_dev_mem[count++]; ++ remapped_mem->start = ++ pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD) ++ + ahci_remap_base(i); ++ remapped_mem->end = remapped_mem->start ++ + AHCI_REMAP_N_SIZE - 1; ++ remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED; ++ pci_add_resource(resources, remapped_mem); ++ } ++ ++ pcim_iounmap(nrdev->dev, mmio); ++ ++ if (count == 0) ++ return -ENODEV; ++ ++ nrdev->num_remapped_devices = count; ++ dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n", ++ nrdev->num_remapped_devices); ++ return 0; ++} ++ ++static void nvme_remap_remove_root_bus(void *data) ++{ ++ struct pci_bus *bus = data; ++ ++ pci_stop_root_bus(bus); ++ pci_remove_root_bus(bus); ++} ++ ++static int nvme_remap_probe(struct pci_dev *dev, ++ const struct pci_device_id *id) ++{ ++ struct nvme_remap_dev *nrdev; ++ LIST_HEAD(resources); ++ int i; ++ int ret; ++ struct pci_dev *child; ++ ++ nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL); ++ nrdev->sysdata.domain = find_free_domain(); ++ nrdev->sysdata.nvme_remap_dev = dev; ++ nrdev->dev = dev; ++ pci_set_drvdata(dev, nrdev); ++ ++ ret = pcim_enable_device(dev); ++ if (ret < 0) ++ return ret; ++ ++ pci_set_master(dev); ++ ++ ret = find_remapped_devices(nrdev, &resources); ++ if (ret) ++ return ret; ++ ++ /* Add resources from the original AHCI device */ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ struct resource *res = &dev->resource[i]; ++ ++ if (res->start) { ++ struct resource *nr_res = &nrdev->ahci_resources[i]; ++ ++ nr_res->start = res->start; ++ nr_res->end = res->end; ++ nr_res->flags = res->flags; ++ pci_add_resource(&resources, nr_res); ++ } ++ } ++ 
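For context (illustrative aside, not part of the patch): the all-ones handshake that the config-space read/write handlers above emulate is the standard PCI BAR-sizing sequence. A sketch with the usual kernel config accessors follows; pdev is a hypothetical struct pci_dev pointer.

/* Illustrative only: the BAR-sizing sequence the handlers above mimic. */
u32 val, size;
pci_write_config_dword(pdev, PCI_BASE_ADDRESS_0, ~0);   /* all-ones write */
pci_read_config_dword(pdev, PCI_BASE_ADDRESS_0, &val);  /* size mask comes back */
size = ~(val & PCI_BASE_ADDRESS_MEM_MASK) + 1;          /* decoded region size */
/* hence returning ~(resource_size(remapped_mem) - 1) on the sizing read makes
 * the PCI core see a BAR of exactly resource_size(remapped_mem). */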
++ /* Create virtual interrupts */ ++ nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0, ++ nrdev->num_remapped_devices + 1, ++ 0); ++ if (nrdev->irq_base < 0) ++ return nrdev->irq_base; ++ ++ /* Create and populate PCI bus */ ++ nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops, ++ &nrdev->sysdata, &resources); ++ if (!nrdev->bus) ++ return -ENODEV; ++ ++ if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus, ++ nrdev->bus)) ++ return -ENOMEM; ++ ++ /* We don't support sharing MSI interrupts between these devices */ ++ nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI; ++ ++ pci_scan_child_bus(nrdev->bus); ++ ++ list_for_each_entry(child, &nrdev->bus->devices, bus_list) { ++ /* ++ * Prevent PCI core from trying to move memory BARs around. ++ * The hidden NVMe devices are at fixed locations. ++ */ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ struct resource *res = &child->resource[i]; ++ ++ if (res->flags & IORESOURCE_MEM) ++ res->flags |= IORESOURCE_PCI_FIXED; ++ } ++ ++ /* Share the legacy IRQ between all devices */ ++ child->irq = dev->irq; ++ } ++ ++ pci_assign_unassigned_bus_resources(nrdev->bus); ++ pci_bus_add_devices(nrdev->bus); ++ ++ return 0; ++} ++ ++static const struct pci_device_id nvme_remap_ids[] = { ++ /* ++ * Match all Intel RAID controllers. ++ * ++ * There's overlap here with the set of devices detected by the ahci ++ * driver, but ahci will only successfully probe when there ++ * *aren't* any remapped NVMe devices, and this driver will only ++ * successfully probe when there *are* remapped NVMe devices that ++ * need handling. ++ */ ++ { ++ PCI_VDEVICE(INTEL, PCI_ANY_ID), ++ .class = PCI_CLASS_STORAGE_RAID << 8, ++ .class_mask = 0xffffff00, ++ }, ++ {0,} ++}; ++MODULE_DEVICE_TABLE(pci, nvme_remap_ids); ++ ++static struct pci_driver nvme_remap_drv = { ++ .name = MODULE_NAME, ++ .id_table = nvme_remap_ids, ++ .probe = nvme_remap_probe, ++}; ++module_pci_driver(nvme_remap_drv); ++ ++MODULE_AUTHOR("Daniel Drake "); ++MODULE_LICENSE("GPL v2"); +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index d97335a40193..acab5556a354 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3745,6 +3745,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += 
strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be + * prevented for those affected devices. +@@ -5192,6 +5292,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, + /* Wangxun nics */ + { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + +diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig +index 5522310bab8d..9e1c4634eb7b 100644 +--- a/drivers/scsi/Kconfig ++++ b/drivers/scsi/Kconfig +@@ -1524,4 +1524,6 @@ endif # SCSI_LOWLEVEL + + source "drivers/scsi/device_handler/Kconfig" + ++source "drivers/scsi/vhba/Kconfig" ++ + endmenu +diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile +index 16de3e41f94c..4e88f6e3e67b 100644 +--- a/drivers/scsi/Makefile ++++ b/drivers/scsi/Makefile +@@ -152,6 +152,7 @@ obj-$(CONFIG_CHR_DEV_SCH) += ch.o + obj-$(CONFIG_SCSI_ENCLOSURE) += ses.o + + obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ ++obj-$(CONFIG_VHBA) += vhba/ + + # This goes last, so that "real" scsi devices probe earlier + obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o +diff --git a/drivers/scsi/vhba/Kconfig b/drivers/scsi/vhba/Kconfig +new file mode 100644 +index 000000000000..e70a381fe3df +--- /dev/null ++++ b/drivers/scsi/vhba/Kconfig +@@ -0,0 +1,9 @@ ++config VHBA ++ tristate "Virtual (SCSI) Host Bus Adapter" ++ depends on SCSI ++ help ++ This is the in-kernel part of CDEmu, a CD/DVD-ROM device ++ emulator. ++ ++ This driver can also be built as a module. If so, the module ++ will be called vhba. +diff --git a/drivers/scsi/vhba/Makefile b/drivers/scsi/vhba/Makefile +new file mode 100644 +index 000000000000..2d7524b66199 +--- /dev/null ++++ b/drivers/scsi/vhba/Makefile +@@ -0,0 +1,4 @@ ++VHBA_VERSION := 20240917 ++ ++obj-$(CONFIG_VHBA) += vhba.o ++ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror +diff --git a/drivers/scsi/vhba/vhba.c b/drivers/scsi/vhba/vhba.c +new file mode 100644 +index 000000000000..878a3be0ba2b +--- /dev/null ++++ b/drivers/scsi/vhba/vhba.c +@@ -0,0 +1,1132 @@ ++/* ++ * vhba.c ++ * ++ * Copyright (C) 2007-2012 Chia-I Wu ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#define pr_fmt(fmt) "vhba: " fmt ++ ++#include ++ ++#include ++#include ++#include ++#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) ++#include ++#else ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_COMPAT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++MODULE_AUTHOR("Chia-I Wu"); ++MODULE_VERSION(VHBA_VERSION); ++MODULE_DESCRIPTION("Virtual SCSI HBA"); ++MODULE_LICENSE("GPL"); ++ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) ++#define sdev_dbg(sdev, fmt, a...) \ ++ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) ++#define scmd_dbg(scmd, fmt, a...) \ ++ dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a) ++#endif ++ ++#define VHBA_MAX_SECTORS_PER_IO 256 ++#define VHBA_MAX_BUS 16 ++#define VHBA_MAX_ID 16 ++#define VHBA_MAX_DEVICES (VHBA_MAX_BUS * (VHBA_MAX_ID-1)) ++#define VHBA_KBUF_SIZE PAGE_SIZE ++ ++#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL) ++#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL) ++ ++ ++static int vhba_can_queue = 32; ++module_param_named(can_queue, vhba_can_queue, int, 0); ++ ++ ++enum vhba_req_state { ++ VHBA_REQ_FREE, ++ VHBA_REQ_PENDING, ++ VHBA_REQ_READING, ++ VHBA_REQ_SENT, ++ VHBA_REQ_WRITING, ++}; ++ ++struct vhba_command { ++ struct scsi_cmnd *cmd; ++ /* metatags are per-host. not to be confused with ++ queue tags that are usually per-lun */ ++ unsigned long metatag; ++ int status; ++ struct list_head entry; ++}; ++ ++struct vhba_device { ++ unsigned int num; ++ spinlock_t cmd_lock; ++ struct list_head cmd_list; ++ wait_queue_head_t cmd_wq; ++ atomic_t refcnt; ++ ++ unsigned char *kbuf; ++ size_t kbuf_size; ++}; ++ ++struct vhba_host { ++ struct Scsi_Host *shost; ++ spinlock_t cmd_lock; ++ int cmd_next; ++ struct vhba_command *commands; ++ spinlock_t dev_lock; ++ struct vhba_device *devices[VHBA_MAX_DEVICES]; ++ int num_devices; ++ DECLARE_BITMAP(chgmap, VHBA_MAX_DEVICES); ++ int chgtype[VHBA_MAX_DEVICES]; ++ struct work_struct scan_devices; ++}; ++ ++#define MAX_COMMAND_SIZE 16 ++ ++struct vhba_request { ++ __u32 metatag; ++ __u32 lun; ++ __u8 cdb[MAX_COMMAND_SIZE]; ++ __u8 cdb_len; ++ __u32 data_len; ++}; ++ ++struct vhba_response { ++ __u32 metatag; ++ __u32 status; ++ __u32 data_len; ++}; ++ ++ ++ ++static struct vhba_command *vhba_alloc_command (void); ++static void vhba_free_command (struct vhba_command *vcmd); ++ ++static struct platform_device vhba_platform_device; ++ ++ ++ ++/* These functions define a symmetric 1:1 mapping between device numbers and ++ the bus and id. We have reserved the last id per bus for the host itself. 
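A quick worked example of this mapping, added here for illustration (it is not part of the driver comment): with VHBA_MAX_ID = 16 each bus carries 15 usable ids, so the helpers defined just below behave like

    devnum_to_bus_and_id(17, &bus, &id)   ->  bus = 17 / 15 = 1, id = 17 % 15 = 2
    bus_and_id_to_devnum(1, 2)            ->  1 * 15 + 2     = 17

i.e. they invert each other, and id 15 (the slot reserved for the host) is never handed out to a device; VHBA_MAX_DEVICES therefore works out to 16 * 15 = 240.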
*/ ++static void devnum_to_bus_and_id(unsigned int devnum, unsigned int *bus, unsigned int *id) ++{ ++ *bus = devnum / (VHBA_MAX_ID-1); ++ *id = devnum % (VHBA_MAX_ID-1); ++} ++ ++static unsigned int bus_and_id_to_devnum(unsigned int bus, unsigned int id) ++{ ++ return (bus * (VHBA_MAX_ID-1)) + id; ++} ++ ++static struct vhba_device *vhba_device_alloc (void) ++{ ++ struct vhba_device *vdev; ++ ++ vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL); ++ if (!vdev) { ++ return NULL; ++ } ++ ++ spin_lock_init(&vdev->cmd_lock); ++ INIT_LIST_HEAD(&vdev->cmd_list); ++ init_waitqueue_head(&vdev->cmd_wq); ++ atomic_set(&vdev->refcnt, 1); ++ ++ vdev->kbuf = NULL; ++ vdev->kbuf_size = 0; ++ ++ return vdev; ++} ++ ++static void vhba_device_put (struct vhba_device *vdev) ++{ ++ if (atomic_dec_and_test(&vdev->refcnt)) { ++ kfree(vdev); ++ } ++} ++ ++static struct vhba_device *vhba_device_get (struct vhba_device *vdev) ++{ ++ atomic_inc(&vdev->refcnt); ++ ++ return vdev; ++} ++ ++static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd) ++{ ++ struct vhba_host *vhost; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ vcmd = vhba_alloc_command(); ++ if (!vcmd) { ++ return SCSI_MLQUEUE_HOST_BUSY; ++ } ++ ++ vcmd->cmd = cmd; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ vcmd->metatag = scsi_cmd_to_rq(vcmd->cmd)->tag; ++#else ++ vcmd->metatag = vcmd->cmd->request->tag; ++#endif ++ list_add_tail(&vcmd->entry, &vdev->cmd_list); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ wake_up_interruptible(&vdev->cmd_wq); ++ ++ return 0; ++} ++ ++static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd) ++{ ++ struct vhba_command *vcmd; ++ int retval; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->cmd == cmd) { ++ list_del_init(&vcmd->entry); ++ break; ++ } ++ } ++ ++ /* command not found */ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ return SUCCESS; ++ } ++ ++ while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ scmd_dbg(cmd, "wait for I/O before aborting\n"); ++ schedule_timeout(1); ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ } ++ ++ retval = (vcmd->status == VHBA_REQ_SENT) ? 
FAILED : SUCCESS; ++ ++ vhba_free_command(vcmd); ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return retval; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) ++static int vhba_slave_alloc(struct scsi_device *sdev) ++{ ++ struct Scsi_Host *shost = sdev->host; ++ ++ sdev_dbg(sdev, "enabling tagging (queue depth: %i).\n", sdev->queue_depth); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) ++ if (!shost_use_blk_mq(shost) && shost->bqt) { ++#else ++ if (shost->bqt) { ++#endif ++ blk_queue_init_tags(sdev->request_queue, sdev->queue_depth, shost->bqt); ++ } ++ scsi_adjust_queue_depth(sdev, 0, sdev->queue_depth); ++ ++ return 0; ++} ++#endif ++ ++static void vhba_scan_devices_add (struct vhba_host *vhost, int bus, int id) ++{ ++ struct scsi_device *sdev; ++ ++ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); ++ if (!sdev) { ++ scsi_add_device(vhost->shost, bus, id, 0); ++ } else { ++ dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device %d:%d:0!\n", bus, id); ++ scsi_device_put(sdev); ++ } ++} ++ ++static void vhba_scan_devices_remove (struct vhba_host *vhost, int bus, int id) ++{ ++ struct scsi_device *sdev; ++ ++ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); ++ if (sdev) { ++ scsi_remove_device(sdev); ++ scsi_device_put(sdev); ++ } else { ++ dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device %d:%d:0!\n", bus, id); ++ } ++} ++ ++static void vhba_scan_devices (struct work_struct *work) ++{ ++ struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices); ++ unsigned long flags; ++ int change, exists; ++ unsigned int devnum; ++ unsigned int bus, id; ++ ++ for (;;) { ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ ++ devnum = find_first_bit(vhost->chgmap, VHBA_MAX_DEVICES); ++ if (devnum >= VHBA_MAX_DEVICES) { ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ break; ++ } ++ change = vhost->chgtype[devnum]; ++ exists = vhost->devices[devnum] != NULL; ++ ++ vhost->chgtype[devnum] = 0; ++ clear_bit(devnum, vhost->chgmap); ++ ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ devnum_to_bus_and_id(devnum, &bus, &id); ++ ++ if (change < 0) { ++ dev_dbg(&vhost->shost->shost_gendev, "trying to remove target %d:%d:0\n", bus, id); ++ vhba_scan_devices_remove(vhost, bus, id); ++ } else if (change > 0) { ++ dev_dbg(&vhost->shost->shost_gendev, "trying to add target %d:%d:0\n", bus, id); ++ vhba_scan_devices_add(vhost, bus, id); ++ } else { ++ /* quick sequence of add/remove or remove/add; we determine ++ which one it was by checking if device structure exists */ ++ if (exists) { ++ /* remove followed by add: remove and (re)add */ ++ dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target %d:%d:0\n", bus, id); ++ vhba_scan_devices_remove(vhost, bus, id); ++ vhba_scan_devices_add(vhost, bus, id); ++ } else { ++ /* add followed by remove: no-op */ ++ dev_dbg(&vhost->shost->shost_gendev, "no-op for target %d:%d:0\n", bus, id); ++ } ++ } ++ } ++} ++ ++static int vhba_add_device (struct vhba_device *vdev) ++{ ++ struct vhba_host *vhost; ++ unsigned int devnum; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ vhba_device_get(vdev); ++ ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ if (vhost->num_devices >= VHBA_MAX_DEVICES) { ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ vhba_device_put(vdev); ++ return -EBUSY; ++ } ++ ++ for (devnum = 0; devnum < VHBA_MAX_DEVICES; devnum++) { ++ if (vhost->devices[devnum] == NULL) { ++ vdev->num = devnum; ++ 
vhost->devices[devnum] = vdev; ++ vhost->num_devices++; ++ set_bit(devnum, vhost->chgmap); ++ vhost->chgtype[devnum]++; ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ schedule_work(&vhost->scan_devices); ++ ++ return 0; ++} ++ ++static int vhba_remove_device (struct vhba_device *vdev) ++{ ++ struct vhba_host *vhost; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ set_bit(vdev->num, vhost->chgmap); ++ vhost->chgtype[vdev->num]--; ++ vhost->devices[vdev->num] = NULL; ++ vhost->num_devices--; ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ vhba_device_put(vdev); ++ ++ schedule_work(&vhost->scan_devices); ++ ++ return 0; ++} ++ ++static struct vhba_device *vhba_lookup_device (int devnum) ++{ ++ struct vhba_host *vhost; ++ struct vhba_device *vdev = NULL; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ if (likely(devnum < VHBA_MAX_DEVICES)) { ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ vdev = vhost->devices[devnum]; ++ if (vdev) { ++ vdev = vhba_device_get(vdev); ++ } ++ ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ } ++ ++ return vdev; ++} ++ ++static struct vhba_command *vhba_alloc_command (void) ++{ ++ struct vhba_host *vhost; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ int i; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->cmd_lock, flags); ++ ++ vcmd = vhost->commands + vhost->cmd_next++; ++ if (vcmd->status != VHBA_REQ_FREE) { ++ for (i = 0; i < vhba_can_queue; i++) { ++ vcmd = vhost->commands + i; ++ ++ if (vcmd->status == VHBA_REQ_FREE) { ++ vhost->cmd_next = i + 1; ++ break; ++ } ++ } ++ ++ if (i == vhba_can_queue) { ++ vcmd = NULL; ++ } ++ } ++ ++ if (vcmd) { ++ vcmd->status = VHBA_REQ_PENDING; ++ } ++ ++ vhost->cmd_next %= vhba_can_queue; ++ ++ spin_unlock_irqrestore(&vhost->cmd_lock, flags); ++ ++ return vcmd; ++} ++ ++static void vhba_free_command (struct vhba_command *vcmd) ++{ ++ struct vhba_host *vhost; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->cmd_lock, flags); ++ vcmd->status = VHBA_REQ_FREE; ++ spin_unlock_irqrestore(&vhost->cmd_lock, flags); ++} ++ ++static int vhba_queuecommand (struct Scsi_Host *shost, struct scsi_cmnd *cmd) ++{ ++ struct vhba_device *vdev; ++ int retval; ++ unsigned int devnum; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ scmd_dbg(cmd, "queue %p tag %i\n", cmd, scsi_cmd_to_rq(cmd)->tag); ++#else ++ scmd_dbg(cmd, "queue %p tag %i\n", cmd, cmd->request->tag); ++#endif ++ ++ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); ++ vdev = vhba_lookup_device(devnum); ++ if (!vdev) { ++ scmd_dbg(cmd, "no such device\n"); ++ ++ cmd->result = DID_NO_CONNECT << 16; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(cmd); ++#else ++ cmd->scsi_done(cmd); ++#endif ++ ++ return 0; ++ } ++ ++ retval = vhba_device_queue(vdev, cmd); ++ ++ vhba_device_put(vdev); ++ ++ return retval; ++} ++ ++static int vhba_abort (struct scsi_cmnd *cmd) ++{ ++ struct vhba_device *vdev; ++ int retval = SUCCESS; ++ unsigned int devnum; ++ ++ scmd_dbg(cmd, "abort %p\n", cmd); ++ ++ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); ++ vdev = vhba_lookup_device(devnum); ++ if (vdev) { ++ retval = vhba_device_dequeue(vdev, cmd); ++ vhba_device_put(vdev); ++ } else { ++ cmd->result = DID_NO_CONNECT << 16; ++ } ++ ++ return 
retval; ++} ++ ++static struct scsi_host_template vhba_template = { ++ .module = THIS_MODULE, ++ .name = "vhba", ++ .proc_name = "vhba", ++ .queuecommand = vhba_queuecommand, ++ .eh_abort_handler = vhba_abort, ++ .this_id = -1, ++ .max_sectors = VHBA_MAX_SECTORS_PER_IO, ++ .sg_tablesize = 256, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) ++ .slave_alloc = vhba_slave_alloc, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(6, 14, 0) ++ .tag_alloc_policy = BLK_TAG_ALLOC_RR, ++#else ++ .tag_alloc_policy_rr = true, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ .use_blk_tags = 1, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) ++ .max_segment_size = VHBA_KBUF_SIZE, ++#endif ++}; ++ ++static ssize_t do_request (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, char __user *buf, size_t buf_len) ++{ ++ struct vhba_request vreq; ++ ssize_t ret; ++ ++ scmd_dbg(cmd, "request %lu (%p), cdb 0x%x, bufflen %d, sg count %d\n", ++ metatag, cmd, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd)); ++ ++ ret = sizeof(vreq); ++ if (DATA_TO_DEVICE(cmd->sc_data_direction)) { ++ ret += scsi_bufflen(cmd); ++ } ++ ++ if (ret > buf_len) { ++ scmd_dbg(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret); ++ return -EIO; ++ } ++ ++ vreq.metatag = metatag; ++ vreq.lun = cmd->device->lun; ++ memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE); ++ vreq.cdb_len = cmd->cmd_len; ++ vreq.data_len = scsi_bufflen(cmd); ++ ++ if (copy_to_user(buf, &vreq, sizeof(vreq))) { ++ return -EFAULT; ++ } ++ ++ if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) { ++ buf += sizeof(vreq); ++ ++ if (scsi_sg_count(cmd)) { ++ unsigned char *kaddr, *uaddr; ++ struct scatterlist *sglist = scsi_sglist(cmd); ++ struct scatterlist *sg; ++ int i; ++ ++ uaddr = (unsigned char *) buf; ++ ++ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { ++ size_t len = sg->length; ++ ++ if (len > vdev->kbuf_size) { ++ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); ++ len = vdev->kbuf_size; ++ } ++ ++ kaddr = kmap_atomic(sg_page(sg)); ++ memcpy(vdev->kbuf, kaddr + sg->offset, len); ++ kunmap_atomic(kaddr); ++ ++ if (copy_to_user(uaddr, vdev->kbuf, len)) { ++ return -EFAULT; ++ } ++ uaddr += len; ++ } ++ } else { ++ if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) { ++ return -EFAULT; ++ } ++ } ++ } ++ ++ return ret; ++} ++ ++static ssize_t do_response (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res) ++{ ++ ssize_t ret = 0; ++ ++ scmd_dbg(cmd, "response %lu (%p), status %x, data len %d, sg count %d\n", ++ metatag, cmd, res->status, res->data_len, scsi_sg_count(cmd)); ++ ++ if (res->status) { ++ if (res->data_len > SCSI_SENSE_BUFFERSIZE) { ++ scmd_dbg(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len); ++ res->data_len = SCSI_SENSE_BUFFERSIZE; ++ } ++ ++ if (copy_from_user(cmd->sense_buffer, buf, res->data_len)) { ++ return -EFAULT; ++ } ++ ++ cmd->result = res->status; ++ ++ ret += res->data_len; ++ } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) { ++ size_t to_read; ++ ++ if (res->data_len > scsi_bufflen(cmd)) { ++ scmd_dbg(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len); ++ res->data_len = scsi_bufflen(cmd); ++ } ++ ++ to_read = res->data_len; ++ ++ if (scsi_sg_count(cmd)) { ++ unsigned char 
*kaddr, *uaddr; ++ struct scatterlist *sglist = scsi_sglist(cmd); ++ struct scatterlist *sg; ++ int i; ++ ++ uaddr = (unsigned char *)buf; ++ ++ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { ++ size_t len = (sg->length < to_read) ? sg->length : to_read; ++ ++ if (len > vdev->kbuf_size) { ++ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); ++ len = vdev->kbuf_size; ++ } ++ ++ if (copy_from_user(vdev->kbuf, uaddr, len)) { ++ return -EFAULT; ++ } ++ uaddr += len; ++ ++ kaddr = kmap_atomic(sg_page(sg)); ++ memcpy(kaddr + sg->offset, vdev->kbuf, len); ++ kunmap_atomic(kaddr); ++ ++ to_read -= len; ++ if (to_read == 0) { ++ break; ++ } ++ } ++ } else { ++ if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) { ++ return -EFAULT; ++ } ++ ++ to_read -= res->data_len; ++ } ++ ++ scsi_set_resid(cmd, to_read); ++ ++ ret += res->data_len - to_read; ++ } ++ ++ return ret; ++} ++ ++static struct vhba_command *next_command (struct vhba_device *vdev) ++{ ++ struct vhba_command *vcmd; ++ ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->status == VHBA_REQ_PENDING) { ++ break; ++ } ++ } ++ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ vcmd = NULL; ++ } ++ ++ return vcmd; ++} ++ ++static struct vhba_command *match_command (struct vhba_device *vdev, __u32 metatag) ++{ ++ struct vhba_command *vcmd; ++ ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->metatag == metatag) { ++ break; ++ } ++ } ++ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ vcmd = NULL; ++ } ++ ++ return vcmd; ++} ++ ++static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags) ++{ ++ struct vhba_command *vcmd; ++ DEFINE_WAIT(wait); ++ ++ while (!(vcmd = next_command(vdev))) { ++ if (signal_pending(current)) { ++ break; ++ } ++ ++ prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE); ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ schedule(); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ } ++ ++ finish_wait(&vdev->cmd_wq, &wait); ++ if (vcmd) { ++ vcmd->status = VHBA_REQ_READING; ++ } ++ ++ return vcmd; ++} ++ ++static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ ssize_t ret; ++ unsigned long flags; ++ ++ vdev = file->private_data; ++ ++ /* Get next command */ ++ if (file->f_flags & O_NONBLOCK) { ++ /* Non-blocking variant */ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = next_command(vdev); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ if (!vcmd) { ++ return -EWOULDBLOCK; ++ } ++ } else { ++ /* Blocking variant */ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = wait_command(vdev, flags); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ if (!vcmd) { ++ return -ERESTARTSYS; ++ } ++ } ++ ++ ret = do_request(vdev, vcmd->metatag, vcmd->cmd, buf, buf_len); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (ret >= 0) { ++ vcmd->status = VHBA_REQ_SENT; ++ *offset += ret; ++ } else { ++ vcmd->status = VHBA_REQ_PENDING; ++ } ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return ret; ++} ++ ++static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ struct vhba_response res; ++ ssize_t ret; ++ unsigned long flags; ++ ++ if (buf_len < sizeof(res)) { ++ return -EIO; ++ } ++ ++ if (copy_from_user(&res, buf, sizeof(res))) { ++ return -EFAULT; ++ } ++ ++ 
vdev = file->private_data; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = match_command(vdev, res.metatag); ++ if (!vcmd || vcmd->status != VHBA_REQ_SENT) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ pr_debug("ctl dev #%u not expecting response\n", vdev->num); ++ return -EIO; ++ } ++ vcmd->status = VHBA_REQ_WRITING; ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ ret = do_response(vdev, vcmd->metatag, vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (ret >= 0) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(vcmd->cmd); ++#else ++ vcmd->cmd->scsi_done(vcmd->cmd); ++#endif ++ ret += sizeof(res); ++ ++ /* don't compete with vhba_device_dequeue */ ++ if (!list_empty(&vcmd->entry)) { ++ list_del_init(&vcmd->entry); ++ vhba_free_command(vcmd); ++ } ++ } else { ++ vcmd->status = VHBA_REQ_SENT; ++ } ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return ret; ++} ++ ++static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ struct vhba_device *vdev = file->private_data; ++ struct vhba_host *vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ switch (cmd) { ++ case 0xBEEF001: { ++ unsigned int ident[4]; /* host, channel, id, lun */ ++ ++ ident[0] = vhost->shost->host_no; ++ devnum_to_bus_and_id(vdev->num, &ident[1], &ident[2]); ++ ident[3] = 0; /* lun */ ++ ++ if (copy_to_user((void *) arg, ident, sizeof(ident))) { ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ case 0xBEEF002: { ++ unsigned int devnum = vdev->num; ++ ++ if (copy_to_user((void *) arg, &devnum, sizeof(devnum))) { ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ } ++ ++ return -ENOTTY; ++} ++ ++#ifdef CONFIG_COMPAT ++static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ unsigned long compat_arg = (unsigned long)compat_ptr(arg); ++ return vhba_ctl_ioctl(file, cmd, compat_arg); ++} ++#endif ++ ++static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait) ++{ ++ struct vhba_device *vdev = file->private_data; ++ unsigned int mask = 0; ++ unsigned long flags; ++ ++ poll_wait(file, &vdev->cmd_wq, wait); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (next_command(vdev)) { ++ mask |= POLLIN | POLLRDNORM; ++ } ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return mask; ++} ++ ++static int vhba_ctl_open (struct inode *inode, struct file *file) ++{ ++ struct vhba_device *vdev; ++ int retval; ++ ++ pr_debug("ctl dev open\n"); ++ ++ /* check if vhba is probed */ ++ if (!platform_get_drvdata(&vhba_platform_device)) { ++ return -ENODEV; ++ } ++ ++ vdev = vhba_device_alloc(); ++ if (!vdev) { ++ return -ENOMEM; ++ } ++ ++ vdev->kbuf_size = VHBA_KBUF_SIZE; ++ vdev->kbuf = kzalloc(vdev->kbuf_size, GFP_KERNEL); ++ if (!vdev->kbuf) { ++ return -ENOMEM; ++ } ++ ++ if (!(retval = vhba_add_device(vdev))) { ++ file->private_data = vdev; ++ } ++ ++ vhba_device_put(vdev); ++ ++ return retval; ++} ++ ++static int vhba_ctl_release (struct inode *inode, struct file *file) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ ++ vdev = file->private_data; ++ ++ pr_debug("ctl dev release\n"); ++ ++ vhba_device_get(vdev); ++ vhba_remove_device(vdev); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING); ++ ++ scmd_dbg(vcmd->cmd, "device released with command %lu 
(%p)\n", vcmd->metatag, vcmd->cmd); ++ vcmd->cmd->result = DID_NO_CONNECT << 16; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(vcmd->cmd); ++#else ++ vcmd->cmd->scsi_done(vcmd->cmd); ++#endif ++ vhba_free_command(vcmd); ++ } ++ INIT_LIST_HEAD(&vdev->cmd_list); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ kfree(vdev->kbuf); ++ vdev->kbuf = NULL; ++ ++ vhba_device_put(vdev); ++ ++ return 0; ++} ++ ++static struct file_operations vhba_ctl_fops = { ++ .owner = THIS_MODULE, ++ .open = vhba_ctl_open, ++ .release = vhba_ctl_release, ++ .read = vhba_ctl_read, ++ .write = vhba_ctl_write, ++ .poll = vhba_ctl_poll, ++ .unlocked_ioctl = vhba_ctl_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = vhba_ctl_compat_ioctl, ++#endif ++}; ++ ++static struct miscdevice vhba_miscdev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "vhba_ctl", ++ .fops = &vhba_ctl_fops, ++}; ++ ++static int vhba_probe (struct platform_device *pdev) ++{ ++ struct Scsi_Host *shost; ++ struct vhba_host *vhost; ++ int i; ++ ++ vhba_can_queue = clamp(vhba_can_queue, 1, 256); ++ ++ shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host)); ++ if (!shost) { ++ return -ENOMEM; ++ } ++ ++ shost->max_channel = VHBA_MAX_BUS-1; ++ shost->max_id = VHBA_MAX_ID; ++ /* we don't support lun > 0 */ ++ shost->max_lun = 1; ++ shost->max_cmd_len = MAX_COMMAND_SIZE; ++ shost->can_queue = vhba_can_queue; ++ shost->cmd_per_lun = vhba_can_queue; ++ ++ vhost = (struct vhba_host *)shost->hostdata; ++ memset(vhost, 0, sizeof(struct vhba_host)); ++ ++ vhost->shost = shost; ++ vhost->num_devices = 0; ++ spin_lock_init(&vhost->dev_lock); ++ spin_lock_init(&vhost->cmd_lock); ++ INIT_WORK(&vhost->scan_devices, vhba_scan_devices); ++ vhost->cmd_next = 0; ++ vhost->commands = kzalloc(vhba_can_queue * sizeof(struct vhba_command), GFP_KERNEL); ++ if (!vhost->commands) { ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < vhba_can_queue; i++) { ++ vhost->commands[i].status = VHBA_REQ_FREE; ++ } ++ ++ platform_set_drvdata(pdev, vhost); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ i = scsi_init_shared_tag_map(shost, vhba_can_queue); ++ if (i) return i; ++#endif ++ ++ if (scsi_add_host(shost, &pdev->dev)) { ++ scsi_host_put(shost); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) ++static int vhba_remove (struct platform_device *pdev) ++#else ++static void vhba_remove (struct platform_device *pdev) ++#endif ++{ ++ struct vhba_host *vhost; ++ struct Scsi_Host *shost; ++ ++ vhost = platform_get_drvdata(pdev); ++ shost = vhost->shost; ++ ++ scsi_remove_host(shost); ++ scsi_host_put(shost); ++ ++ kfree(vhost->commands); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) ++ return 0; ++#endif ++} ++ ++static void vhba_release (struct device * dev) ++{ ++ return; ++} ++ ++static struct platform_device vhba_platform_device = { ++ .name = "vhba", ++ .id = -1, ++ .dev = { ++ .release = vhba_release, ++ }, ++}; ++ ++static struct platform_driver vhba_platform_driver = { ++ .driver = { ++ .owner = THIS_MODULE, ++ .name = "vhba", ++ }, ++ .probe = vhba_probe, ++ .remove = vhba_remove, ++}; ++ ++static int __init vhba_init (void) ++{ ++ int ret; ++ ++ ret = platform_device_register(&vhba_platform_device); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ ret = platform_driver_register(&vhba_platform_driver); ++ if (ret < 0) { ++ platform_device_unregister(&vhba_platform_device); ++ return ret; ++ } ++ ++ ret = misc_register(&vhba_miscdev); ++ if (ret < 0) { ++ 
platform_driver_unregister(&vhba_platform_driver); ++ platform_device_unregister(&vhba_platform_device); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __exit vhba_exit(void) ++{ ++ misc_deregister(&vhba_miscdev); ++ platform_driver_unregister(&vhba_platform_driver); ++ platform_device_unregister(&vhba_platform_device); ++} ++ ++module_init(vhba_init); ++module_exit(vhba_exit); ++ +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 1ae97a0b8ec7..db640e1b17ec 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -194,6 +194,14 @@ static inline void __mm_zero_struct_page(struct page *page) + + extern int sysctl_max_map_count; + ++extern bool sysctl_workingset_protection; ++extern u8 sysctl_anon_min_ratio; ++extern u8 sysctl_clean_low_ratio; ++extern u8 sysctl_clean_min_ratio; ++int vm_workingset_protection_update_handler( ++ const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ + extern unsigned long sysctl_user_reserve_kbytes; + extern unsigned long sysctl_admin_reserve_kbytes; + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 12a12dae727d..b460a691b357 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -1337,7 +1337,7 @@ struct readahead_control { + ._index = i, \ + } + +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_8M / PAGE_SIZE) + + void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); +diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h +index a0bb6d012137..93129fea552e 100644 +--- a/include/linux/user_namespace.h ++++ b/include/linux/user_namespace.h +@@ -168,6 +168,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, + + #ifdef CONFIG_USER_NS + ++extern int unprivileged_userns_clone; ++ + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) + { + if (ns) +@@ -201,6 +203,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); + struct ns_common *ns_get_owner(struct ns_common *ns); + #else + ++#define unprivileged_userns_clone 0 ++ + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) + { + return &init_user_ns; +diff --git a/init/Kconfig b/init/Kconfig +index d811cad02a75..e4b7a7062838 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -171,6 +171,10 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config CACHY ++ bool "Some kernel tweaks by CachyOS" ++ default y ++ + config BROKEN + bool + help +@@ -1375,6 +1379,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. ++ + config PID_NS + bool "PID Namespaces" + default y +@@ -1524,6 +1544,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. 
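The USER_NS_UNPRIVILEGED help text above notes that the default can be changed at runtime through the kernel.unprivileged_userns_clone sysctl this patch registers. As a minimal userspace sketch (not part of the patch, assuming procfs is mounted at /proc and the caller may write the file), the knob can be flipped like this:

    #include <stdio.h>

    int main(void)
    {
            /* 1 allows unprivileged user-namespace creation, 0 restricts it
             * to CAP_SYS_ADMIN (see the copy_process()/ksys_unshare() checks
             * added further down in this patch). */
            FILE *f = fopen("/proc/sys/kernel/unprivileged_userns_clone", "w");

            if (!f) {
                    perror("unprivileged_userns_clone");
                    return 1;
            }
            fputs("0\n", f);
            return fclose(f) ? 1 : 0;
    }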
+ ++config CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++ bool "Optimize more for performance (-O3)" ++ help ++ Choosing this option will pass "-O3" to your compiler to optimize ++ the kernel yet more for performance. ++ + config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size (-Os)" + help +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index ce1435cb08b1..e1359db5561e 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -40,6 +40,27 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_600 ++ bool "600 HZ" ++ help ++ 600 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -53,6 +74,9 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 ++ default 600 if HZ_600 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index 54ea59ff8fbe..18f87e0dd137 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -88,7 +88,7 @@ endchoice + + config PREEMPT_RT + bool "Fully Preemptible Kernel (Real-Time)" +- depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST ++ depends on ARCH_SUPPORTS_RT && !COMPILE_TEST + select PREEMPTION + help + This option turns the kernel into a real-time kernel by replacing +diff --git a/kernel/fork.c b/kernel/fork.c +index af673856499d..d91fa2d9bce1 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -107,6 +107,10 @@ + #include + #include + ++#ifdef CONFIG_USER_NS ++#include ++#endif ++ + #include + #include + #include +@@ -1938,6 +1942,10 @@ __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
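The effect of the copy_process() gate in the hunk above (and of the matching ksys_unshare() check in the next hunk) can be observed from userspace. A rough sketch, not part of the patch: with kernel.unprivileged_userns_clone set to 0 and no CAP_SYS_ADMIN, the call below is expected to fail with EPERM, while with the default of 1 it succeeds.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            if (unshare(CLONE_NEWUSER) == -1) {
                    /* EPERM here indicates the unprivileged_userns_clone policy
                     * (or another restriction) blocked the new user namespace. */
                    printf("unshare(CLONE_NEWUSER): %s\n", strerror(errno));
                    return 1;
            }
            printf("entered a new user namespace\n");
            return 0;
    }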
+@@ -3105,6 +3113,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 24df4d98f7d2..1d5923996fa5 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -746,6 +746,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + lockdep_assert_preemption_disabled(); + +@@ -782,7 +783,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + + return state; +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b173a059315c..226a96cd2536 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -76,10 +76,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + * + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_CACHY ++unsigned int sysctl_sched_base_slice = 350000ULL; ++static unsigned int normalized_sysctl_sched_base_slice = 350000ULL; ++#else + unsigned int sysctl_sched_base_slice = 700000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; ++#endif /* CONFIG_CACHY */ + ++#ifdef CONFIG_CACHY ++__read_mostly unsigned int sysctl_sched_migration_cost = 300000UL; ++#else + __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -122,8 +131,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_CACHY ++static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + #ifdef CONFIG_NUMA_BALANCING + /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index be9745d104f7..4ee277cb92b9 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2769,7 +2769,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + + extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + +-#ifdef CONFIG_PREEMPT_RT ++#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY) + # define SCHED_NR_MIGRATE_BREAK 8 + #else + # define SCHED_NR_MIGRATE_BREAK 32 +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index cb6196e3fa99..cc5bf841e3fe 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -23,6 +23,10 @@ + #include + #include + ++#ifdef CONFIG_USER_NS ++#include ++#endif ++ + /* shared constants to be used in various sysctls */ + const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; + EXPORT_SYMBOL(sysctl_vals); +@@ -1455,6 +1459,15 @@ int proc_do_static_key(const struct ctl_table *table, int write, + } + + static const struct ctl_table sysctl_subsys_table[] = { ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "sysctl_writes_strict", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 682f40d5632d..434a25f7b2ed 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -22,6 +22,13 @@ + #include + #include + ++/* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else ++int unprivileged_userns_clone; ++#endif ++ + static struct kmem_cache *user_ns_cachep __ro_after_init; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/mm/Kconfig b/mm/Kconfig +index e443fe8cd6cf..d3148d9d335d 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -462,6 +462,69 @@ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + config ARCH_WANT_HUGETLB_VMEMMAP_PREINIT + bool + ++config ANON_MIN_RATIO ++ int "Default value for vm.anon_min_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 1 ++ help ++ This option sets the default value for vm.anon_min_ratio sysctl knob. ++ ++ The vm.anon_min_ratio sysctl knob provides *hard* protection of ++ anonymous pages. The anonymous pages on the current node won't be ++ reclaimed under any conditions when their amount is below ++ vm.anon_min_ratio. This knob may be used to prevent excessive swap ++ thrashing when anonymous memory is low (for example, when memory is ++ going to be overfilled by compressed data of zram module). ++ ++ Setting this value too high (close to MemTotal) can result in ++ inability to swap and can lead to early OOM under memory pressure. ++ ++config CLEAN_LOW_RATIO ++ int "Default value for vm.clean_low_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 15 ++ help ++ This option sets the default value for vm.clean_low_ratio sysctl knob. ++ ++ The vm.clean_low_ratio sysctl knob provides *best-effort* ++ protection of clean file pages. The file pages on the current node ++ won't be reclaimed under memory pressure when the amount of clean file ++ pages is below vm.clean_low_ratio *unless* we threaten to OOM. ++ Protection of clean file pages using this knob may be used when ++ swapping is still possible to ++ - prevent disk I/O thrashing under memory pressure; ++ - improve performance in disk cache-bound tasks under memory ++ pressure. 
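To make the percentages above concrete (an illustrative calculation, not part of the upstream help text): prepare_workingset_protection() in the mm/vmscan.c hunk further down converts each ratio into a per-node page threshold as

    threshold = node_totalram_pages * ratio / 100

so on a 16 GiB node with 4 KiB pages (4,194,304 pages), vm.clean_low_ratio = 15 protects roughly 629,145 pages, about 2.4 GiB of clean file cache, and vm.anon_min_ratio = 1 protects roughly 41,943 pages, about 164 MiB of anonymous memory.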
++ ++ Setting it to a high value may result in a early eviction of anonymous ++ pages into the swap space by attempting to hold the protected amount ++ of clean file pages in memory. ++ ++config CLEAN_MIN_RATIO ++ int "Default value for vm.clean_min_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 4 ++ help ++ This option sets the default value for vm.clean_min_ratio sysctl knob. ++ ++ The vm.clean_min_ratio sysctl knob provides *hard* protection of ++ clean file pages. The file pages on the current node won't be ++ reclaimed under memory pressure when the amount of clean file pages is ++ below vm.clean_min_ratio. Hard protection of clean file pages using ++ this knob may be used to ++ - prevent disk I/O thrashing under memory pressure even with no free ++ swap space; ++ - improve performance in disk cache-bound tasks under memory ++ pressure; ++ - avoid high latency and prevent livelock in near-OOM conditions. ++ ++ Setting it to a high value may result in a early out-of-memory condition ++ due to the inability to reclaim the protected amount of clean file pages ++ when other types of pages cannot be reclaimed. ++ + config HAVE_MEMBLOCK_PHYS_MAP + bool + +@@ -658,7 +721,7 @@ config COMPACTION + config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION +- default 0 if PREEMPT_RT ++ default 0 if PREEMPT_RT || CACHY + default 1 + + # +diff --git a/mm/compaction.c b/mm/compaction.c +index bf021b31c7ec..cd1c1ece9888 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -1887,7 +1887,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. + */ ++#ifdef CONFIG_CACHY ++static unsigned int __read_mostly sysctl_compaction_proactiveness; ++#else + static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; ++#endif + static int sysctl_extfrag_threshold = 500; + static int __read_mostly sysctl_compact_memory; + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 9c38a95e9f09..4bc77b92d649 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -63,7 +63,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1<> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ +@@ -1103,6 +1107,7 @@ void __init swap_setup(void) + page_cluster = 2; + else + page_cluster = 3; ++#endif /* CONFIG_CACHY */ + /* + * Right now other parts of the system means that we + * _really_ don't want to cluster much more +diff --git a/mm/util.c b/mm/util.c +index f814e6a59ab1..a84d4f4a6195 100644 +--- a/mm/util.c ++++ b/mm/util.c +@@ -858,6 +858,40 @@ static const struct ctl_table util_sysctl_table[] = { + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, ++ { ++ .procname = "workingset_protection", ++ .data = &sysctl_workingset_protection, ++ .maxlen = sizeof(bool), ++ .mode = 0644, ++ .proc_handler = &proc_dobool, ++ }, ++ { ++ .procname = "anon_min_ratio", ++ .data = &sysctl_anon_min_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, ++ { ++ .procname = "clean_low_ratio", ++ .data = &sysctl_clean_low_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, ++ { ++ .procname = "clean_min_ratio", ++ .data = 
&sysctl_clean_min_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, + }; + + static int __init init_vm_util_sysctls(void) +diff --git a/mm/vmpressure.c b/mm/vmpressure.c +index c197ed47bcc4..1b359dcc88c4 100644 +--- a/mm/vmpressure.c ++++ b/mm/vmpressure.c +@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + * essence, they are percents: the higher the value, the more number + * unsuccessful reclaims there were. + */ ++#ifdef CONFIG_CACHY ++static const unsigned int vmpressure_level_med = 65; ++#else + static const unsigned int vmpressure_level_med = 60; ++#endif + static const unsigned int vmpressure_level_critical = 95; + + /* +diff --git a/mm/vmscan.c b/mm/vmscan.c +index a48aec8bfd92..e2c3f8712bbb 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -147,6 +147,15 @@ struct scan_control { + /* The file folios on the current node are dangerously low */ + unsigned int file_is_tiny:1; + ++ /* The anonymous pages on the current node are below vm.anon_min_ratio */ ++ unsigned int anon_below_min:1; ++ ++ /* The clean file pages on the current node are below vm.clean_low_ratio */ ++ unsigned int clean_below_low:1; ++ ++ /* The clean file pages on the current node are below vm.clean_min_ratio */ ++ unsigned int clean_below_min:1; ++ + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + +@@ -196,10 +205,23 @@ struct scan_control { + #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) + #endif + ++bool sysctl_workingset_protection __read_mostly = true; ++u8 sysctl_anon_min_ratio __read_mostly = CONFIG_ANON_MIN_RATIO; ++u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO; ++u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO; ++static u64 sysctl_anon_min_ratio_kb __read_mostly = 0; ++static u64 sysctl_clean_low_ratio_kb __read_mostly = 0; ++static u64 sysctl_clean_min_ratio_kb __read_mostly = 0; ++static u64 workingset_protection_prev_totalram __read_mostly = 0; ++ + /* + * From 0 .. MAX_SWAPPINESS. Higher means more swappy. + */ ++#ifdef CONFIG_CACHY ++int vm_swappiness = 100; ++#else + int vm_swappiness = 60; ++#endif + + #ifdef CONFIG_MEMCG + +@@ -1157,6 +1179,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, + if (!sc->may_unmap && folio_mapped(folio)) + goto keep_locked; + ++ if (folio_is_file_lru(folio) ? sc->clean_below_min : ++ (sc->anon_below_min && !sc->clean_below_min)) ++ goto keep_locked; ++ + /* + * The number of dirty pages determines if a node is marked + * reclaim_congested. kswapd will stall and start writing +@@ -2606,6 +2632,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + goto out; + } + ++ /* ++ * Force-scan anon if clean file pages is under vm.clean_low_ratio ++ * or vm.clean_min_ratio. ++ */ ++ if (sc->clean_below_low || sc->clean_below_min) { ++ scan_balance = SCAN_ANON; ++ goto out; ++ } ++ + /* + * If there is enough inactive page cache, we do not reclaim + * anything from the anonymous working right now to make sure +@@ -2664,6 +2699,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + BUG(); + } + ++ /* ++ * Hard protection of the working set. ++ * Don't reclaim anon/file pages when the amount is ++ * below the watermark of the same type. ++ */ ++ if (file ? 
sc->clean_below_min : sc->anon_below_min) ++ scan = 0; ++ + nr[lru] = scan; + } + } +@@ -2684,6 +2727,96 @@ static bool can_age_anon_pages(struct lruvec *lruvec, + lruvec_memcg(lruvec)); + } + ++int vm_workingset_protection_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ workingset_protection_prev_totalram = 0; ++ ++ return 0; ++} ++ ++static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long node_mem_total; ++ struct sysinfo i; ++ ++ if (!(sysctl_workingset_protection)) { ++ sc->anon_below_min = 0; ++ sc->clean_below_low = 0; ++ sc->clean_below_min = 0; ++ return; ++ } ++ ++ if (likely(sysctl_anon_min_ratio || ++ sysctl_clean_low_ratio || ++ sysctl_clean_min_ratio)) { ++#ifdef CONFIG_NUMA ++ si_meminfo_node(&i, pgdat->node_id); ++#else //CONFIG_NUMA ++ si_meminfo(&i); ++#endif //CONFIG_NUMA ++ node_mem_total = i.totalram; ++ ++ if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { ++ sysctl_anon_min_ratio_kb = ++ node_mem_total * sysctl_anon_min_ratio / 100; ++ sysctl_clean_low_ratio_kb = ++ node_mem_total * sysctl_clean_low_ratio / 100; ++ sysctl_clean_min_ratio_kb = ++ node_mem_total * sysctl_clean_min_ratio / 100; ++ workingset_protection_prev_totalram = node_mem_total; ++ } ++ } ++ ++ /* ++ * Check the number of anonymous pages to protect them from ++ * reclaiming if their amount is below the specified. ++ */ ++ if (sysctl_anon_min_ratio) { ++ unsigned long reclaimable_anon; ++ ++ reclaimable_anon = ++ node_page_state(pgdat, NR_ACTIVE_ANON) + ++ node_page_state(pgdat, NR_INACTIVE_ANON) + ++ node_page_state(pgdat, NR_ISOLATED_ANON); ++ ++ sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; ++ } else ++ sc->anon_below_min = 0; ++ ++ /* ++ * Check the number of clean file pages to protect them from ++ * reclaiming if their amount is below the specified. ++ */ ++ if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { ++ unsigned long reclaimable_file, dirty, clean; ++ ++ reclaimable_file = ++ node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE) + ++ node_page_state(pgdat, NR_ISOLATED_FILE); ++ dirty = node_page_state(pgdat, NR_FILE_DIRTY); ++ /* ++ * node_page_state() sum can go out of sync since ++ * all the values are not read at once. 
++ */ ++ if (likely(reclaimable_file > dirty)) ++ clean = reclaimable_file - dirty; ++ else ++ clean = 0; ++ ++ sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; ++ sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; ++ } else { ++ sc->clean_below_low = 0; ++ sc->clean_below_min = 0; ++ } ++} ++ + #ifdef CONFIG_LRU_GEN + + #ifdef CONFIG_LRU_GEN_ENABLED +@@ -4667,11 +4800,21 @@ static int get_tier_idx(struct lruvec *lruvec, int type) + return tier - 1; + } + +-static int get_type_to_scan(struct lruvec *lruvec, int swappiness) ++static int get_type_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) + { + struct ctrl_pos sp, pv; + +- if (swappiness <= MIN_SWAPPINESS + 1) ++ if (swappiness == MIN_SWAPPINESS) ++ return LRU_GEN_FILE; ++ ++ if (sc->clean_below_min) ++ return LRU_GEN_ANON; ++ if (sc->anon_below_min) ++ return LRU_GEN_FILE; ++ if (sc->clean_below_low) ++ return LRU_GEN_ANON; ++ ++ if (swappiness == MIN_SWAPPINESS + 1) + return LRU_GEN_FILE; + + if (swappiness >= MAX_SWAPPINESS) +@@ -4691,7 +4834,7 @@ static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + int *type_scanned, struct list_head *list) + { + int i; +- int type = get_type_to_scan(lruvec, swappiness); ++ int type = get_type_to_scan(lruvec, sc, swappiness); + + for_each_evictable_type(i, swappiness) { + int scanned; +@@ -4937,6 +5080,12 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ prepare_workingset_protection(pgdat, sc); ++ ++ if (sysctl_workingset_protection && sc->clean_below_min && ++ !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) ++ return 0; ++ + /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ + if (mem_cgroup_below_min(NULL, memcg)) + return MEMCG_LRU_YOUNG; +@@ -6089,6 +6238,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + + prepare_scan_control(pgdat, sc); + ++ prepare_workingset_protection(pgdat, sc); ++ + shrink_node_memcgs(pgdat, sc); + + flush_reclaim_state(sc); +diff --git a/scripts/Makefile.thinlto b/scripts/Makefile.thinlto +new file mode 100644 +index 000000000000..ec98fa2ead3b +--- /dev/null ++++ b/scripts/Makefile.thinlto +@@ -0,0 +1,38 @@ ++PHONY := __default ++__default: ++ ++include include/config/auto.conf ++include $(srctree)/scripts/Kbuild.include ++include $(srctree)/scripts/Makefile.lib ++ ++native-objs := $(patsubst %.o,%.thinlto-native.o,$(call read-file, vmlinux.thinlto-index)) ++ ++__default: $(native-objs) ++ ++# Generate .thinlto-native.o (obj) from .o (bitcode) and .thinlto.bc (summary) files ++# --------------------------------------------------------------------------- ++quiet_cmd_cc_o_bc = CC $(quiet_modtag) $@ ++ cmd_cc_o_bc = \ ++ $(CC) $(_c_flags) -fno-lto -Wno-unused-command-line-argument \ ++ -fthinlto-index=$(word 2, $^) -c -o $@ $< ++ ++targets += $(native-objs) ++$(native-objs): %.thinlto-native.o: %.o %.o.thinlto.bc FORCE ++ $(call if_changed,cc_o_bc) ++ ++# Add FORCE to the prerequisites of a target to force it to be always rebuilt. ++# --------------------------------------------------------------------------- ++ ++PHONY += FORCE ++FORCE: ++ ++# Read all saved command lines and dependencies for the $(targets) we ++# may be building above, using $(if_changed{,_dep}). As an ++# optimization, we don't need to read them if the target does not ++# exist, we will rebuild anyway in that case. 
++ ++existing-targets := $(wildcard $(sort $(targets))) ++ ++-include $(foreach f,$(existing-targets),$(dir $(f)).$(notdir $(f)).cmd) ++ ++.PHONY: $(PHONY) +diff --git a/scripts/Makefile.vmlinux_a b/scripts/Makefile.vmlinux_a +new file mode 100644 +index 000000000000..73c9545de7cf +--- /dev/null ++++ b/scripts/Makefile.vmlinux_a +@@ -0,0 +1,83 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++ ++PHONY := __default ++__default: vmlinux.a ++ ++include include/config/auto.conf ++include $(srctree)/scripts/Kbuild.include ++include $(srctree)/scripts/Makefile.lib ++ ++# Link of built-in-fixup.a ++# --------------------------------------------------------------------------- ++ ++# '$(AR) mPi' needs 'T' to workaround the bug of llvm-ar <= 14 ++quiet_cmd_ar_builtin_fixup = AR $@ ++ cmd_ar_builtin_fixup = \ ++ rm -f $@; \ ++ $(AR) cDPrST $@ $(KBUILD_VMLINUX_OBJS); \ ++ $(AR) mPiT $$($(AR) t $@ | sed -n 1p) $@ $$($(AR) t $@ | grep -F -f $(srctree)/scripts/head-object-list.txt) ++ ++targets += built-in-fixup.a ++built-in-fixup.a: $(KBUILD_VMLINUX_OBJS) scripts/head-object-list.txt FORCE ++ $(call if_changed,ar_builtin_fixup) ++ ++ifdef CONFIG_LTO_CLANG_THIN_DIST ++ ++quiet_cmd_builtin.order = GEN $@ ++ cmd_builtin.order = $(AR) t $< > $@ ++ ++targets += builtin.order ++builtin.order: built-in-fixup.a FORCE ++ $(call if_changed,builtin.order) ++ ++quiet_cmd_ld_thinlto_index = LD $@ ++ cmd_ld_thinlto_index = \ ++ $(LD) $(KBUILD_LDFLAGS) -r --thinlto-index-only=$@ @$< ++ ++targets += vmlinux.thinlto-index ++vmlinux.thinlto-index: builtin.order FORCE ++ $(call if_changed,ld_thinlto_index) ++ ++quiet_cmd_ar_vmlinux.a = GEN $@ ++ cmd_ar_vmlinux.a = \ ++ rm -f $@; \ ++ while read -r obj; do \ ++ if grep -q $${obj} $(word 2, $^); then \ ++ echo $${obj%.o}.thinlto-native.o; \ ++ else \ ++ echo $${obj}; \ ++ fi; \ ++ done < $< | xargs $(AR) cDPrS $@ ++ ++targets += vmlinux.a ++vmlinux.a: builtin.order vmlinux.thinlto-index FORCE ++ $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.thinlto ++ $(call if_changed,ar_vmlinux.a) ++ ++else ++ ++# vmlinux.a ++# --------------------------------------------------------------------------- ++ ++targets += vmlinux.a ++vmlinux.a: built-in-fixup.a FORCE ++ $(call if_changed,copy) ++ ++endif ++ ++# Add FORCE to the prerequisites of a target to force it to be always rebuilt. ++# --------------------------------------------------------------------------- ++ ++PHONY += FORCE ++FORCE: ++ ++# Read all saved command lines and dependencies for the $(targets) we ++# may be building above, using $(if_changed{,_dep}). As an ++# optimization, we don't need to read them if the target does not ++# exist, we will rebuild anyway in that case. 
++ ++existing-targets := $(wildcard $(sort $(targets))) ++ ++-include $(foreach f,$(existing-targets),$(dir $(f)).$(notdir $(f)).cmd) ++ ++.PHONY: $(PHONY) +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index 5ca7c268294e..8b01746c9ce6 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -1473,13 +1473,22 @@ static void extract_crcs_for_object(const char *object, struct module *mod) + char cmd_file[PATH_MAX]; + char *buf, *p; + const char *base; +- int dirlen, ret; ++ int dirlen, baselen_without_suffix, ret; + + base = get_basename(object); + dirlen = base - object; + +- ret = snprintf(cmd_file, sizeof(cmd_file), "%.*s.%s.cmd", +- dirlen, object, base); ++ baselen_without_suffix = strlen(object) - dirlen - strlen(".o"); ++ ++ /* ++ * When CONFIG_LTO_CLANG_THIN_DIST=y, the ELF is *.thinlto-native.o ++ * but the symbol CRCs are recorded in *.o.cmd file. ++ */ ++ if (strends(object, ".thinlto-native.o")) ++ baselen_without_suffix -= strlen(".thinlto-native"); ++ ++ ret = snprintf(cmd_file, sizeof(cmd_file), "%.*s.%.*s.o.cmd", ++ dirlen, object, baselen_without_suffix, base); + if (ret >= sizeof(cmd_file)) { + error("%s: too long path was truncated\n", cmd_file); + return; +-- +2.51.0 + diff --git a/sys-kernel/git-sources/0004-fixes.patch b/sys-kernel/git-sources/0004-fixes.patch new file mode 100644 index 0000000..1f68361 --- /dev/null +++ b/sys-kernel/git-sources/0004-fixes.patch @@ -0,0 +1,107 @@ +From 3a2358a5db595bd3797db3e5d65cd01863f42b94 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 1 Sep 2025 09:38:55 +0800 +Subject: [PATCH 4/4] fixes + +Signed-off-by: Eric Naim +--- + drivers/gpu/drm/drm_atomic_uapi.c | 23 ++++++++++++----------- + include/linux/btf.h | 2 +- + net/ipv4/route.c | 7 ++++++- + scripts/package/PKGBUILD | 5 +++++ + 4 files changed, 24 insertions(+), 13 deletions(-) + +diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c +index ecc73d52bfae..85dbdaa4a2e2 100644 +--- a/drivers/gpu/drm/drm_atomic_uapi.c ++++ b/drivers/gpu/drm/drm_atomic_uapi.c +@@ -1078,19 +1078,20 @@ int drm_atomic_set_property(struct drm_atomic_state *state, + } + + if (async_flip) { +- /* check if the prop does a nop change */ +- if ((prop != config->prop_fb_id && +- prop != config->prop_in_fence_fd && +- prop != config->prop_fb_damage_clips)) { +- ret = drm_atomic_plane_get_property(plane, plane_state, +- prop, &old_val); +- ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); +- } ++ /* no-op changes are always allowed */ ++ ret = drm_atomic_plane_get_property(plane, plane_state, ++ prop, &old_val); ++ ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); + +- /* ask the driver if this non-primary plane is supported */ +- if (plane->type != DRM_PLANE_TYPE_PRIMARY) { +- ret = -EINVAL; ++ /* fail everything that isn't no-op or a pure flip */ ++ if (ret && prop != config->prop_fb_id && ++ prop != config->prop_in_fence_fd && ++ prop != config->prop_fb_damage_clips) { ++ break; ++ } + ++ if (ret && plane->type != DRM_PLANE_TYPE_PRIMARY) { ++ /* ask the driver if this non-primary plane is supported */ + if (plane_funcs && plane_funcs->atomic_async_check) + ret = plane_funcs->atomic_async_check(plane, state, true); + +diff --git a/include/linux/btf.h b/include/linux/btf.h +index 9eda6b113f9b..f06976ffb63f 100644 +--- a/include/linux/btf.h ++++ b/include/linux/btf.h +@@ -86,7 +86,7 @@ + * as to avoid issues such as the compiler inlining or eliding either a static + * kfunc, or a global kfunc in an LTO build. 
+ */ +-#define __bpf_kfunc __used __retain noinline ++#define __bpf_kfunc __used __retain __noclone noinline + + #define __bpf_kfunc_start_defs() \ + __diag_push(); \ +diff --git a/net/ipv4/route.c b/net/ipv4/route.c +index baa43e5966b1..05a5d185807a 100644 +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -2592,6 +2592,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, + do_cache = true; + if (type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST | RTCF_LOCAL; ++ fi = NULL; + } else if (type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST | RTCF_LOCAL; + if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, +@@ -2661,8 +2662,12 @@ static struct rtable *__mkroute_output(const struct fib_result *res, + rth->dst.output = ip_mc_output; + RT_CACHE_STAT_INC(out_slow_mc); + } ++ if (type == RTN_BROADCAST && res->fi) { ++ /* ensure MTU value for broadcast routes is retained */ ++ ip_dst_init_metrics(&rth->dst, res->fi->fib_metrics); ++ } + #ifdef CONFIG_IP_MROUTE +- if (type == RTN_MULTICAST) { ++ else if (type == RTN_MULTICAST) { + if (IN_DEV_MFORWARD(in_dev) && + !ipv4_is_local_multicast(fl4->daddr)) { + rth->dst.input = ip_mr_input; +diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD +index 452374d63c24..08f80d7c5df0 100644 +--- a/scripts/package/PKGBUILD ++++ b/scripts/package/PKGBUILD +@@ -90,6 +90,11 @@ _package-headers() { + "${srctree}/scripts/package/install-extmod-build" "${builddir}" + fi + ++ # required when DEBUG_INFO_BTF_MODULES is enabled ++ if [ -f tools/bpf/resolve_btfids/resolve_btfids ]; then ++ install -Dt "$builddir/tools/bpf/resolve_btfids" tools/bpf/resolve_btfids/resolve_btfids ++ fi ++ + echo "Installing System.map and config..." + mkdir -p "${builddir}" + cp System.map "${builddir}/System.map" +-- +2.51.0 + diff --git a/sys-kernel/git-sources/0005-sched-ext.patch b/sys-kernel/git-sources/0005-sched-ext.patch deleted file mode 100644 index e14973c..0000000 --- a/sys-kernel/git-sources/0005-sched-ext.patch +++ /dev/null @@ -1,21992 +0,0 @@ -From a202d5f9500a682f40f4ba89dfeeae27177a56af Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Fri, 7 Jun 2024 08:41:45 +0200 -Subject: [PATCH] sched-ext - -Signed-off-by: Piotr Gorski ---- - .github/workflows/run-schedulers | 73 + - .github/workflows/sched-ext.config | 34 + - .github/workflows/test-kernel.yml | 56 + - .gitignore | 3 + - Documentation/bpf/libbpf/libbpf_overview.rst | 8 + - Documentation/bpf/standardization/abi.rst | 3 + - .../bpf/standardization/instruction-set.rst | 261 +- - Documentation/scheduler/index.rst | 1 + - Documentation/scheduler/sched-ext.rst | 314 + - MAINTAINERS | 13 + - Makefile | 8 +- - arch/riscv/Kconfig | 12 + - arch/riscv/net/bpf_jit.h | 51 + - arch/riscv/net/bpf_jit_comp32.c | 3 +- - arch/riscv/net/bpf_jit_comp64.c | 21 +- - drivers/isdn/mISDN/dsp_blowfish.c | 5 - - drivers/net/ethernet/8390/ne2k-pci.c | 11 - - drivers/net/ethernet/adaptec/starfire.c | 8 - - .../net/ethernet/cavium/liquidio/lio_main.c | 6 - - .../ethernet/cavium/liquidio/octeon_droq.c | 5 - - drivers/net/ethernet/mellanox/mlx4/main.c | 6 - - drivers/net/usb/lan78xx.c | 5 - - drivers/net/usb/smsc75xx.c | 5 - - drivers/tty/sysrq.c | 1 + - include/asm-generic/vmlinux.lds.h | 1 + - include/linux/bpf.h | 13 +- - include/linux/cgroup-defs.h | 8 + - include/linux/cgroup.h | 5 +- - include/linux/filter.h | 2 +- - include/linux/sched.h | 5 + - include/linux/sched/ext.h | 210 + - include/linux/sched/task.h | 3 +- - include/linux/skbuff.h | 68 +- - include/net/inet_frag.h | 4 +- - 
include/trace/events/sched_ext.h | 32 + - include/uapi/linux/bpf.h | 15 +- - include/uapi/linux/sched.h | 1 + - init/Kconfig | 5 + - init/init_task.c | 12 + - kernel/Kconfig.preempt | 24 +- - kernel/bpf/bpf_local_storage.c | 4 +- - kernel/bpf/bpf_struct_ops.c | 75 +- - kernel/bpf/helpers.c | 119 + - kernel/bpf/syscall.c | 34 +- - kernel/cgroup/cgroup.c | 97 +- - kernel/fork.c | 17 +- - kernel/sched/build_policy.c | 9 + - kernel/sched/core.c | 316 +- - kernel/sched/cpufreq_schedutil.c | 50 +- - kernel/sched/debug.c | 3 + - kernel/sched/ext.c | 6973 +++++++++++++++++ - kernel/sched/ext.h | 143 + - kernel/sched/fair.c | 21 +- - kernel/sched/idle.c | 2 + - kernel/sched/sched.h | 116 +- - lib/dump_stack.c | 1 + - lib/test_bpf.c | 1 + - net/bpf/bpf_dummy_struct_ops.c | 4 +- - net/bridge/netfilter/nf_conntrack_bridge.c | 6 +- - net/core/dev.c | 2 +- - net/core/filter.c | 62 +- - net/core/sock.c | 19 +- - net/ieee802154/6lowpan/reassembly.c | 2 +- - net/ipv4/bpf_tcp_ca.c | 6 +- - net/ipv4/inet_fragment.c | 2 +- - net/ipv4/ip_fragment.c | 2 +- - net/ipv4/ip_output.c | 14 +- - net/ipv4/raw.c | 2 +- - net/ipv4/tcp_ipv4.c | 2 + - net/ipv4/tcp_output.c | 14 +- - net/ipv6/ip6_output.c | 11 +- - net/ipv6/netfilter.c | 6 +- - net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- - net/ipv6/raw.c | 2 +- - net/ipv6/reassembly.c | 2 +- - net/ipv6/tcp_ipv6.c | 12 +- - net/netfilter/nf_conntrack_bpf.c | 68 +- - net/packet/af_packet.c | 7 +- - net/sched/act_bpf.c | 4 +- - net/sched/cls_bpf.c | 4 +- - samples/bpf/cpustat_kern.c | 3 +- - scripts/Makefile.btf | 4 +- - tools/Makefile | 10 +- - .../bpf/bpftool/Documentation/bpftool-btf.rst | 6 +- - tools/bpf/bpftool/Makefile | 3 +- - tools/bpf/bpftool/bash-completion/bpftool | 3 + - tools/bpf/bpftool/btf.c | 138 +- - tools/bpf/bpftool/common.c | 2 +- - tools/bpf/bpftool/skeleton/pid_iter.bpf.c | 7 +- - tools/bpf/bpftool/skeleton/profiler.bpf.c | 14 +- - tools/include/uapi/linux/bpf.h | 15 +- - tools/lib/bpf/libbpf.c | 25 +- - tools/lib/bpf/libbpf.h | 5 +- - tools/lib/bpf/libbpf_internal.h | 10 +- - tools/sched_ext/.gitignore | 2 + - tools/sched_ext/Makefile | 246 + - tools/sched_ext/README.md | 270 + - .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 + - tools/sched_ext/include/scx/common.bpf.h | 349 + - tools/sched_ext/include/scx/common.h | 71 + - tools/sched_ext/include/scx/compat.bpf.h | 120 + - tools/sched_ext/include/scx/compat.h | 208 + - tools/sched_ext/include/scx/user_exit_info.h | 111 + - tools/sched_ext/scx_central.bpf.c | 362 + - tools/sched_ext/scx_central.c | 135 + - tools/sched_ext/scx_flatcg.bpf.c | 939 +++ - tools/sched_ext/scx_flatcg.c | 233 + - tools/sched_ext/scx_flatcg.h | 51 + - tools/sched_ext/scx_qmap.bpf.c | 728 ++ - tools/sched_ext/scx_qmap.c | 154 + - tools/sched_ext/scx_show_state.py | 39 + - tools/sched_ext/scx_simple.bpf.c | 157 + - tools/sched_ext/scx_simple.c | 107 + - .../bpf/bpf_test_no_cfi/bpf_test_no_cfi.c | 4 +- - .../selftests/bpf/bpf_testmod/bpf_testmod.c | 6 +- - tools/testing/selftests/bpf/config | 1 + - tools/testing/selftests/bpf/network_helpers.c | 32 +- - tools/testing/selftests/bpf/network_helpers.h | 8 +- - .../testing/selftests/bpf/prog_tests/bpf_nf.c | 7 + - .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 92 +- - .../bpf/prog_tests/bpf_verif_scale.c | 6 - - .../selftests/bpf/prog_tests/ctx_rewrite.c | 10 +- - .../bpf/prog_tests/sockopt_inherit.c | 2 +- - .../selftests/bpf/prog_tests/tc_redirect.c | 3 - - .../bpf/prog_tests/test_struct_ops_module.c | 57 + - .../selftests/bpf/prog_tests/verifier.c | 2 + - 
.../bpf/progs/bpf_iter_bpf_array_map.c | 6 - - .../bpf/progs/bpf_iter_bpf_percpu_array_map.c | 6 - - .../selftests/bpf/progs/struct_ops_detach.c | 10 + - .../testing/selftests/bpf/progs/test_bpf_nf.c | 108 + - .../selftests/bpf/progs/test_sockmap_kern.h | 20 +- - .../selftests/bpf/progs/test_tc_dtime.c | 39 +- - .../selftests/bpf/progs/verifier_bits_iter.c | 153 + - tools/testing/selftests/bpf/test_sockmap.c | 136 +- - .../bpf/test_tcp_check_syncookie_user.c | 4 +- - tools/testing/selftests/bpf/test_verifier.c | 5 - - tools/testing/selftests/sched_ext/.gitignore | 6 + - tools/testing/selftests/sched_ext/Makefile | 218 + - tools/testing/selftests/sched_ext/config | 9 + - .../selftests/sched_ext/create_dsq.bpf.c | 58 + - .../testing/selftests/sched_ext/create_dsq.c | 57 + - .../sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 + - .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 + - .../sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 + - .../sched_ext/ddsp_vtimelocal_fail.c | 56 + - .../selftests/sched_ext/dsp_local_on.bpf.c | 65 + - .../selftests/sched_ext/dsp_local_on.c | 58 + - .../sched_ext/enq_last_no_enq_fails.bpf.c | 21 + - .../sched_ext/enq_last_no_enq_fails.c | 60 + - .../sched_ext/enq_select_cpu_fails.bpf.c | 43 + - .../sched_ext/enq_select_cpu_fails.c | 61 + - tools/testing/selftests/sched_ext/exit.bpf.c | 84 + - tools/testing/selftests/sched_ext/exit.c | 55 + - tools/testing/selftests/sched_ext/exit_test.h | 20 + - .../testing/selftests/sched_ext/hotplug.bpf.c | 61 + - tools/testing/selftests/sched_ext/hotplug.c | 168 + - .../selftests/sched_ext/hotplug_test.h | 15 + - .../sched_ext/init_enable_count.bpf.c | 53 + - .../selftests/sched_ext/init_enable_count.c | 166 + - .../testing/selftests/sched_ext/maximal.bpf.c | 164 + - tools/testing/selftests/sched_ext/maximal.c | 51 + - .../selftests/sched_ext/maybe_null.bpf.c | 26 + - .../testing/selftests/sched_ext/maybe_null.c | 40 + - .../selftests/sched_ext/maybe_null_fail.bpf.c | 25 + - .../testing/selftests/sched_ext/minimal.bpf.c | 21 + - tools/testing/selftests/sched_ext/minimal.c | 58 + - .../selftests/sched_ext/prog_run.bpf.c | 32 + - tools/testing/selftests/sched_ext/prog_run.c | 78 + - .../testing/selftests/sched_ext/reload_loop.c | 75 + - tools/testing/selftests/sched_ext/runner.c | 201 + - tools/testing/selftests/sched_ext/scx_test.h | 131 + - .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 + - .../selftests/sched_ext/select_cpu_dfl.c | 72 + - .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 + - .../sched_ext/select_cpu_dfl_nodispatch.c | 72 + - .../sched_ext/select_cpu_dispatch.bpf.c | 41 + - .../selftests/sched_ext/select_cpu_dispatch.c | 70 + - .../select_cpu_dispatch_bad_dsq.bpf.c | 37 + - .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 + - .../select_cpu_dispatch_dbl_dsp.bpf.c | 38 + - .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 + - .../sched_ext/select_cpu_vtime.bpf.c | 92 + - .../selftests/sched_ext/select_cpu_vtime.c | 59 + - .../selftests/sched_ext/test_example.c | 49 + - tools/testing/selftests/sched_ext/util.c | 71 + - tools/testing/selftests/sched_ext/util.h | 13 + - 186 files changed, 17378 insertions(+), 663 deletions(-) - create mode 100755 .github/workflows/run-schedulers - create mode 100644 .github/workflows/sched-ext.config - create mode 100644 .github/workflows/test-kernel.yml - create mode 100644 Documentation/scheduler/sched-ext.rst - create mode 100644 include/linux/sched/ext.h - create mode 100644 include/trace/events/sched_ext.h - create mode 100644 kernel/sched/ext.c - create mode 100644 
kernel/sched/ext.h - create mode 100644 tools/sched_ext/.gitignore - create mode 100644 tools/sched_ext/Makefile - create mode 100644 tools/sched_ext/README.md - create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h - create mode 100644 tools/sched_ext/include/scx/common.bpf.h - create mode 100644 tools/sched_ext/include/scx/common.h - create mode 100644 tools/sched_ext/include/scx/compat.bpf.h - create mode 100644 tools/sched_ext/include/scx/compat.h - create mode 100644 tools/sched_ext/include/scx/user_exit_info.h - create mode 100644 tools/sched_ext/scx_central.bpf.c - create mode 100644 tools/sched_ext/scx_central.c - create mode 100644 tools/sched_ext/scx_flatcg.bpf.c - create mode 100644 tools/sched_ext/scx_flatcg.c - create mode 100644 tools/sched_ext/scx_flatcg.h - create mode 100644 tools/sched_ext/scx_qmap.bpf.c - create mode 100644 tools/sched_ext/scx_qmap.c - create mode 100644 tools/sched_ext/scx_show_state.py - create mode 100644 tools/sched_ext/scx_simple.bpf.c - create mode 100644 tools/sched_ext/scx_simple.c - create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_detach.c - create mode 100644 tools/testing/selftests/bpf/progs/verifier_bits_iter.c - create mode 100644 tools/testing/selftests/sched_ext/.gitignore - create mode 100644 tools/testing/selftests/sched_ext/Makefile - create mode 100644 tools/testing/selftests/sched_ext/config - create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c - create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c - create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c - create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c - create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/exit.c - create mode 100644 tools/testing/selftests/sched_ext/exit_test.h - create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/hotplug.c - create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h - create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c - create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/maximal.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/minimal.c - create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/prog_run.c - create mode 100644 
tools/testing/selftests/sched_ext/reload_loop.c - create mode 100644 tools/testing/selftests/sched_ext/runner.c - create mode 100644 tools/testing/selftests/sched_ext/scx_test.h - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c - create mode 100644 tools/testing/selftests/sched_ext/test_example.c - create mode 100644 tools/testing/selftests/sched_ext/util.c - create mode 100644 tools/testing/selftests/sched_ext/util.h - -diff --git a/.github/workflows/run-schedulers b/.github/workflows/run-schedulers -new file mode 100755 -index 000000000000..fc1f92270f59 ---- /dev/null -+++ b/.github/workflows/run-schedulers -@@ -0,0 +1,73 @@ -+#!/bin/bash -+# -+# Run sched-ext scheduler for TIMEOUT seconds inside virtme-ng and catch -+# potential errors, then unload the scheduler and return the exit status. -+ -+# Maximum time for each scheduler run. -+TEST_TIMEOUT=30 -+ -+# Maximum timeout for the guest used for each scheduler run (this is used to -+# hard-shutdown the guest in case of system hangs). -+GUEST_TIMEOUT=60 -+ -+# Check if virtme-ng is available. -+if [ ! -x `which vng` ]; then -+ echo "vng not found, please install virtme-ng to enable testing" -+ exit 1 -+fi -+ -+function runtest() { -+ local bin="${1}" -+ -+ if [ -z "${bin}" ]; then -+ echo "No binary passed to runtest" -+ exit 1 -+ fi -+ -+ if ! [ -f "${bin}" ]; then -+ echo "Binary ${bin} was not a regular file" -+ exit 1 -+ fi -+ -+ rm -f /tmp/output -+ (timeout --foreground --preserve-status ${GUEST_TIMEOUT} \ -+ vng --force-9p --verbose -- \ -+ "timeout --foreground --preserve-status ${TEST_TIMEOUT} ${bin}" \ -+ 2>&1 `_ -+`RFC8174 `_ -+when, and only when, they appear in all capitals, as shown here. -+ - For brevity and consistency, this document refers to families - of types using a shorthand syntax and refers to several expository, - mnemonic functions when describing the semantics of instructions. -@@ -25,7 +32,7 @@ Types - This document refers to integer types with the notation `SN` to specify - a type's signedness (`S`) and bit width (`N`), respectively. - --.. table:: Meaning of signedness notation. -+.. table:: Meaning of signedness notation - - ==== ========= - S Meaning -@@ -34,7 +41,7 @@ a type's signedness (`S`) and bit width (`N`), respectively. - s signed - ==== ========= - --.. table:: Meaning of bit-width notation. -+.. table:: Meaning of bit-width notation - - ===== ========= - N Bit width -@@ -106,9 +113,9 @@ Conformance groups - - An implementation does not need to support all instructions specified in this - document (e.g., deprecated instructions). Instead, a number of conformance --groups are specified. 
An implementation must support the base32 conformance --group and may support additional conformance groups, where supporting a --conformance group means it must support all instructions in that conformance -+groups are specified. An implementation MUST support the base32 conformance -+group and MAY support additional conformance groups, where supporting a -+conformance group means it MUST support all instructions in that conformance - group. - - The use of named conformance groups enables interoperability between a runtime -@@ -209,7 +216,7 @@ For example:: - 07 1 0 00 00 11 22 33 44 r1 += 0x11223344 // big - - Note that most instructions do not use all of the fields. --Unused fields shall be cleared to zero. -+Unused fields SHALL be cleared to zero. - - Wide instruction encoding - -------------------------- -@@ -256,18 +263,20 @@ Instruction classes - - The three least significant bits of the 'opcode' field store the instruction class: - --===== ===== =============================== =================================== --class value description reference --===== ===== =============================== =================================== --LD 0x0 non-standard load operations `Load and store instructions`_ --LDX 0x1 load into register operations `Load and store instructions`_ --ST 0x2 store from immediate operations `Load and store instructions`_ --STX 0x3 store from register operations `Load and store instructions`_ --ALU 0x4 32-bit arithmetic operations `Arithmetic and jump instructions`_ --JMP 0x5 64-bit jump operations `Arithmetic and jump instructions`_ --JMP32 0x6 32-bit jump operations `Arithmetic and jump instructions`_ --ALU64 0x7 64-bit arithmetic operations `Arithmetic and jump instructions`_ --===== ===== =============================== =================================== -+.. table:: Instruction class -+ -+ ===== ===== =============================== =================================== -+ class value description reference -+ ===== ===== =============================== =================================== -+ LD 0x0 non-standard load operations `Load and store instructions`_ -+ LDX 0x1 load into register operations `Load and store instructions`_ -+ ST 0x2 store from immediate operations `Load and store instructions`_ -+ STX 0x3 store from register operations `Load and store instructions`_ -+ ALU 0x4 32-bit arithmetic operations `Arithmetic and jump instructions`_ -+ JMP 0x5 64-bit jump operations `Arithmetic and jump instructions`_ -+ JMP32 0x6 32-bit jump operations `Arithmetic and jump instructions`_ -+ ALU64 0x7 64-bit arithmetic operations `Arithmetic and jump instructions`_ -+ ===== ===== =============================== =================================== - - Arithmetic and jump instructions - ================================ -@@ -285,12 +294,14 @@ For arithmetic and jump instructions (``ALU``, ``ALU64``, ``JMP`` and - **s (source)** - the source operand location, which unless otherwise specified is one of: - -- ====== ===== ============================================== -- source value description -- ====== ===== ============================================== -- K 0 use 32-bit 'imm' value as source operand -- X 1 use 'src_reg' register value as source operand -- ====== ===== ============================================== -+ .. 
table:: Source operand location -+ -+ ====== ===== ============================================== -+ source value description -+ ====== ===== ============================================== -+ K 0 use 32-bit 'imm' value as source operand -+ X 1 use 'src_reg' register value as source operand -+ ====== ===== ============================================== - - **instruction class** - the instruction class (see `Instruction classes`_) -@@ -305,27 +316,29 @@ The 'code' field encodes the operation as below, where 'src' refers to the - the source operand and 'dst' refers to the value of the destination - register. - --===== ===== ======= ========================================================== --name code offset description --===== ===== ======= ========================================================== --ADD 0x0 0 dst += src --SUB 0x1 0 dst -= src --MUL 0x2 0 dst \*= src --DIV 0x3 0 dst = (src != 0) ? (dst / src) : 0 --SDIV 0x3 1 dst = (src != 0) ? (dst s/ src) : 0 --OR 0x4 0 dst \|= src --AND 0x5 0 dst &= src --LSH 0x6 0 dst <<= (src & mask) --RSH 0x7 0 dst >>= (src & mask) --NEG 0x8 0 dst = -dst --MOD 0x9 0 dst = (src != 0) ? (dst % src) : dst --SMOD 0x9 1 dst = (src != 0) ? (dst s% src) : dst --XOR 0xa 0 dst ^= src --MOV 0xb 0 dst = src --MOVSX 0xb 8/16/32 dst = (s8,s16,s32)src --ARSH 0xc 0 :term:`sign extending` dst >>= (src & mask) --END 0xd 0 byte swap operations (see `Byte swap instructions`_ below) --===== ===== ======= ========================================================== -+.. table:: Arithmetic instructions -+ -+ ===== ===== ======= ========================================================== -+ name code offset description -+ ===== ===== ======= ========================================================== -+ ADD 0x0 0 dst += src -+ SUB 0x1 0 dst -= src -+ MUL 0x2 0 dst \*= src -+ DIV 0x3 0 dst = (src != 0) ? (dst / src) : 0 -+ SDIV 0x3 1 dst = (src != 0) ? (dst s/ src) : 0 -+ OR 0x4 0 dst \|= src -+ AND 0x5 0 dst &= src -+ LSH 0x6 0 dst <<= (src & mask) -+ RSH 0x7 0 dst >>= (src & mask) -+ NEG 0x8 0 dst = -dst -+ MOD 0x9 0 dst = (src != 0) ? (dst % src) : dst -+ SMOD 0x9 1 dst = (src != 0) ? (dst s% src) : dst -+ XOR 0xa 0 dst ^= src -+ MOV 0xb 0 dst = src -+ MOVSX 0xb 8/16/32 dst = (s8,s16,s32)src -+ ARSH 0xc 0 :term:`sign extending` dst >>= (src & mask) -+ END 0xd 0 byte swap operations (see `Byte swap instructions`_ below) -+ ===== ===== ======= ========================================================== - - Underflow and overflow are allowed during arithmetic operations, meaning - the 64-bit or 32-bit value will wrap. If BPF program execution would -@@ -374,7 +387,7 @@ interpreted as a 64-bit signed value. - Note that there are varying definitions of the signed modulo operation - when the dividend or divisor are negative, where implementations often - vary by language such that Python, Ruby, etc. differ from C, Go, Java, --etc. This specification requires that signed modulo use truncated division -+etc. This specification requires that signed modulo MUST use truncated division - (where -13 % 3 == -1) as implemented in C, Go, etc.:: - - a % n = a - n * trunc(a / n) -@@ -386,6 +399,19 @@ The ``MOVSX`` instruction does a move operation with sign extension. - operands into 64-bit operands. Unlike other arithmetic instructions, - ``MOVSX`` is only defined for register source operands (``X``). 
- -+``{MOV, K, ALU64}`` means:: -+ -+ dst = (s64)imm -+ -+``{MOV, X, ALU}`` means:: -+ -+ dst = (u32)src -+ -+``{MOVSX, X, ALU}`` with 'offset' 8 means:: -+ -+ dst = (u32)(s32)(s8)src -+ -+ - The ``NEG`` instruction is only defined when the source bit is clear - (``K``). - -@@ -404,15 +430,17 @@ only and do not use a separate source register or immediate value. - For ``ALU``, the 1-bit source operand field in the opcode is used to - select what byte order the operation converts from or to. For - ``ALU64``, the 1-bit source operand field in the opcode is reserved --and must be set to 0. -+and MUST be set to 0. - --===== ======== ===== ================================================= --class source value description --===== ======== ===== ================================================= --ALU TO_LE 0 convert between host byte order and little endian --ALU TO_BE 1 convert between host byte order and big endian --ALU64 Reserved 0 do byte swap unconditionally --===== ======== ===== ================================================= -+.. table:: Byte swap instructions -+ -+ ===== ======== ===== ================================================= -+ class source value description -+ ===== ======== ===== ================================================= -+ ALU TO_LE 0 convert between host byte order and little endian -+ ALU TO_BE 1 convert between host byte order and big endian -+ ALU64 Reserved 0 do byte swap unconditionally -+ ===== ======== ===== ================================================= - - The 'imm' field encodes the width of the swap operations. The following widths - are supported: 16, 32 and 64. Width 64 operations belong to the base64 -@@ -448,27 +476,29 @@ otherwise identical operations, and indicates the base64 conformance - group unless otherwise specified. - The 'code' field encodes the operation as below: - --======== ===== ======= ================================= =================================================== --code value src_reg description notes --======== ===== ======= ================================= =================================================== --JA 0x0 0x0 PC += offset {JA, K, JMP} only --JA 0x0 0x0 PC += imm {JA, K, JMP32} only --JEQ 0x1 any PC += offset if dst == src --JGT 0x2 any PC += offset if dst > src unsigned --JGE 0x3 any PC += offset if dst >= src unsigned --JSET 0x4 any PC += offset if dst & src --JNE 0x5 any PC += offset if dst != src --JSGT 0x6 any PC += offset if dst > src signed --JSGE 0x7 any PC += offset if dst >= src signed --CALL 0x8 0x0 call helper function by static ID {CALL, K, JMP} only, see `Helper functions`_ --CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ --CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ --EXIT 0x9 0x0 return {CALL, K, JMP} only --JLT 0xa any PC += offset if dst < src unsigned --JLE 0xb any PC += offset if dst <= src unsigned --JSLT 0xc any PC += offset if dst < src signed --JSLE 0xd any PC += offset if dst <= src signed --======== ===== ======= ================================= =================================================== -+.. 
table:: Jump instructions -+ -+ ======== ===== ======= ================================= =================================================== -+ code value src_reg description notes -+ ======== ===== ======= ================================= =================================================== -+ JA 0x0 0x0 PC += offset {JA, K, JMP} only -+ JA 0x0 0x0 PC += imm {JA, K, JMP32} only -+ JEQ 0x1 any PC += offset if dst == src -+ JGT 0x2 any PC += offset if dst > src unsigned -+ JGE 0x3 any PC += offset if dst >= src unsigned -+ JSET 0x4 any PC += offset if dst & src -+ JNE 0x5 any PC += offset if dst != src -+ JSGT 0x6 any PC += offset if dst > src signed -+ JSGE 0x7 any PC += offset if dst >= src signed -+ CALL 0x8 0x0 call helper function by static ID {CALL, K, JMP} only, see `Helper functions`_ -+ CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ -+ CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ -+ EXIT 0x9 0x0 return {CALL, K, JMP} only -+ JLT 0xa any PC += offset if dst < src unsigned -+ JLE 0xb any PC += offset if dst <= src unsigned -+ JSLT 0xc any PC += offset if dst < src signed -+ JSLE 0xd any PC += offset if dst <= src signed -+ ======== ===== ======= ================================= =================================================== - - where 'PC' denotes the program counter, and the offset to increment by - is in units of 64-bit instructions relative to the instruction following -@@ -476,9 +506,6 @@ the jump instruction. Thus 'PC += 1' skips execution of the next - instruction if it's a basic instruction or results in undefined behavior - if the next instruction is a 128-bit wide instruction. - --The BPF program needs to store the return value into register R0 before doing an --``EXIT``. -- - Example: - - ``{JSGE, X, JMP32}`` means:: -@@ -487,6 +514,10 @@ Example: - - where 's>=' indicates a signed '>=' comparison. - -+``{JLE, K, JMP}`` means:: -+ -+ if dst <= (u64)(s64)imm goto +offset -+ - ``{JA, K, JMP32}`` means:: - - gotol +imm -@@ -515,14 +546,16 @@ for each program type, but static IDs are unique across all program types. - - Platforms that support the BPF Type Format (BTF) support identifying - a helper function by a BTF ID encoded in the 'imm' field, where the BTF ID --identifies the helper name and type. -+identifies the helper name and type. Further documentation of BTF -+is outside the scope of this document and is left for future work. - - Program-local functions - ~~~~~~~~~~~~~~~~~~~~~~~ - Program-local functions are functions exposed by the same BPF program as the --caller, and are referenced by offset from the call instruction, similar to --``JA``. The offset is encoded in the 'imm' field of the call instruction. --An ``EXIT`` within the program-local function will return to the caller. -+caller, and are referenced by offset from the instruction following the call -+instruction, similar to ``JA``. The offset is encoded in the 'imm' field of -+the call instruction. An ``EXIT`` within the program-local function will -+return to the caller. - - Load and store instructions - =========================== -@@ -537,6 +570,8 @@ For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the - **mode** - The mode modifier is one of: - -+ .. 
table:: Mode modifier -+ - ============= ===== ==================================== ============= - mode modifier value description reference - ============= ===== ==================================== ============= -@@ -551,6 +586,8 @@ For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the - **sz (size)** - The size modifier is one of: - -+ .. table:: Size modifier -+ - ==== ===== ===================== - size value description - ==== ===== ===================== -@@ -619,14 +656,16 @@ The 'imm' field is used to encode the actual atomic operation. - Simple atomic operation use a subset of the values defined to encode - arithmetic operations in the 'imm' field to encode the atomic operation: - --======== ===== =========== --imm value description --======== ===== =========== --ADD 0x00 atomic add --OR 0x40 atomic or --AND 0x50 atomic and --XOR 0xa0 atomic xor --======== ===== =========== -+.. table:: Simple atomic operations -+ -+ ======== ===== =========== -+ imm value description -+ ======== ===== =========== -+ ADD 0x00 atomic add -+ OR 0x40 atomic or -+ AND 0x50 atomic and -+ XOR 0xa0 atomic xor -+ ======== ===== =========== - - - ``{ATOMIC, W, STX}`` with 'imm' = ADD means:: -@@ -640,13 +679,15 @@ XOR 0xa0 atomic xor - In addition to the simple atomic operations, there also is a modifier and - two complex atomic operations: - --=========== ================ =========================== --imm value description --=========== ================ =========================== --FETCH 0x01 modifier: return old value --XCHG 0xe0 | FETCH atomic exchange --CMPXCHG 0xf0 | FETCH atomic compare and exchange --=========== ================ =========================== -+.. table:: Complex atomic operations -+ -+ =========== ================ =========================== -+ imm value description -+ =========== ================ =========================== -+ FETCH 0x01 modifier: return old value -+ XCHG 0xe0 | FETCH atomic exchange -+ CMPXCHG 0xf0 | FETCH atomic compare and exchange -+ =========== ================ =========================== - - The ``FETCH`` modifier is optional for simple atomic operations, and - always set for the complex atomic operations. If the ``FETCH`` flag -@@ -673,17 +714,19 @@ The following table defines a set of ``{IMM, DW, LD}`` instructions - with opcode subtypes in the 'src_reg' field, using new terms such as "map" - defined further below: - --======= ========================================= =========== ============== --src_reg pseudocode imm type dst type --======= ========================================= =========== ============== --0x0 dst = (next_imm << 32) | imm integer integer --0x1 dst = map_by_fd(imm) map fd map --0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data address --0x3 dst = var_addr(imm) variable id data address --0x4 dst = code_addr(imm) integer code address --0x5 dst = map_by_idx(imm) map index map --0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data address --======= ========================================= =========== ============== -+.. 
table:: 64-bit immediate instructions -+ -+ ======= ========================================= =========== ============== -+ src_reg pseudocode imm type dst type -+ ======= ========================================= =========== ============== -+ 0x0 dst = (next_imm << 32) | imm integer integer -+ 0x1 dst = map_by_fd(imm) map fd map -+ 0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data address -+ 0x3 dst = var_addr(imm) variable id data address -+ 0x4 dst = code_addr(imm) integer code address -+ 0x5 dst = map_by_idx(imm) map index map -+ 0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data address -+ ======= ========================================= =========== ============== - - where - -@@ -725,5 +768,5 @@ carried over from classic BPF. These instructions used an instruction - class of ``LD``, a size modifier of ``W``, ``H``, or ``B``, and a - mode modifier of ``ABS`` or ``IND``. The 'dst_reg' and 'offset' fields were - set to zero, and 'src_reg' was set to zero for ``ABS``. However, these --instructions are deprecated and should no longer be used. All legacy packet -+instructions are deprecated and SHOULD no longer be used. All legacy packet - access instructions belong to the "packet" conformance group. -diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst -index 43bd8a145b7a..0611dc3dda8e 100644 ---- a/Documentation/scheduler/index.rst -+++ b/Documentation/scheduler/index.rst -@@ -20,6 +20,7 @@ Scheduler - sched-nice-design - sched-rt-group - sched-stats -+ sched-ext - sched-debug - - text_files -diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst -new file mode 100644 -index 000000000000..497eeaa5ecbe ---- /dev/null -+++ b/Documentation/scheduler/sched-ext.rst -@@ -0,0 +1,314 @@ -+========================== -+Extensible Scheduler Class -+========================== -+ -+sched_ext is a scheduler class whose behavior can be defined by a set of BPF -+programs - the BPF scheduler. -+ -+* sched_ext exports a full scheduling interface so that any scheduling -+ algorithm can be implemented on top. -+ -+* The BPF scheduler can group CPUs however it sees fit and schedule them -+ together, as tasks aren't tied to specific CPUs at the time of wakeup. -+ -+* The BPF scheduler can be turned on and off dynamically anytime. -+ -+* The system integrity is maintained no matter what the BPF scheduler does. -+ The default scheduling behavior is restored anytime an error is detected, -+ a runnable task stalls, or on invoking the SysRq key sequence -+ :kbd:`SysRq-S`. -+ -+* When the BPF scheduler triggers an error, debug information is dumped to -+ aid debugging. The debug dump is passed to and printed out by the -+ scheduler binary. The debug dump can also be accessed through the -+ `sched_ext_dump` tracepoint. The SysRq key sequence :kbd:`SysRq-D` -+ triggers a debug dump. This doesn't terminate the BPF scheduler and can -+ only be read through the tracepoint. -+ -+Switching to and from sched_ext -+=============================== -+ -+``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and -+``tools/sched_ext`` contains the example schedulers. The following config -+options should be enabled to use sched_ext: -+ -+.. 
code-block:: none -+ -+ CONFIG_BPF=y -+ CONFIG_SCHED_CLASS_EXT=y -+ CONFIG_BPF_SYSCALL=y -+ CONFIG_BPF_JIT=y -+ CONFIG_DEBUG_INFO_BTF=y -+ CONFIG_BPF_JIT_ALWAYS_ON=y -+ CONFIG_BPF_JIT_DEFAULT_ON=y -+ CONFIG_PAHOLE_HAS_SPLIT_BTF=y -+ CONFIG_PAHOLE_HAS_BTF_TAG=y -+ -+sched_ext is used only when the BPF scheduler is loaded and running. -+ -+If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be -+treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is -+loaded. On load, such tasks will be switched to and scheduled by sched_ext. -+ -+The BPF scheduler can choose to schedule all normal and lower class tasks by -+calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this -+case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and -+``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers, -+this mode can be selected with the ``-a`` option. -+ -+Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or -+detection of any internal error including stalled runnable tasks aborts the -+BPF scheduler and reverts all tasks back to CFS. -+ -+.. code-block:: none -+ -+ # make -j16 -C tools/sched_ext -+ # tools/sched_ext/scx_simple -+ local=0 global=3 -+ local=5 global=24 -+ local=9 global=44 -+ local=13 global=56 -+ local=17 global=72 -+ ^CEXIT: BPF scheduler unregistered -+ -+The current status of the BPF scheduler can be determined as follows: -+ -+.. code-block:: none -+ -+ # cat /sys/kernel/sched_ext/state -+ enabled -+ # cat /sys/kernel/sched_ext/root/ops -+ simple -+ -+``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more -+detailed information: -+ -+.. code-block:: none -+ -+ # tools/sched_ext/scx_show_state.py -+ ops : simple -+ enabled : 1 -+ switching_all : 1 -+ switched_all : 1 -+ enable_state : enabled (2) -+ bypass_depth : 0 -+ nr_rejected : 0 -+ -+If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can -+be determined as follows: -+ -+.. code-block:: none -+ -+ # grep ext /proc/self/sched -+ ext.enabled : 1 -+ -+The Basics -+========== -+ -+Userspace can implement an arbitrary BPF scheduler by loading a set of BPF -+programs that implement ``struct sched_ext_ops``. The only mandatory field -+is ``ops.name`` which must be a valid BPF object name. All operations are -+optional. The following modified excerpt is from -+``tools/sched/scx_simple.bpf.c`` showing a minimal global FIFO scheduler. -+ -+.. code-block:: c -+ -+ /* -+ * Decide which CPU a task should be migrated to before being -+ * enqueued (either at wakeup, fork time, or exec time). If an -+ * idle core is found by the default ops.select_cpu() implementation, -+ * then dispatch the task directly to SCX_DSQ_LOCAL and skip the -+ * ops.enqueue() callback. -+ * -+ * Note that this implementation has exactly the same behavior as the -+ * default ops.select_cpu implementation. The behavior of the scheduler -+ * would be exactly same if the implementation just didn't define the -+ * simple_select_cpu() struct_ops prog. -+ */ -+ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+ { -+ s32 cpu; -+ /* Need to initialize or the BPF verifier will reject the program */ -+ bool direct = false; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct); -+ -+ if (direct) -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ -+ return cpu; -+ } -+ -+ /* -+ * Do a direct dispatch of a task to the global DSQ. 
This ops.enqueue() -+ * callback will only be invoked if we failed to find a core to dispatch -+ * to in ops.select_cpu() above. -+ * -+ * Note that this implementation has exactly the same behavior as the -+ * default ops.enqueue implementation, which just dispatches the task -+ * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly same -+ * if the implementation just didn't define the simple_enqueue struct_ops -+ * prog. -+ */ -+ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+ { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ -+ s32 BPF_STRUCT_OPS(simple_init) -+ { -+ /* -+ * All SCHED_OTHER, SCHED_IDLE, and SCHED_BATCH tasks should -+ * use sched_ext. -+ */ -+ scx_bpf_switch_all(); -+ return 0; -+ } -+ -+ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+ { -+ exit_type = ei->type; -+ } -+ -+ SEC(".struct_ops") -+ struct sched_ext_ops simple_ops = { -+ .select_cpu = (void *)simple_select_cpu, -+ .enqueue = (void *)simple_enqueue, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple", -+ }; -+ -+Dispatch Queues -+--------------- -+ -+To match the impedance between the scheduler core and the BPF scheduler, -+sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a -+priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), -+and one local dsq per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage -+an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and -+``scx_bpf_destroy_dsq()``. -+ -+A CPU always executes a task from its local DSQ. A task is "dispatched" to a -+DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's -+local DSQ. -+ -+When a CPU is looking for the next task to run, if the local DSQ is not -+empty, the first task is picked. Otherwise, the CPU tries to consume the -+global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` -+is invoked. -+ -+Scheduling Cycle -+---------------- -+ -+The following briefly shows how a waking task is scheduled and executed. -+ -+1. When a task is waking up, ``ops.select_cpu()`` is the first operation -+ invoked. This serves two purposes. First, CPU selection optimization -+ hint. Second, waking up the selected CPU if idle. -+ -+ The CPU selected by ``ops.select_cpu()`` is an optimization hint and not -+ binding. The actual decision is made at the last step of scheduling. -+ However, there is a small performance gain if the CPU -+ ``ops.select_cpu()`` returns matches the CPU the task eventually runs on. -+ -+ A side-effect of selecting a CPU is waking it up from idle. While a BPF -+ scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, -+ using ``ops.select_cpu()`` judiciously can be simpler and more efficient. -+ -+ A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by -+ calling ``scx_bpf_dispatch()``. If the task is dispatched to -+ ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the -+ local DSQ of whichever CPU is returned from ``ops.select_cpu()``. -+ Additionally, dispatching directly from ``ops.select_cpu()`` will cause the -+ ``ops.enqueue()`` callback to be skipped. -+ -+ Note that the scheduler core will ignore an invalid CPU selection, for -+ example, if it's outside the allowed cpumask of the task. -+ -+2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the -+ task was dispatched directly from ``ops.select_cpu()``). 
``ops.enqueue()`` -+ can make one of the following decisions: -+ -+ * Immediately dispatch the task to either the global or local DSQ by -+ calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or -+ ``SCX_DSQ_LOCAL``, respectively. -+ -+ * Immediately dispatch the task to a custom DSQ by calling -+ ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. -+ -+ * Queue the task on the BPF side. -+ -+3. When a CPU is ready to schedule, it first looks at its local DSQ. If -+ empty, it then looks at the global DSQ. If there still isn't a task to -+ run, ``ops.dispatch()`` is invoked which can use the following two -+ functions to populate the local DSQ. -+ -+ * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can -+ be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, -+ ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` -+ currently can't be called with BPF locks held, this is being worked on -+ and will be supported. ``scx_bpf_dispatch()`` schedules dispatching -+ rather than performing them immediately. There can be up to -+ ``ops.dispatch_max_batch`` pending tasks. -+ -+ * ``scx_bpf_consume()`` tranfers a task from the specified non-local DSQ -+ to the dispatching DSQ. This function cannot be called with any BPF -+ locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks -+ before trying to consume the specified DSQ. -+ -+4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, -+ the CPU runs the first one. If empty, the following steps are taken: -+ -+ * Try to consume the global DSQ. If successful, run the task. -+ -+ * If ``ops.dispatch()`` has dispatched any tasks, retry #3. -+ -+ * If the previous task is an SCX task and still runnable, keep executing -+ it (see ``SCX_OPS_ENQ_LAST``). -+ -+ * Go idle. -+ -+Note that the BPF scheduler can always choose to dispatch tasks immediately -+in ``ops.enqueue()`` as illustrated in the above simple example. If only the -+built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as -+a task is never queued on the BPF scheduler and both the local and global -+DSQs are consumed automatically. -+ -+``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use -+``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as -+``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue -+dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the -+function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for -+more information. -+ -+Where to Look -+============= -+ -+* ``include/linux/sched/ext.h`` defines the core data structures, ops table -+ and constants. -+ -+* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. -+ The functions prefixed with ``scx_bpf_`` can be called from the BPF -+ scheduler. -+ -+* ``tools/sched_ext/`` hosts example BPF scheduler implementations. -+ -+ * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a -+ custom DSQ. -+ -+ * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five -+ levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. -+ -+ABI Instability -+=============== -+ -+The APIs provided by sched_ext to BPF schedulers programs have no stability -+guarantees. This includes the ops table callbacks and constants defined in -+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in -+``kernel/sched/ext.c``. 
-+ -+While we will attempt to provide a relatively stable API surface when -+possible, they are subject to change without warning between kernel -+versions. -diff --git a/MAINTAINERS b/MAINTAINERS -index 7bcdcb4b7806..03e2d4690d51 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -19964,6 +19964,19 @@ F: include/linux/wait.h - F: include/uapi/linux/sched.h - F: kernel/sched/ - -+SCHEDULER - SCHED_EXT -+R: Tejun Heo -+R: David Vernet -+L: linux-kernel@vger.kernel.org -+S: Maintained -+W: https://github.com/sched-ext/scx -+T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git -+F: include/linux/sched/ext.h -+F: kernel/sched/ext.h -+F: kernel/sched/ext.c -+F: tools/sched_ext/ -+F: tools/testing/selftests/sched_ext -+ - SCSI LIBSAS SUBSYSTEM - R: John Garry - R: Jason Yan -diff --git a/Makefile b/Makefile -index 6235b1ebb38b..0e3c1aadad69 100644 ---- a/Makefile -+++ b/Makefile -@@ -1358,6 +1358,12 @@ ifneq ($(wildcard $(resolve_btfids_O)),) - $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean - endif - -+tools-clean-targets := sched_ext -+PHONY += $(tools-clean-targets) -+$(tools-clean-targets): -+ $(Q)$(MAKE) -sC tools $@_clean -+tools_clean: $(tools-clean-targets) -+ - # Clear a bunch of variables before executing the submake - ifeq ($(quiet),silent_) - tools_silent=s -@@ -1530,7 +1536,7 @@ PHONY += $(mrproper-dirs) mrproper - $(mrproper-dirs): - $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) - --mrproper: clean $(mrproper-dirs) -+mrproper: clean $(mrproper-dirs) tools_clean - $(call cmd,rmfiles) - @find . $(RCS_FIND_IGNORE) \ - \( -name '*.rmeta' \) \ -diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig -index 0525ee2d63c7..9f38a5ecbee3 100644 ---- a/arch/riscv/Kconfig -+++ b/arch/riscv/Kconfig -@@ -610,6 +610,18 @@ config TOOLCHAIN_HAS_VECTOR_CRYPTO - def_bool $(as-instr, .option arch$(comma) +v$(comma) +zvkb) - depends on AS_HAS_OPTION_ARCH - -+config RISCV_ISA_ZBA -+ bool "Zba extension support for bit manipulation instructions" -+ default y -+ help -+ Add support for enabling optimisations in the kernel when the Zba -+ extension is detected at boot. -+ -+ The Zba extension provides instructions to accelerate the generation -+ of addresses that index into arrays of basic data types. -+ -+ If you don't know what to do here, say Y. -+ - config RISCV_ISA_ZBB - bool "Zbb extension support for bit manipulation instructions" - depends on TOOLCHAIN_HAS_ZBB -diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h -index fdbf88ca8b70..1d1c78d4cff1 100644 ---- a/arch/riscv/net/bpf_jit.h -+++ b/arch/riscv/net/bpf_jit.h -@@ -18,6 +18,11 @@ static inline bool rvc_enabled(void) - return IS_ENABLED(CONFIG_RISCV_ISA_C); - } - -+static inline bool rvzba_enabled(void) -+{ -+ return IS_ENABLED(CONFIG_RISCV_ISA_ZBA) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBA); -+} -+ - static inline bool rvzbb_enabled(void) - { - return IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBB); -@@ -737,6 +742,17 @@ static inline u16 rvc_swsp(u32 imm8, u8 rs2) - return rv_css_insn(0x6, imm, rs2, 0x2); - } - -+/* RVZBA instructions. */ -+static inline u32 rvzba_sh2add(u8 rd, u8 rs1, u8 rs2) -+{ -+ return rv_r_insn(0x10, rs2, rs1, 0x4, rd, 0x33); -+} -+ -+static inline u32 rvzba_sh3add(u8 rd, u8 rs1, u8 rs2) -+{ -+ return rv_r_insn(0x10, rs2, rs1, 0x6, rd, 0x33); -+} -+ - /* RVZBB instructions. 
*/ - static inline u32 rvzbb_sextb(u8 rd, u8 rs1) - { -@@ -939,6 +955,14 @@ static inline u16 rvc_sdsp(u32 imm9, u8 rs2) - return rv_css_insn(0x7, imm, rs2, 0x2); - } - -+/* RV64-only ZBA instructions. */ -+ -+static inline u32 rvzba_zextw(u8 rd, u8 rs1) -+{ -+ /* add.uw rd, rs1, ZERO */ -+ return rv_r_insn(0x04, RV_REG_ZERO, rs1, 0, rd, 0x3b); -+} -+ - #endif /* __riscv_xlen == 64 */ - - /* Helper functions that emit RVC instructions when possible. */ -@@ -1082,6 +1106,28 @@ static inline void emit_sw(u8 rs1, s32 off, u8 rs2, struct rv_jit_context *ctx) - emit(rv_sw(rs1, off, rs2), ctx); - } - -+static inline void emit_sh2add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) -+{ -+ if (rvzba_enabled()) { -+ emit(rvzba_sh2add(rd, rs1, rs2), ctx); -+ return; -+ } -+ -+ emit_slli(rd, rs1, 2, ctx); -+ emit_add(rd, rd, rs2, ctx); -+} -+ -+static inline void emit_sh3add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) -+{ -+ if (rvzba_enabled()) { -+ emit(rvzba_sh3add(rd, rs1, rs2), ctx); -+ return; -+ } -+ -+ emit_slli(rd, rs1, 3, ctx); -+ emit_add(rd, rd, rs2, ctx); -+} -+ - /* RV64-only helper functions. */ - #if __riscv_xlen == 64 - -@@ -1161,6 +1207,11 @@ static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) - - static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) - { -+ if (rvzba_enabled()) { -+ emit(rvzba_zextw(rd, rs), ctx); -+ return; -+ } -+ - emit_slli(rd, rs, 32, ctx); - emit_srli(rd, rd, 32, ctx); - } -diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c -index f5ba73bb153d..592dd86fbf81 100644 ---- a/arch/riscv/net/bpf_jit_comp32.c -+++ b/arch/riscv/net/bpf_jit_comp32.c -@@ -811,8 +811,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) - * if (!prog) - * goto out; - */ -- emit(rv_slli(RV_REG_T0, lo(idx_reg), 2), ctx); -- emit(rv_add(RV_REG_T0, RV_REG_T0, lo(arr_reg)), ctx); -+ emit_sh2add(RV_REG_T0, lo(idx_reg), lo(arr_reg), ctx); - off = offsetof(struct bpf_array, ptrs); - if (is_12b_check(off, insn)) - return -1; -diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c -index 79a001d5533e..d5cebb0b0afe 100644 ---- a/arch/riscv/net/bpf_jit_comp64.c -+++ b/arch/riscv/net/bpf_jit_comp64.c -@@ -380,8 +380,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) - * if (!prog) - * goto out; - */ -- emit_slli(RV_REG_T2, RV_REG_A2, 3, ctx); -- emit_add(RV_REG_T2, RV_REG_T2, RV_REG_A1, ctx); -+ emit_sh3add(RV_REG_T2, RV_REG_A2, RV_REG_A1, ctx); - off = offsetof(struct bpf_array, ptrs); - if (is_12b_check(off, insn)) - return -1; -@@ -537,8 +536,10 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, - /* r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg); */ - case BPF_CMPXCHG: - r0 = bpf_to_rv_reg(BPF_REG_0, ctx); -- emit(is64 ? rv_addi(RV_REG_T2, r0, 0) : -- rv_addiw(RV_REG_T2, r0, 0), ctx); -+ if (is64) -+ emit_mv(RV_REG_T2, r0, ctx); -+ else -+ emit_addiw(RV_REG_T2, r0, 0, ctx); - emit(is64 ? 
rv_lr_d(r0, 0, rd, 0, 0) : - rv_lr_w(r0, 0, rd, 0, 0), ctx); - jmp_offset = ninsns_rvoff(8); -@@ -868,7 +869,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, - stack_size += 8; - sreg_off = stack_size; - -- stack_size = round_up(stack_size, 16); -+ stack_size = round_up(stack_size, STACK_ALIGN); - - if (!is_struct_ops) { - /* For the trampoline called from function entry, -@@ -1097,12 +1098,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, - /* Load current CPU number in T1 */ - emit_ld(RV_REG_T1, offsetof(struct thread_info, cpu), - RV_REG_TP, ctx); -- /* << 3 because offsets are 8 bytes */ -- emit_slli(RV_REG_T1, RV_REG_T1, 3, ctx); - /* Load address of __per_cpu_offset array in T2 */ - emit_addr(RV_REG_T2, (u64)&__per_cpu_offset, extra_pass, ctx); -- /* Add offset of current CPU to __per_cpu_offset */ -- emit_add(RV_REG_T1, RV_REG_T2, RV_REG_T1, ctx); -+ /* Get address of __per_cpu_offset[cpu] in T1 */ -+ emit_sh3add(RV_REG_T1, RV_REG_T1, RV_REG_T2, ctx); - /* Load __per_cpu_offset[cpu] in T1 */ - emit_ld(RV_REG_T1, 0, RV_REG_T1, ctx); - /* Add the offset to Rd */ -@@ -1960,7 +1959,7 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) - { - int i, stack_adjust = 0, store_offset, bpf_stack_adjust; - -- bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); -+ bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, STACK_ALIGN); - if (bpf_stack_adjust) - mark_fp(ctx); - -@@ -1982,7 +1981,7 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) - if (ctx->arena_vm_start) - stack_adjust += 8; - -- stack_adjust = round_up(stack_adjust, 16); -+ stack_adjust = round_up(stack_adjust, STACK_ALIGN); - stack_adjust += bpf_stack_adjust; - - store_offset = stack_adjust - 8; -diff --git a/drivers/isdn/mISDN/dsp_blowfish.c b/drivers/isdn/mISDN/dsp_blowfish.c -index 0aa572f3858d..0e77c282c862 100644 ---- a/drivers/isdn/mISDN/dsp_blowfish.c -+++ b/drivers/isdn/mISDN/dsp_blowfish.c -@@ -73,11 +73,6 @@ - * crypto-api for faster implementation - */ - --struct bf_ctx { -- u32 p[18]; -- u32 s[1024]; --}; -- - static const u32 bf_pbox[16 + 2] = { - 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, - 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, -diff --git a/drivers/net/ethernet/8390/ne2k-pci.c b/drivers/net/ethernet/8390/ne2k-pci.c -index 65f56a98c0a0..1a34da07c0db 100644 ---- a/drivers/net/ethernet/8390/ne2k-pci.c -+++ b/drivers/net/ethernet/8390/ne2k-pci.c -@@ -186,17 +186,6 @@ static void ne2k_pci_block_output(struct net_device *dev, const int count, - static const struct ethtool_ops ne2k_pci_ethtool_ops; - - -- --/* There is no room in the standard 8390 structure for extra info we need, -- * so we build a meta/outer-wrapper structure.. -- */ --struct ne2k_pci_card { -- struct net_device *dev; -- struct pci_dev *pci_dev; --}; -- -- -- - /* NEx000-clone boards have a Station Address (SA) PROM (SAPROM) in the packet - * buffer memory space. By-the-spec NE2000 clones have 0x57,0x57 in bytes - * 0x0e,0x0f of the SAPROM, while other supposed NE2000 clones must be -diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c -index 857361c74f5d..e1b8794b14c9 100644 ---- a/drivers/net/ethernet/adaptec/starfire.c -+++ b/drivers/net/ethernet/adaptec/starfire.c -@@ -441,14 +441,6 @@ enum rx_desc_bits { - }; - - /* Completion queue entry. */ --struct short_rx_done_desc { -- __le32 status; /* Low 16 bits is length. 
*/ --}; --struct basic_rx_done_desc { -- __le32 status; /* Low 16 bits is length. */ -- __le16 vlanid; -- __le16 status2; --}; - struct csum_rx_done_desc { - __le32 status; /* Low 16 bits is length. */ - __le16 csum; /* Partial checksum */ -diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c -index 34f02a8ec2ca..1d79f6eaa41f 100644 ---- a/drivers/net/ethernet/cavium/liquidio/lio_main.c -+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c -@@ -92,12 +92,6 @@ static int octeon_console_debug_enabled(u32 console) - /* time to wait for possible in-flight requests in milliseconds */ - #define WAIT_INFLIGHT_REQUEST msecs_to_jiffies(1000) - --struct oct_link_status_resp { -- u64 rh; -- struct oct_link_info link_info; -- u64 status; --}; -- - struct oct_timestamp_resp { - u64 rh; - u64 timestamp; -diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c -index 0d6ee30affb9..eef12fdd246d 100644 ---- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c -+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c -@@ -30,11 +30,6 @@ - #include "cn23xx_pf_device.h" - #include "cn23xx_vf_device.h" - --struct niclist { -- struct list_head list; -- void *ptr; --}; -- - struct __dispatch { - struct list_head list; - struct octeon_recv_info *rinfo; -diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c -index 98688e4dbec5..febeadfdd5a5 100644 ---- a/drivers/net/ethernet/mellanox/mlx4/main.c -+++ b/drivers/net/ethernet/mellanox/mlx4/main.c -@@ -169,12 +169,6 @@ module_param_array(port_type_array, int, &arr_argc, 0444); - MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default " - "1 for IB, 2 for Ethernet"); - --struct mlx4_port_config { -- struct list_head list; -- enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; -- struct pci_dev *pdev; --}; -- - static atomic_t pf_loading = ATOMIC_INIT(0); - - static int mlx4_devlink_ierr_reset_get(struct devlink *devlink, u32 id, -diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c -index 5a2c38b63012..7a5cc49ebec6 100644 ---- a/drivers/net/usb/lan78xx.c -+++ b/drivers/net/usb/lan78xx.c -@@ -380,11 +380,6 @@ struct skb_data { /* skb->cb is one of these */ - int num_of_packet; - }; - --struct usb_context { -- struct usb_ctrlrequest req; -- struct lan78xx_net *dev; --}; -- - #define EVENT_TX_HALT 0 - #define EVENT_RX_HALT 1 - #define EVENT_RX_MEMORY 2 -diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c -index 0726e18bee6f..78c821349f48 100644 ---- a/drivers/net/usb/smsc75xx.c -+++ b/drivers/net/usb/smsc75xx.c -@@ -61,11 +61,6 @@ struct smsc75xx_priv { - u8 suspend_flags; - }; - --struct usb_context { -- struct usb_ctrlrequest req; -- struct usbnet *dev; --}; -- - static bool turbo_mode = true; - module_param(turbo_mode, bool, 0644); - MODULE_PARM_DESC(turbo_mode, "Enable multiple frames per Rx transaction"); -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index e5974b8239c9..167e877b8bef 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -531,6 +531,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { - NULL, /* P */ - NULL, /* Q */ - &sysrq_replay_logs_op, /* R */ -+ /* S: May be registered by sched_ext for resetting */ - NULL, /* S */ - NULL, /* T */ - NULL, /* U */ -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 5703526d6ebf..2e712183ba09 100644 ---- 
a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -133,6 +133,7 @@ - *(__dl_sched_class) \ - *(__rt_sched_class) \ - *(__fair_sched_class) \ -+ *(__ext_sched_class) \ - *(__idle_sched_class) \ - __sched_class_lowest = .; - -diff --git a/include/linux/bpf.h b/include/linux/bpf.h -index 5e694a308081..a834f4b761bc 100644 ---- a/include/linux/bpf.h -+++ b/include/linux/bpf.h -@@ -1612,6 +1612,7 @@ struct bpf_link_ops { - struct bpf_link_info *info); - int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, - struct bpf_map *old_map); -+ __poll_t (*poll)(struct file *file, struct poll_table_struct *pts); - }; - - struct bpf_tramp_link { -@@ -1730,9 +1731,9 @@ struct bpf_struct_ops { - int (*init_member)(const struct btf_type *t, - const struct btf_member *member, - void *kdata, const void *udata); -- int (*reg)(void *kdata); -- void (*unreg)(void *kdata); -- int (*update)(void *kdata, void *old_kdata); -+ int (*reg)(void *kdata, struct bpf_link *link); -+ void (*unreg)(void *kdata, struct bpf_link *link); -+ int (*update)(void *kdata, void *old_kdata, struct bpf_link *link); - int (*validate)(void *kdata); - void *cfi_stubs; - struct module *owner; -@@ -2333,6 +2334,7 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); - int bpf_link_settle(struct bpf_link_primer *primer); - void bpf_link_cleanup(struct bpf_link_primer *primer); - void bpf_link_inc(struct bpf_link *link); -+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link); - void bpf_link_put(struct bpf_link *link); - int bpf_link_new_fd(struct bpf_link *link); - struct bpf_link *bpf_link_get_from_fd(u32 ufd); -@@ -2704,6 +2706,11 @@ static inline void bpf_link_inc(struct bpf_link *link) - { - } - -+static inline struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) -+{ -+ return NULL; -+} -+ - static inline void bpf_link_put(struct bpf_link *link) - { - } -diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h -index ea48c861cd36..bfc027311950 100644 ---- a/include/linux/cgroup-defs.h -+++ b/include/linux/cgroup-defs.h -@@ -132,12 +132,18 @@ enum { - CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ - CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ - -+ CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftypes() */ -+ - /* internal flags, do not use outside cgroup core proper */ - __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ - __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ - __CFTYPE_ADDED = (1 << 18), - }; - -+enum cfile_flags { -+ CFILE_HIDDEN = (1 << 0), /* file instance hidden */ -+}; -+ - /* - * cgroup_file is the handle for a file instance created in a cgroup which - * is used, for example, to generate file changed notifications. This can -@@ -145,7 +151,9 @@ enum { - */ - struct cgroup_file { - /* do not access any fields from outside cgroup core */ -+ struct cftype *cft; - struct kernfs_node *kn; -+ unsigned int flags; - unsigned long notified_at; - struct timer_list notify_timer; - }; -diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index 2150ca60394b..06bb4ca93414 100644 ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -29,8 +29,6 @@ - - struct kernel_clone_args; - --#ifdef CONFIG_CGROUPS -- - /* - * All weight knobs on the default hierarchy should use the following min, - * default and max values. 
The default value is the logarithmic center of -@@ -40,6 +38,8 @@ struct kernel_clone_args; - #define CGROUP_WEIGHT_DFL 100 - #define CGROUP_WEIGHT_MAX 10000 - -+#ifdef CONFIG_CGROUPS -+ - enum { - CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ - CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ -@@ -114,6 +114,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); - int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); - int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); - int cgroup_rm_cftypes(struct cftype *cfts); -+void cgroup_show_cftype(struct cftype *cft, bool show); - void cgroup_file_notify(struct cgroup_file *cfile); - void cgroup_file_show(struct cgroup_file *cfile, bool show); - -diff --git a/include/linux/filter.h b/include/linux/filter.h -index 0f12cf01070e..b02aea291b7e 100644 ---- a/include/linux/filter.h -+++ b/include/linux/filter.h -@@ -1406,7 +1406,7 @@ struct bpf_sock_ops_kern { - - struct bpf_sysctl_kern { - struct ctl_table_header *head; -- struct ctl_table *table; -+ const struct ctl_table *table; - void *cur_val; - size_t cur_len; - void *new_val; -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 61591ac6eab6..55912a3830b7 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -80,6 +80,8 @@ struct task_group; - struct task_struct; - struct user_event_mm; - -+#include -+ - /* - * Task state bitmask. NOTE! These bits are also - * encoded in fs/proc/array.c: get_task_state(). -@@ -802,6 +804,9 @@ struct task_struct { - struct sched_rt_entity rt; - struct sched_dl_entity dl; - struct sched_dl_entity *dl_server; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct sched_ext_entity scx; -+#endif - const struct sched_class *sched_class; - - #ifdef CONFIG_SCHED_CORE -diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h -new file mode 100644 -index 000000000000..6e510c0cb10c ---- /dev/null -+++ b/include/linux/sched/ext.h -@@ -0,0 +1,210 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef _LINUX_SCHED_EXT_H -+#define _LINUX_SCHED_EXT_H -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+#include -+#include -+ -+enum scx_public_consts { -+ SCX_OPS_NAME_LEN = 128, -+ -+ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ -+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ -+}; -+ -+/* -+ * DSQ (dispatch queue) IDs are 64bit of the format: -+ * -+ * Bits: [63] [62 .. 0] -+ * [ B] [ ID ] -+ * -+ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs -+ * ID: 63 bit ID -+ * -+ * Built-in IDs: -+ * -+ * Bits: [63] [62] [61..32] [31 .. 0] -+ * [ 1] [ L] [ R ] [ V ] -+ * -+ * 1: 1 for built-in DSQs. -+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others -+ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. -+ */ -+enum scx_dsq_id_flags { -+ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, -+ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, -+ -+ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, -+ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, -+ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, -+ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, -+ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, -+}; -+ -+/* -+ * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered -+ * queue. A built-in DSQ is always a FIFO. 
The built-in local DSQs are used to -+ * buffer between the scheduler core and the BPF scheduler. See the -+ * documentation for more details. -+ */ -+struct scx_dispatch_q { -+ raw_spinlock_t lock; -+ struct list_head list; /* tasks in dispatch order */ -+ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ -+ u32 nr; -+ u64 seq; /* used by BPF iter */ -+ u64 id; -+ struct rhash_head hash_node; -+ struct llist_node free_node; -+ struct rcu_head rcu; -+}; -+ -+/* scx_entity.flags */ -+enum scx_ent_flags { -+ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ -+ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ -+ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ -+ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ -+ -+ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ -+ SCX_TASK_STATE_BITS = 2, -+ SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, -+ -+ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -+}; -+ -+/* scx_entity.flags & SCX_TASK_STATE_MASK */ -+enum scx_task_state { -+ SCX_TASK_NONE, /* ops.init_task() not called yet */ -+ SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ -+ SCX_TASK_READY, /* fully initialized, but not in sched_ext */ -+ SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ -+ -+ SCX_TASK_NR_STATES, -+}; -+ -+/* scx_entity.dsq_flags */ -+enum scx_ent_dsq_flags { -+ SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ -+ -+ SCX_TASK_DSQ_CURSOR = 1 << 31, /* iteration cursor, not a task */ -+}; -+ -+/* -+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from -+ * everywhere and the following bits track which kfunc sets are currently -+ * allowed for %current. This simple per-task tracking works because SCX ops -+ * nest in a limited way. BPF will likely implement a way to allow and disallow -+ * kfuncs depending on the calling context which will replace this manual -+ * mechanism. See scx_kf_allow(). -+ */ -+enum scx_kf_mask { -+ SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ -+ /* all non-sleepables may be nested inside SLEEPABLE */ -+ SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ -+ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ -+ SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ -+ /* ops.dequeue (in REST) may be nested inside DISPATCH */ -+ SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ -+ SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ -+ SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ -+ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ -+ -+ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | -+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -+}; -+ -+struct scx_dsq_node { -+ struct list_head list; /* dispatch order */ -+ struct rb_node priq; /* p->scx.dsq_vtime order */ -+ u32 flags; /* SCX_TASK_DSQ_* flags */ -+}; -+ -+/* -+ * The following is embedded in task_struct and contains all fields necessary -+ * for a task to be scheduled by SCX. 
-+ */ -+struct sched_ext_entity { -+ struct scx_dispatch_q *dsq; -+ struct scx_dsq_node dsq_node; /* protected by dsq lock */ -+ u64 dsq_seq; -+ u32 flags; /* protected by rq lock */ -+ u32 weight; -+ s32 sticky_cpu; -+ s32 holding_cpu; -+ u32 kf_mask; /* see scx_kf_mask above */ -+ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ -+ atomic_long_t ops_state; -+ -+ struct list_head runnable_node; /* rq->scx.runnable_list */ -+ unsigned long runnable_at; -+ -+#ifdef CONFIG_SCHED_CORE -+ u64 core_sched_at; /* see scx_prio_less() */ -+#endif -+ u64 ddsp_dsq_id; -+ u64 ddsp_enq_flags; -+ -+ /* BPF scheduler modifiable fields */ -+ -+ /* -+ * Runtime budget in nsecs. This is usually set through -+ * scx_bpf_dispatch() but can also be modified directly by the BPF -+ * scheduler. Automatically decreased by SCX as the task executes. On -+ * depletion, a scheduling event is triggered. -+ * -+ * This value is cleared to zero if the task is preempted by -+ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the -+ * task ran. Use p->se.sum_exec_runtime instead. -+ */ -+ u64 slice; -+ -+ /* -+ * Used to order tasks when dispatching to the vtime-ordered priority -+ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() -+ * but can also be modified directly by the BPF scheduler. Modifying it -+ * while a task is queued on a dsq may mangle the ordering and is not -+ * recommended. -+ */ -+ u64 dsq_vtime; -+ -+ /* -+ * If set, reject future sched_setscheduler(2) calls updating the policy -+ * to %SCHED_EXT with -%EACCES. -+ * -+ * If set from ops.init_task() and the task's policy is already -+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded -+ * or by inhering the parent's policy during fork, the task's policy is -+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of -+ * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. 
-+ */ -+ bool disallow; /* reject switching into SCX */ -+ -+ /* cold fields */ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ struct cgroup *cgrp_moving_from; -+#endif -+ /* must be the last field, see init_scx_entity() */ -+ struct list_head tasks_node; -+}; -+ -+void sched_ext_free(struct task_struct *p); -+void print_scx_info(const char *log_lvl, struct task_struct *p); -+ -+#else /* !CONFIG_SCHED_CLASS_EXT */ -+ -+static inline void sched_ext_free(struct task_struct *p) {} -+static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+#endif /* _LINUX_SCHED_EXT_H */ -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index d362aacf9f89..4df2f9055587 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); - extern void init_idle(struct task_struct *idle, int cpu); - - extern int sched_fork(unsigned long clone_flags, struct task_struct *p); --extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern void sched_cancel_fork(struct task_struct *p); - extern void sched_post_fork(struct task_struct *p); - extern void sched_dead(struct task_struct *p); - -diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index 1c2902eaebd3..fe7d8dbef77e 100644 ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -706,6 +706,13 @@ typedef unsigned int sk_buff_data_t; - typedef unsigned char *sk_buff_data_t; - #endif - -+enum skb_tstamp_type { -+ SKB_CLOCK_REALTIME, -+ SKB_CLOCK_MONOTONIC, -+ SKB_CLOCK_TAI, -+ __SKB_CLOCK_MAX = SKB_CLOCK_TAI, -+}; -+ - /** - * DOC: Basic sk_buff geometry - * -@@ -823,10 +830,8 @@ typedef unsigned char *sk_buff_data_t; - * @dst_pending_confirm: need to confirm neighbour - * @decrypted: Decrypted SKB - * @slow_gro: state present at GRO time, slower prepare step required -- * @mono_delivery_time: When set, skb->tstamp has the -- * delivery_time in mono clock base (i.e. EDT). Otherwise, the -- * skb->tstamp has the (rcv) timestamp at ingress and -- * delivery_time at egress. -+ * @tstamp_type: When set, skb->tstamp has the -+ * delivery_time clock base of skb->tstamp. - * @napi_id: id of the NAPI struct this skb came from - * @sender_cpu: (aka @napi_id) source CPU in XPS - * @alloc_cpu: CPU which did the skb allocation. -@@ -954,7 +959,7 @@ struct sk_buff { - /* private: */ - __u8 __mono_tc_offset[0]; - /* public: */ -- __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ -+ __u8 tstamp_type:2; /* See skb_tstamp_type */ - #ifdef CONFIG_NET_XGRESS - __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ - __u8 tc_skip_classify:1; -@@ -1084,15 +1089,16 @@ struct sk_buff { - #endif - #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) - --/* if you move tc_at_ingress or mono_delivery_time -+/* if you move tc_at_ingress or tstamp_type - * around, you also must adapt these constants. 
- */ - #ifdef __BIG_ENDIAN_BITFIELD --#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) --#define TC_AT_INGRESS_MASK (1 << 6) -+#define SKB_TSTAMP_TYPE_MASK (3 << 6) -+#define SKB_TSTAMP_TYPE_RSHIFT (6) -+#define TC_AT_INGRESS_MASK (1 << 5) - #else --#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) --#define TC_AT_INGRESS_MASK (1 << 1) -+#define SKB_TSTAMP_TYPE_MASK (3) -+#define TC_AT_INGRESS_MASK (1 << 2) - #endif - #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) - -@@ -4179,7 +4185,7 @@ static inline void skb_get_new_timestampns(const struct sk_buff *skb, - static inline void __net_timestamp(struct sk_buff *skb) - { - skb->tstamp = ktime_get_real(); -- skb->mono_delivery_time = 0; -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - } - - static inline ktime_t net_timedelta(ktime_t t) -@@ -4188,10 +4194,36 @@ static inline ktime_t net_timedelta(ktime_t t) - } - - static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, -- bool mono) -+ u8 tstamp_type) - { - skb->tstamp = kt; -- skb->mono_delivery_time = kt && mono; -+ -+ if (kt) -+ skb->tstamp_type = tstamp_type; -+ else -+ skb->tstamp_type = SKB_CLOCK_REALTIME; -+} -+ -+static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb, -+ ktime_t kt, clockid_t clockid) -+{ -+ u8 tstamp_type = SKB_CLOCK_REALTIME; -+ -+ switch (clockid) { -+ case CLOCK_REALTIME: -+ break; -+ case CLOCK_MONOTONIC: -+ tstamp_type = SKB_CLOCK_MONOTONIC; -+ break; -+ case CLOCK_TAI: -+ tstamp_type = SKB_CLOCK_TAI; -+ break; -+ default: -+ WARN_ON_ONCE(1); -+ kt = 0; -+ } -+ -+ skb_set_delivery_time(skb, kt, tstamp_type); - } - - DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); -@@ -4201,8 +4233,8 @@ DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); - */ - static inline void skb_clear_delivery_time(struct sk_buff *skb) - { -- if (skb->mono_delivery_time) { -- skb->mono_delivery_time = 0; -+ if (skb->tstamp_type) { -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - if (static_branch_unlikely(&netstamp_needed_key)) - skb->tstamp = ktime_get_real(); - else -@@ -4212,7 +4244,7 @@ static inline void skb_clear_delivery_time(struct sk_buff *skb) - - static inline void skb_clear_tstamp(struct sk_buff *skb) - { -- if (skb->mono_delivery_time) -+ if (skb->tstamp_type) - return; - - skb->tstamp = 0; -@@ -4220,7 +4252,7 @@ static inline void skb_clear_tstamp(struct sk_buff *skb) - - static inline ktime_t skb_tstamp(const struct sk_buff *skb) - { -- if (skb->mono_delivery_time) -+ if (skb->tstamp_type) - return 0; - - return skb->tstamp; -@@ -4228,7 +4260,7 @@ static inline ktime_t skb_tstamp(const struct sk_buff *skb) - - static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) - { -- if (!skb->mono_delivery_time && skb->tstamp) -+ if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp) - return skb->tstamp; - - if (static_branch_unlikely(&netstamp_needed_key) || cond) -diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h -index 153960663ce4..5af6eb14c5db 100644 ---- a/include/net/inet_frag.h -+++ b/include/net/inet_frag.h -@@ -76,7 +76,7 @@ struct frag_v6_compare_key { - * @stamp: timestamp of the last received fragment - * @len: total length of the original datagram - * @meat: length of received fragments so far -- * @mono_delivery_time: stamp has a mono delivery time (EDT) -+ * @tstamp_type: stamp has a mono delivery time (EDT) - * @flags: fragment queue flags - * @max_size: maximum received fragment size - * @fqdir: pointer to struct fqdir -@@ -97,7 +97,7 @@ struct inet_frag_queue { - ktime_t stamp; - int len; - int 
meat; -- u8 mono_delivery_time; -+ u8 tstamp_type; - __u8 flags; - u16 max_size; - struct fqdir *fqdir; -diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h -new file mode 100644 -index 000000000000..fe19da7315a9 ---- /dev/null -+++ b/include/trace/events/sched_ext.h -@@ -0,0 +1,32 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM sched_ext -+ -+#if !defined(_TRACE_SCHED_EXT_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_SCHED_EXT_H -+ -+#include -+ -+TRACE_EVENT(sched_ext_dump, -+ -+ TP_PROTO(const char *line), -+ -+ TP_ARGS(line), -+ -+ TP_STRUCT__entry( -+ __string(line, line) -+ ), -+ -+ TP_fast_assign( -+ __assign_str(line); -+ ), -+ -+ TP_printk("%s", -+ __get_str(line) -+ ) -+); -+ -+#endif /* _TRACE_SCHED_EXT_H */ -+ -+/* This part must be outside protection */ -+#include -diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h -index 90706a47f6ff..25ea393cf084 100644 ---- a/include/uapi/linux/bpf.h -+++ b/include/uapi/linux/bpf.h -@@ -6207,12 +6207,17 @@ union { \ - __u64 :64; \ - } __attribute__((aligned(8))) - -+/* The enum used in skb->tstamp_type. It specifies the clock type -+ * of the time stored in the skb->tstamp. -+ */ - enum { -- BPF_SKB_TSTAMP_UNSPEC, -- BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ -- /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, -- * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC -- * and try to deduce it by ingress, egress or skb->sk->sk_clockid. -+ BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */ -+ BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */ -+ BPF_SKB_CLOCK_REALTIME = 0, -+ BPF_SKB_CLOCK_MONOTONIC = 1, -+ BPF_SKB_CLOCK_TAI = 2, -+ /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle, -+ * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid. - */ - }; - -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..359a14cc76a4 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -118,6 +118,7 @@ struct clone_args { - /* SCHED_ISO: reserved but not implemented yet */ - #define SCHED_IDLE 5 - #define SCHED_DEADLINE 6 -+#define SCHED_EXT 7 - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index 44616ffe0af5..c4f6fc369754 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1041,6 +1041,11 @@ config RT_GROUP_SCHED - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.rst for more information. 
- -+config EXT_GROUP_SCHED -+ bool -+ depends on SCHED_CLASS_EXT && CGROUP_SCHED -+ default y -+ - endif #CGROUP_SCHED - - config SCHED_MM_CID -diff --git a/init/init_task.c b/init/init_task.c -index eeb110c65fe2..5726b3a0eea9 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -6,6 +6,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -98,6 +99,17 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { - #endif - #ifdef CONFIG_CGROUP_SCHED - .sched_task_group = &root_task_group, -+#endif -+#ifdef CONFIG_SCHED_CLASS_EXT -+ .scx = { -+ .dsq_node.list = LIST_HEAD_INIT(init_task.scx.dsq_node.list), -+ .sticky_cpu = -1, -+ .holding_cpu = -1, -+ .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), -+ .runnable_at = INITIAL_JIFFIES, -+ .ddsp_dsq_id = SCX_DSQ_INVALID, -+ .slice = SCX_SLICE_DFL, -+ }, - #endif - .ptraced = LIST_HEAD_INIT(init_task.ptraced), - .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a821..bae49b743834 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -133,4 +133,26 @@ config SCHED_CORE - which is the likely usage by Linux distributions, there should - be no measurable impact on performance. - -- -+config SCHED_CLASS_EXT -+ bool "Extensible Scheduling Class" -+ depends on BPF_SYSCALL && BPF_JIT -+ help -+ This option enables a new scheduler class sched_ext (SCX), which -+ allows scheduling policies to be implemented as BPF programs to -+ achieve the following: -+ -+ - Ease of experimentation and exploration: Enabling rapid -+ iteration of new scheduling policies. -+ - Customization: Building application-specific schedulers which -+ implement policies that are not applicable to general-purpose -+ schedulers. -+ - Rapid scheduler deployments: Non-disruptive swap outs of -+ scheduling policies in production environments. -+ -+ sched_ext leverages BPF’s struct_ops feature to define a structure -+ which exports function callbacks and flags to BPF programs that -+ wish to implement scheduling policies. The struct_ops structure -+ exported by sched_ext is struct sched_ext_ops, and is conceptually -+ similar to struct sched_class. -+ -+ See Documentation/scheduler/sched-ext.rst for more details. 
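[Editor's note, not part of the diff above: the SCHED_CLASS_EXT help text describes sched_ext_ops as a struct_ops structure filled in by a BPF program. As a rough illustration of that interface, a minimal global-FIFO scheduler in the style of the scx examples might look like the sketch below. It assumes the <scx/common.bpf.h> helpers shipped under tools/sched_ext/ and the scx_bpf_dispatch()/SCX_DSQ_GLOBAL/SCX_SLICE_DFL names referenced elsewhere in this patch; treat the header path and BPF_STRUCT_OPS macro as assumptions.]

    #include <scx/common.bpf.h>

    char _license[] SEC("license") = "GPL";

    /* Enqueue every runnable task on the shared global DSQ with the default slice. */
    void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
    {
            scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
    }

    SEC(".struct_ops.link")
    struct sched_ext_ops minimal_ops = {
            .enqueue = (void *)minimal_enqueue,
            .name    = "minimal",
    };

[Attaching such a program through a struct_ops link is what exercises the reg()/unreg() link callbacks added to kernel/bpf/bpf_struct_ops.c further down in this patch.]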
-diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c -index 976cb258a0ed..c938dea5ddbf 100644 ---- a/kernel/bpf/bpf_local_storage.c -+++ b/kernel/bpf/bpf_local_storage.c -@@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, - nbuckets = max_t(u32, 2, nbuckets); - smap->bucket_log = ilog2(nbuckets); - -- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), -- nbuckets, GFP_USER | __GFP_NOWARN); -+ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, -+ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); - if (!smap->buckets) { - err = -ENOMEM; - goto free_smap; -diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c -index 86c7884abaf8..a2cf31b14be4 100644 ---- a/kernel/bpf/bpf_struct_ops.c -+++ b/kernel/bpf/bpf_struct_ops.c -@@ -12,6 +12,7 @@ - #include - #include - #include -+#include - - struct bpf_struct_ops_value { - struct bpf_struct_ops_common_value common; -@@ -56,6 +57,7 @@ struct bpf_struct_ops_map { - struct bpf_struct_ops_link { - struct bpf_link link; - struct bpf_map __rcu *map; -+ wait_queue_head_t wait_hup; - }; - - static DEFINE_MUTEX(update_mutex); -@@ -757,7 +759,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, - goto unlock; - } - -- err = st_ops->reg(kdata); -+ err = st_ops->reg(kdata, NULL); - if (likely(!err)) { - /* This refcnt increment on the map here after - * 'st_ops->reg()' is secure since the state of the -@@ -805,7 +807,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) - BPF_STRUCT_OPS_STATE_TOBEFREE); - switch (prev_state) { - case BPF_STRUCT_OPS_STATE_INUSE: -- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); -+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL); - bpf_map_put(map); - return 0; - case BPF_STRUCT_OPS_STATE_TOBEFREE: -@@ -1057,10 +1059,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) - st_map = (struct bpf_struct_ops_map *) - rcu_dereference_protected(st_link->map, true); - if (st_map) { -- /* st_link->map can be NULL if -- * bpf_struct_ops_link_create() fails to register. 
-- */ -- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); -+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); - bpf_map_put(&st_map->map); - } - kfree(st_link); -@@ -1075,7 +1074,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, - st_link = container_of(link, struct bpf_struct_ops_link, link); - rcu_read_lock(); - map = rcu_dereference(st_link->map); -- seq_printf(seq, "map_id:\t%d\n", map->id); -+ if (map) -+ seq_printf(seq, "map_id:\t%d\n", map->id); - rcu_read_unlock(); - } - -@@ -1088,7 +1088,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, - st_link = container_of(link, struct bpf_struct_ops_link, link); - rcu_read_lock(); - map = rcu_dereference(st_link->map); -- info->struct_ops.map_id = map->id; -+ if (map) -+ info->struct_ops.map_id = map->id; - rcu_read_unlock(); - return 0; - } -@@ -1113,6 +1114,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map - mutex_lock(&update_mutex); - - old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); -+ if (!old_map) { -+ err = -ENOLINK; -+ goto err_out; -+ } - if (expected_old_map && old_map != expected_old_map) { - err = -EPERM; - goto err_out; -@@ -1125,7 +1130,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map - goto err_out; - } - -- err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); -+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link); - if (err) - goto err_out; - -@@ -1139,11 +1144,53 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map - return err; - } - -+static int bpf_struct_ops_map_link_detach(struct bpf_link *link) -+{ -+ struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); -+ struct bpf_struct_ops_map *st_map; -+ struct bpf_map *map; -+ -+ mutex_lock(&update_mutex); -+ -+ map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); -+ if (!map) { -+ mutex_unlock(&update_mutex); -+ return 0; -+ } -+ st_map = container_of(map, struct bpf_struct_ops_map, map); -+ -+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); -+ -+ RCU_INIT_POINTER(st_link->map, NULL); -+ /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or -+ * bpf_map_inc() in bpf_struct_ops_map_link_update(). -+ */ -+ bpf_map_put(&st_map->map); -+ -+ mutex_unlock(&update_mutex); -+ -+ wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP); -+ -+ return 0; -+} -+ -+static __poll_t bpf_struct_ops_map_link_poll(struct file *file, -+ struct poll_table_struct *pts) -+{ -+ struct bpf_struct_ops_link *st_link = file->private_data; -+ -+ poll_wait(file, &st_link->wait_hup, pts); -+ -+ return rcu_access_pointer(st_link->map) ? 
0 : EPOLLHUP; -+} -+ - static const struct bpf_link_ops bpf_struct_ops_map_lops = { - .dealloc = bpf_struct_ops_map_link_dealloc, -+ .detach = bpf_struct_ops_map_link_detach, - .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, - .fill_link_info = bpf_struct_ops_map_link_fill_link_info, - .update_map = bpf_struct_ops_map_link_update, -+ .poll = bpf_struct_ops_map_link_poll, - }; - - int bpf_struct_ops_link_create(union bpf_attr *attr) -@@ -1176,13 +1223,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) - if (err) - goto err_out; - -- err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data); -+ init_waitqueue_head(&link->wait_hup); -+ -+ /* Hold the update_mutex such that the subsystem cannot -+ * do link->ops->detach() before the link is fully initialized. -+ */ -+ mutex_lock(&update_mutex); -+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); - if (err) { -+ mutex_unlock(&update_mutex); - bpf_link_cleanup(&link_primer); - link = NULL; - goto err_out; - } - RCU_INIT_POINTER(link->map, map); -+ mutex_unlock(&update_mutex); - - return bpf_link_settle(&link_primer); - -diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c -index 2a69a9a36c0f..6f1abcb4b084 100644 ---- a/kernel/bpf/helpers.c -+++ b/kernel/bpf/helpers.c -@@ -2744,6 +2744,122 @@ __bpf_kfunc void bpf_preempt_enable(void) - preempt_enable(); - } - -+struct bpf_iter_bits { -+ __u64 __opaque[2]; -+} __aligned(8); -+ -+struct bpf_iter_bits_kern { -+ union { -+ unsigned long *bits; -+ unsigned long bits_copy; -+ }; -+ u32 nr_bits; -+ int bit; -+} __aligned(8); -+ -+/** -+ * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area -+ * @it: The new bpf_iter_bits to be created -+ * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over -+ * @nr_words: The size of the specified memory area, measured in 8-byte units. -+ * Due to the limitation of memalloc, it can't be greater than 512. -+ * -+ * This function initializes a new bpf_iter_bits structure for iterating over -+ * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It -+ * copies the data of the memory area to the newly created bpf_iter_bits @it for -+ * subsequent iteration operations. -+ * -+ * On success, 0 is returned. On failure, ERR is returned. 
-+ */ -+__bpf_kfunc int -+bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) -+{ -+ struct bpf_iter_bits_kern *kit = (void *)it; -+ u32 nr_bytes = nr_words * sizeof(u64); -+ u32 nr_bits = BYTES_TO_BITS(nr_bytes); -+ int err; -+ -+ BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits)); -+ BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) != -+ __alignof__(struct bpf_iter_bits)); -+ -+ kit->nr_bits = 0; -+ kit->bits_copy = 0; -+ kit->bit = -1; -+ -+ if (!unsafe_ptr__ign || !nr_words) -+ return -EINVAL; -+ -+ /* Optimization for u64 mask */ -+ if (nr_bits == 64) { -+ err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign); -+ if (err) -+ return -EFAULT; -+ -+ kit->nr_bits = nr_bits; -+ return 0; -+ } -+ -+ /* Fallback to memalloc */ -+ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); -+ if (!kit->bits) -+ return -ENOMEM; -+ -+ err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign); -+ if (err) { -+ bpf_mem_free(&bpf_global_ma, kit->bits); -+ return err; -+ } -+ -+ kit->nr_bits = nr_bits; -+ return 0; -+} -+ -+/** -+ * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits -+ * @it: The bpf_iter_bits to be checked -+ * -+ * This function returns a pointer to a number representing the value of the -+ * next bit in the bits. -+ * -+ * If there are no further bits available, it returns NULL. -+ */ -+__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) -+{ -+ struct bpf_iter_bits_kern *kit = (void *)it; -+ u32 nr_bits = kit->nr_bits; -+ const unsigned long *bits; -+ int bit; -+ -+ if (nr_bits == 0) -+ return NULL; -+ -+ bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; -+ bit = find_next_bit(bits, nr_bits, kit->bit + 1); -+ if (bit >= nr_bits) { -+ kit->nr_bits = 0; -+ return NULL; -+ } -+ -+ kit->bit = bit; -+ return &kit->bit; -+} -+ -+/** -+ * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits -+ * @it: The bpf_iter_bits to be destroyed -+ * -+ * Destroy the resource associated with the bpf_iter_bits. 
-+ */ -+__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) -+{ -+ struct bpf_iter_bits_kern *kit = (void *)it; -+ -+ if (kit->nr_bits <= 64) -+ return; -+ bpf_mem_free(&bpf_global_ma, kit->bits); -+} -+ - __bpf_kfunc_end_defs(); - - BTF_KFUNCS_START(generic_btf_ids) -@@ -2826,6 +2942,9 @@ BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) - BTF_ID_FLAGS(func, bpf_wq_start) - BTF_ID_FLAGS(func, bpf_preempt_disable) - BTF_ID_FLAGS(func, bpf_preempt_enable) -+BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) -+BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) -+BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) - BTF_KFUNCS_END(common_btf_ids) - - static const struct btf_kfunc_id_set common_kfunc_set = { -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index 2222c3ff88e7..5070fa20d05c 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -3150,6 +3150,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) - } - #endif - -+static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) -+{ -+ struct bpf_link *link = file->private_data; -+ -+ return link->ops->poll(file, pts); -+} -+ - static const struct file_operations bpf_link_fops = { - #ifdef CONFIG_PROC_FS - .show_fdinfo = bpf_link_show_fdinfo, -@@ -3159,6 +3166,16 @@ static const struct file_operations bpf_link_fops = { - .write = bpf_dummy_write, - }; - -+static const struct file_operations bpf_link_fops_poll = { -+#ifdef CONFIG_PROC_FS -+ .show_fdinfo = bpf_link_show_fdinfo, -+#endif -+ .release = bpf_link_release, -+ .read = bpf_dummy_read, -+ .write = bpf_dummy_write, -+ .poll = bpf_link_poll, -+}; -+ - static int bpf_link_alloc_id(struct bpf_link *link) - { - int id; -@@ -3201,7 +3218,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) - return id; - } - -- file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); -+ file = anon_inode_getfile("bpf_link", -+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, -+ link, O_CLOEXEC); - if (IS_ERR(file)) { - bpf_link_free_id(id); - put_unused_fd(fd); -@@ -3229,7 +3248,9 @@ int bpf_link_settle(struct bpf_link_primer *primer) - - int bpf_link_new_fd(struct bpf_link *link) - { -- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -+ return anon_inode_getfd("bpf-link", -+ link->ops->poll ? 
&bpf_link_fops_poll : &bpf_link_fops, -+ link, O_CLOEXEC); - } - - struct bpf_link *bpf_link_get_from_fd(u32 ufd) -@@ -3239,7 +3260,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) - - if (!f.file) - return ERR_PTR(-EBADF); -- if (f.file->f_op != &bpf_link_fops) { -+ if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) { - fdput(f); - return ERR_PTR(-EINVAL); - } -@@ -4971,7 +4992,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, - uattr); - else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); -- else if (f.file->f_op == &bpf_link_fops) -+ else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll) - err = bpf_link_get_info_by_fd(f.file, f.file->private_data, - attr, uattr); - else -@@ -5106,7 +5127,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, - if (!file) - return -EBADF; - -- if (file->f_op == &bpf_link_fops) { -+ if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { - struct bpf_link *link = file->private_data; - - if (link->ops == &bpf_raw_tp_link_lops) { -@@ -5416,10 +5437,11 @@ static int link_detach(union bpf_attr *attr) - return ret; - } - --static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) -+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) - { - return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); - } -+EXPORT_SYMBOL(bpf_link_inc_not_zero); - - struct bpf_link *bpf_link_by_id(u32 id) - { -diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c -index e32b6972c478..47dcf14b33c8 100644 ---- a/kernel/cgroup/cgroup.c -+++ b/kernel/cgroup/cgroup.c -@@ -4206,10 +4206,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - if (IS_ERR(kn)) - return PTR_ERR(kn); - -+ kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); -+ - if (cft->file_offset) { - struct cgroup_file *cfile = (void *)css + cft->file_offset; - - timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); -+ cfile->cft = cft; - - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = kn; -@@ -4485,6 +4488,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); - } - -+static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) -+{ -+ struct kernfs_node *kn; -+ -+ spin_lock_irq(&cgroup_file_kn_lock); -+ kn = cfile->kn; -+ kernfs_get(kn); -+ spin_unlock_irq(&cgroup_file_kn_lock); -+ -+ return kn; -+} -+ -+static bool cfile_visible(struct cgroup_file *cfile) -+{ -+ return !(cfile->cft->flags & CFTYPE_HIDDEN) && -+ !(cfile->flags & CFILE_HIDDEN); -+} -+ - /** - * cgroup_file_show - show or hide a hidden cgroup file - * @cfile: target cgroup_file obtained by setting cftype->file_offset -@@ -4494,15 +4515,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) - { - struct kernfs_node *kn; - -- spin_lock_irq(&cgroup_file_kn_lock); -- kn = cfile->kn; -- kernfs_get(kn); -- spin_unlock_irq(&cgroup_file_kn_lock); -+ mutex_lock(&cgroup_mutex); - -- if (kn) -- kernfs_show(kn, show); -+ if (show) -+ cfile->flags &= ~CFILE_HIDDEN; -+ else -+ cfile->flags |= CFILE_HIDDEN; - -- kernfs_put(kn); -+ kn = cfile_kn_get(cfile); -+ if (kn) { -+ kernfs_show(kn, cfile_visible(cfile)); -+ kernfs_put(kn); -+ } -+ -+ mutex_unlock(&cgroup_mutex); - } - - /** -@@ -5527,6 +5553,63 @@ static void offline_css(struct cgroup_subsys_state *css) - wake_up_all(&css->cgroup->offline_waitq); - } - -+/** -+ * cgroup_show_cftype - show or hide a 
cgroup file type -+ * @cft: cftype to show or hide -+ * @show: whether to show or hide -+ * -+ * Sets %CFTYPE_HIDDEN and shows/hides the matching files according to @show. -+ * @cft may or may not be added at the time of this call. After hiding, it's -+ * guaranteed that there are no in-flight operations on the hidden files. -+ */ -+void cgroup_show_cftype(struct cftype *cft, bool show) -+{ -+ struct cgroup_subsys *ss = cft->ss; -+ struct cgroup *root = ss ? &ss->root->cgrp : &cgrp_dfl_root.cgrp; -+ struct cgroup_subsys_state *css; -+ -+ mutex_lock(&cgroup_mutex); -+ -+ if (show) -+ cft->flags &= ~CFTYPE_HIDDEN; -+ else -+ cft->flags |= CFTYPE_HIDDEN; -+ -+ if (!(cft->flags & __CFTYPE_ADDED)) -+ goto out_unlock; -+ -+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { -+ struct cgroup *cgrp = css->cgroup; -+ struct kernfs_node *kn; -+ -+ if (!(css->flags & CSS_VISIBLE)) -+ continue; -+ -+ if (cft->file_offset) { -+ struct cgroup_file *cfile = -+ (void *)css + cft->file_offset; -+ -+ kn = cfile_kn_get(cfile); -+ if (kn) { -+ kernfs_show(kn, cfile_visible(cfile)); -+ kernfs_put(kn); -+ } -+ } else { -+ char buf[CGROUP_FILE_NAME_MAX]; -+ -+ kn = kernfs_find_and_get(cgrp->kn, -+ cgroup_file_name(cgrp, cft, buf)); -+ if (kn) { -+ kernfs_show(kn, show); -+ kernfs_put(kn); -+ } -+ } -+ } -+ -+out_unlock: -+ mutex_unlock(&cgroup_mutex); -+} -+ - /** - * css_create - create a cgroup_subsys_state - * @cgrp: the cgroup new css will be associated with -diff --git a/kernel/fork.c b/kernel/fork.c -index 18750b83c564..d973d23b3768 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -975,6 +976,7 @@ void __put_task_struct(struct task_struct *tsk) - WARN_ON(refcount_read(&tsk->usage)); - WARN_ON(tsk == current); - -+ sched_ext_free(tsk); - io_uring_free(tsk); - cgroup_free(tsk); - task_numa_free(tsk, true); -@@ -2371,7 +2373,7 @@ __latent_entropy struct task_struct *copy_process( - - retval = perf_event_init_task(p, clone_flags); - if (retval) -- goto bad_fork_cleanup_policy; -+ goto bad_fork_sched_cancel_fork; - retval = audit_alloc(p); - if (retval) - goto bad_fork_cleanup_perf; -@@ -2504,7 +2506,9 @@ __latent_entropy struct task_struct *copy_process( - * cgroup specific, it unconditionally needs to place the task on a - * runqueue. - */ -- sched_cgroup_fork(p, args); -+ retval = sched_cgroup_fork(p, args); -+ if (retval) -+ goto bad_fork_cancel_cgroup; - - /* - * From this point on we must avoid any synchronous user-space -@@ -2550,13 +2554,13 @@ __latent_entropy struct task_struct *copy_process( - /* Don't start children in a dying pid namespace */ - if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { - retval = -ENOMEM; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* Let kill terminate clone/fork in the middle */ - if (fatal_signal_pending(current)) { - retval = -EINTR; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* No more failure paths after this point. 
*/ -@@ -2630,10 +2634,11 @@ __latent_entropy struct task_struct *copy_process( - - return p; - --bad_fork_cancel_cgroup: -+bad_fork_core_free: - sched_core_free(p); - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); -+bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, args); - bad_fork_put_pidfd: - if (clone_flags & CLONE_PIDFD) { -@@ -2672,6 +2677,8 @@ __latent_entropy struct task_struct *copy_process( - audit_free(p); - bad_fork_cleanup_perf: - perf_event_free_task(p); -+bad_fork_sched_cancel_fork: -+ sched_cancel_fork(p); - bad_fork_cleanup_policy: - lockdep_free_task(p); - #ifdef CONFIG_NUMA -diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c -index d9dc9ab3773f..0de5477f876e 100644 ---- a/kernel/sched/build_policy.c -+++ b/kernel/sched/build_policy.c -@@ -21,13 +21,19 @@ - - #include - #include -+#include - #include -+#include - #include -+#include -+#include - #include - #include - #include - #include - #include -+#include -+#include - - #include - -@@ -52,3 +58,6 @@ - #include "cputime.c" - #include "deadline.c" - -+#ifdef CONFIG_SCHED_CLASS_EXT -+# include "ext.c" -+#endif -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index bcf2c4cc0522..6161dd1928d4 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p) - if (p->sched_class == &idle_sched_class) - return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - -- return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ -+ if (task_on_scx(p)) -+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ -+ -+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ - } - - /* -@@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a, - if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ - return cfs_prio_less(a, b, in_fi); - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ -+ return scx_prio_less(a, b, in_fi); -+#endif -+ - return false; - } - -@@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq) - return true; - - /* -- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; -- * if there's more than one we need the tick for involuntary -- * preemption. -+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks -+ * left. For CFS, if there's more than one we need the tick for -+ * involuntary preemption. For SCX, ask. - */ -- if (rq->nr_running > 1) -+ if (!scx_switched_all() && rq->nr_running > 1) -+ return false; -+ -+ if (scx_enabled() && !scx_can_stop_tick(rq)) - return false; - - /* -@@ -1342,8 +1353,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) - * SCHED_OTHER tasks have to update their load when changing their - * weight - */ -- if (update_load && p->sched_class == &fair_sched_class) { -- reweight_task(p, prio); -+ if (update_load && p->sched_class->reweight_task) { -+ p->sched_class->reweight_task(task_rq(p), p, prio); - } else { - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; -@@ -2214,6 +2225,17 @@ inline int task_curr(const struct task_struct *p) - return cpu_curr(task_cpu(p)) == p; - } - -+/* -+ * ->switching_to() is called with the pi_lock and rq_lock held and must not -+ * mess with locking. 
-+ */ -+void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class) -+{ -+ if (prev_class != p->sched_class && p->sched_class->switching_to) -+ p->sched_class->switching_to(rq, p); -+} -+ - /* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. -@@ -2221,9 +2243,9 @@ inline int task_curr(const struct task_struct *p) - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ --static inline void check_class_changed(struct rq *rq, struct task_struct *p, -- const struct sched_class *prev_class, -- int oldprio) -+void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio) - { - if (prev_class != p->sched_class) { - if (prev_class->switched_from) -@@ -3986,6 +4008,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) - - static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) - { -+ /* -+ * The BPF scheduler may depend on select_task_rq() being invoked during -+ * wakeups. In addition, @p may end up executing on a different CPU -+ * regardless of what happens in the wakeup path making the ttwu_queue -+ * optimization less meaningful. Skip if on SCX. -+ */ -+ if (task_on_scx(p)) -+ return false; -+ - /* - * Do not complicate things with the async wake_list while the CPU is - * in hotplug state. -@@ -4553,6 +4584,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->rt.on_rq = 0; - p->rt.on_list = 0; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ init_scx_entity(&p->scx); -+#endif -+ - #ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); - #endif -@@ -4755,6 +4790,8 @@ late_initcall(sched_core_sysctl_init); - */ - int sched_fork(unsigned long clone_flags, struct task_struct *p) - { -+ int ret; -+ - __sched_fork(clone_flags, p); - /* - * We mark the process as NEW here. 
This guarantees that -@@ -4791,12 +4828,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - p->sched_reset_on_fork = 0; - } - -- if (dl_prio(p->prio)) -- return -EAGAIN; -- else if (rt_prio(p->prio)) -+ scx_pre_fork(p); -+ -+ if (dl_prio(p->prio)) { -+ ret = -EAGAIN; -+ goto out_cancel; -+ } else if (rt_prio(p->prio)) { - p->sched_class = &rt_sched_class; -- else -+#ifdef CONFIG_SCHED_CLASS_EXT -+ } else if (task_should_scx(p)) { -+ p->sched_class = &ext_sched_class; -+#endif -+ } else { - p->sched_class = &fair_sched_class; -+ } - - init_entity_runnable_average(&p->se); - -@@ -4814,9 +4859,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - RB_CLEAR_NODE(&p->pushable_dl_tasks); - #endif - return 0; -+ -+out_cancel: -+ scx_cancel_fork(p); -+ return ret; - } - --void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) -+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - { - unsigned long flags; - -@@ -4843,11 +4892,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return scx_fork(p); -+} -+ -+void sched_cancel_fork(struct task_struct *p) -+{ -+ scx_cancel_fork(p); - } - - void sched_post_fork(struct task_struct *p) - { - uclamp_post_fork(p); -+ scx_post_fork(p); - } - - unsigned long to_ratio(u64 period, u64 runtime) -@@ -5686,6 +5743,7 @@ void sched_tick(void) - calc_global_load_tick(rq); - sched_core_tick(rq); - task_tick_mm_cid(rq, curr); -+ scx_tick(rq); - - rq_unlock(rq, &rf); - -@@ -5698,8 +5756,10 @@ void sched_tick(void) - wq_worker_tick(curr); - - #ifdef CONFIG_SMP -- rq->idle_balance = idle_cpu(cpu); -- sched_balance_trigger(rq); -+ if (!scx_switched_all()) { -+ rq->idle_balance = idle_cpu(cpu); -+ sched_balance_trigger(rq); -+ } - #endif - } - -@@ -5999,7 +6059,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ -- for_class_range(class, prev->sched_class, &idle_sched_class) { -+ for_balance_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -@@ -6017,6 +6077,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - const struct sched_class *class; - struct task_struct *p; - -+ if (scx_enabled()) -+ goto restart; -+ - /* - * Optimization: we know that if all tasks are in the fair class we can - * call that function directly, but only if the @prev task wasn't of a -@@ -6057,10 +6120,12 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - if (prev->dl_server) - prev->dl_server = NULL; - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_next_task(rq); -- if (p) -+ if (p) { -+ scx_next_task_picked(rq, p, class); - return p; -+ } - } - - BUG(); /* The idle class should always have a runnable task. 
*/ -@@ -6090,7 +6155,7 @@ static inline struct task_struct *pick_task(struct rq *rq) - const struct sched_class *class; - struct task_struct *p; - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_task(rq); - if (p) - return p; -@@ -7080,12 +7145,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag - } - EXPORT_SYMBOL(default_wake_function); - --static void __setscheduler_prio(struct task_struct *p, int prio) -+void __setscheduler_prio(struct task_struct *p, int prio) - { - if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ else if (task_should_scx(p)) -+ p->sched_class = &ext_sched_class; -+#endif - else - p->sched_class = &fair_sched_class; - -@@ -7246,6 +7315,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) - } - - __setscheduler_prio(p, prio); -+ check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); -@@ -7467,6 +7537,25 @@ int sched_core_idle_cpu(int cpu) - #endif - - #ifdef CONFIG_SMP -+/* -+ * Load avg and utiliztion metrics need to be updated periodically and before -+ * consumption. This function updates the metrics for all subsystems except for -+ * the fair class. @rq must be locked and have its clock updated. -+ */ -+bool update_other_load_avgs(struct rq *rq) -+{ -+ u64 now = rq_clock_pelt(rq); -+ const struct sched_class *curr_class = rq->curr->sched_class; -+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); -+ -+ lockdep_assert_rq_held(rq); -+ -+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | -+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | -+ update_hw_load_avg(now, rq, hw_pressure) | -+ update_irq_load_avg(rq, 0); -+} -+ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -7789,6 +7878,10 @@ static int __sched_setscheduler(struct task_struct *p, - goto unlock; - } - -+ retval = scx_check_setscheduler(p, policy); -+ if (retval) -+ goto unlock; -+ - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. -@@ -7891,6 +7984,7 @@ static int __sched_setscheduler(struct task_struct *p, - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); -+ check_class_changing(rq, p, prev_class); - - if (queued) { - /* -@@ -9066,6 +9160,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - break; - } -@@ -9093,6 +9188,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - } - return ret; -@@ -9188,6 +9284,7 @@ void sched_show_task(struct task_struct *p) - - print_worker_info(KERN_INFO, p); - print_stop_info(KERN_INFO, p); -+ print_scx_info(KERN_INFO, p); - show_stack(p, NULL, KERN_INFO); - put_task_stack(p); - } -@@ -9680,6 +9777,8 @@ int sched_cpu_activate(unsigned int cpu) - cpuset_cpu_active(); - } - -+ scx_rq_activate(rq); -+ - /* - * Put the rq online, if not already. This happens: - * -@@ -9740,6 +9839,8 @@ int sched_cpu_deactivate(unsigned int cpu) - } - rq_unlock_irqrestore(rq, &rf); - -+ scx_rq_deactivate(rq); -+ - #ifdef CONFIG_SCHED_SMT - /* - * When going down, decrement the number of cores with SMT present. 
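The hunks above wire the new SCHED_EXT policy into __setscheduler_prio(), __sched_setscheduler() and the sched_get_priority_{max,min}() syscalls. As a rough illustration of what that plumbing enables, the userspace sketch below opts the calling process into the new policy. The SCHED_EXT value of 7 and the behavior that such tasks simply keep running under the fair class until a BPF scheduler is attached are assumptions taken from the wider sched_ext series, not something these hunks alone guarantee.

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef SCHED_EXT
#define SCHED_EXT 7	/* assumed UAPI value; verify against the patched linux/sched.h */
#endif

int main(void)
{
	/* SCHED_EXT carries no static priority, same as SCHED_NORMAL. */
	struct sched_param sp = { .sched_priority = 0 };

	/*
	 * __sched_setscheduler() now consults scx_check_setscheduler() (see
	 * the hunk above), so a loaded BPF scheduler may still reject this.
	 */
	if (sched_setscheduler(0, SCHED_EXT, &sp) == -1) {
		fprintf(stderr, "sched_setscheduler(SCHED_EXT): %s\n", strerror(errno));
		return 1;
	}
	printf("pid %d now has policy SCHED_EXT\n", (int)getpid());
	return 0;
}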
-@@ -9923,11 +10024,15 @@ void __init sched_init(void) - int i; - - /* Make sure the linker didn't screw up */ -- BUG_ON(&idle_sched_class != &fair_sched_class + 1 || -- &fair_sched_class != &rt_sched_class + 1 || -- &rt_sched_class != &dl_sched_class + 1); - #ifdef CONFIG_SMP -- BUG_ON(&dl_sched_class != &stop_sched_class + 1); -+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -+#endif -+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); -+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); -+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); -+#ifdef CONFIG_SCHED_CLASS_EXT -+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); -+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); - #endif - - wait_bit_init(); -@@ -9951,6 +10056,9 @@ void __init sched_init(void) - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); - #endif /* CONFIG_FAIR_GROUP_SCHED */ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ root_task_group.scx_weight = CGROUP_WEIGHT_DFL; -+#endif /* CONFIG_EXT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -@@ -10096,6 +10204,7 @@ void __init sched_init(void) - balance_push_set(smp_processor_id(), false); - #endif - init_sched_fair_class(); -+ init_sched_ext_class(); - - psi_init(); - -@@ -10381,6 +10490,7 @@ struct task_group *sched_create_group(struct task_group *parent) - if (!alloc_rt_sched_group(tg, parent)) - goto err; - -+ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); - alloc_uclamp_sched_group(tg, parent); - - return tg; -@@ -10508,6 +10618,7 @@ void sched_move_task(struct task_struct *tsk) - put_prev_task(rq, tsk); - - sched_change_group(tsk, group); -+ scx_move_task(tsk); - - if (queued) - enqueue_task(rq, tsk, queue_flags); -@@ -10522,11 +10633,6 @@ void sched_move_task(struct task_struct *tsk) - } - } - --static inline struct task_group *css_tg(struct cgroup_subsys_state *css) --{ -- return css ? 
container_of(css, struct task_group, css) : NULL; --} -- - static struct cgroup_subsys_state * - cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) - { -@@ -10550,6 +10656,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); -+ int ret; -+ -+ ret = scx_tg_online(tg); -+ if (ret) -+ return ret; - - if (parent) - sched_online_group(tg, parent); -@@ -10564,6 +10675,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - return 0; - } - -+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ scx_tg_offline(tg); -+} -+ - static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); -@@ -10581,9 +10699,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) - sched_unregister_group(tg); - } - --#ifdef CONFIG_RT_GROUP_SCHED -+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) - static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - { -+#ifdef CONFIG_RT_GROUP_SCHED - struct task_struct *task; - struct cgroup_subsys_state *css; - -@@ -10591,7 +10710,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - if (!sched_rt_can_attach(css_tg(css), task)) - return -EINVAL; - } -- return 0; -+#endif -+ return scx_cgroup_can_attach(tset); - } - #endif - -@@ -10602,8 +10722,17 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) - - cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); -+ -+ scx_cgroup_finish_attach(); - } - -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) -+{ -+ scx_cgroup_cancel_attach(tset); -+} -+#endif -+ - #ifdef CONFIG_UCLAMP_TASK_GROUP - static void cpu_util_update_eff(struct cgroup_subsys_state *css) - { -@@ -10782,9 +10911,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) - static int cpu_shares_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 shareval) - { -+ int ret; -+ - if (shareval > scale_load_down(ULONG_MAX)) - shareval = MAX_SHARES; -- return sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(shareval)); -+ return ret; - } - - static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, -@@ -11181,7 +11316,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, - } - #endif - --static struct cftype cpu_legacy_files[] = { -+static struct cftype cpu_legacy_cftypes[] = { - #ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", -@@ -11292,38 +11427,44 @@ static int cpu_local_stat_show(struct seq_file *sf, - return 0; - } - -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ -+static unsigned long tg_weight(struct task_group *tg) -+{ - #ifdef CONFIG_FAIR_GROUP_SCHED -+ return scale_load_down(tg->shares); -+#else -+ return sched_weight_from_cgroup(tg->scx_weight); -+#endif -+} -+ - static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- struct task_group *tg = css_tg(css); -- u64 weight = scale_load_down(tg->shares); -- -- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); -+ return sched_weight_to_cgroup(tg_weight(css_tg(css))); - } - - static int cpu_weight_write_u64(struct cgroup_subsys_state *css, -- struct cftype *cft, u64 
weight) -+ struct cftype *cft, u64 cgrp_weight) - { -- /* -- * cgroup weight knobs should use the common MIN, DFL and MAX -- * values which are 1, 100 and 10000 respectively. While it loses -- * a bit of range on both ends, it maps pretty well onto the shares -- * value used by scheduler and the round-trip conversions preserve -- * the original value over the entire range. -- */ -- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) -+ unsigned long weight; -+ int ret; -+ -+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) - return -ERANGE; - -- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); -+ weight = sched_weight_from_cgroup(cgrp_weight); - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), cgrp_weight); -+ return ret; - } - - static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- unsigned long weight = scale_load_down(css_tg(css)->shares); -+ unsigned long weight = tg_weight(css_tg(css)); - int last_delta = INT_MAX; - int prio, delta; - -@@ -11342,7 +11483,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - struct cftype *cft, s64 nice) - { - unsigned long weight; -- int idx; -+ int idx, ret; - - if (nice < MIN_NICE || nice > MAX_NICE) - return -ERANGE; -@@ -11351,7 +11492,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - idx = array_index_nospec(idx, 40); - weight = sched_prio_to_weight[idx]; - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(weight)); -+ return ret; - } - #endif - -@@ -11412,21 +11557,23 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, - } - #endif - --static struct cftype cpu_files[] = { --#ifdef CONFIG_FAIR_GROUP_SCHED -- { -+struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ [CPU_CFTYPE_WEIGHT] = { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_weight_read_u64, - .write_u64 = cpu_weight_write_u64, - }, -- { -+ [CPU_CFTYPE_WEIGHT_NICE] = { - .name = "weight.nice", - .flags = CFTYPE_NOT_ON_ROOT, - .read_s64 = cpu_weight_nice_read_s64, - .write_s64 = cpu_weight_nice_write_s64, - }, -- { -+#endif -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ [CPU_CFTYPE_IDLE] = { - .name = "idle", - .flags = CFTYPE_NOT_ON_ROOT, - .read_s64 = cpu_idle_read_s64, -@@ -11434,13 +11581,13 @@ static struct cftype cpu_files[] = { - }, - #endif - #ifdef CONFIG_CFS_BANDWIDTH -- { -+ [CPU_CFTYPE_MAX] = { - .name = "max", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_max_show, - .write = cpu_max_write, - }, -- { -+ [CPU_CFTYPE_MAX_BURST] = { - .name = "max.burst", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, -@@ -11448,13 +11595,13 @@ static struct cftype cpu_files[] = { - }, - #endif - #ifdef CONFIG_UCLAMP_TASK_GROUP -- { -+ [CPU_CFTYPE_UCLAMP_MIN] = { - .name = "uclamp.min", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_uclamp_min_show, - .write = cpu_uclamp_min_write, - }, -- { -+ [CPU_CFTYPE_UCLAMP_MAX] = { - .name = "uclamp.max", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_uclamp_max_show, -@@ -11467,16 +11614,20 @@ static struct cftype cpu_files[] = { - struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = 
cpu_cgroup_css_alloc, - .css_online = cpu_cgroup_css_online, -+ .css_offline = cpu_cgroup_css_offline, - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .css_extra_stat_show = cpu_extra_stat_show, - .css_local_stat_show = cpu_local_stat_show, --#ifdef CONFIG_RT_GROUP_SCHED -+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) - .can_attach = cpu_cgroup_can_attach, - #endif - .attach = cpu_cgroup_attach, -- .legacy_cftypes = cpu_legacy_files, -- .dfl_cftypes = cpu_files, -+#ifdef CONFIG_EXT_GROUP_SCHED -+ .cancel_attach = cpu_cgroup_cancel_attach, -+#endif -+ .legacy_cftypes = cpu_legacy_cftypes, -+ .dfl_cftypes = cpu_cftypes, - .early_init = true, - .threaded = true, - }; -@@ -12064,3 +12215,38 @@ void sched_mm_cid_fork(struct task_struct *t) - t->mm_cid_active = 1; - } - #endif -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ *ctx = (struct sched_enq_and_set_ctx){ -+ .p = p, -+ .queue_flags = queue_flags, -+ .queued = task_on_rq_queued(p), -+ .running = task_current(rq, p), -+ }; -+ -+ update_rq_clock(rq); -+ if (ctx->queued) -+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); -+ if (ctx->running) -+ put_prev_task(rq, p); -+} -+ -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(ctx->p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (ctx->queued) -+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); -+ if (ctx->running) -+ set_next_task(rq, ctx->p); -+} -+#endif /* CONFIG_SCHED_CLASS_EXT */ -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index eece6244f9d2..12174c0137a5 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -197,7 +197,9 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, - - static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) - { -- unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); -+ unsigned long min, max; -+ unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu) + -+ scx_cpuperf_target(sg_cpu->cpu); - - util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); - util = max(util, boost); -@@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - } - - #ifdef CONFIG_NO_HZ_COMMON --static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) -+static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) - { -- unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); -- bool ret = idle_calls == sg_cpu->saved_idle_calls; -+ unsigned long idle_calls; -+ bool ret; -+ -+ /* -+ * The heuristics in this function is for the fair class. For SCX, the -+ * performance target comes directly from the BPF scheduler. Let's just -+ * follow it. -+ */ -+ if (scx_switched_all()) -+ return false; -+ -+ /* if capped by uclamp_max, always update to be in compliance */ -+ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) -+ return false; -+ -+ /* -+ * Maintain the frequency if the CPU has not been idle recently, as -+ * reduction is likely to be premature. 
-+ */ -+ idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); -+ ret = idle_calls == sg_cpu->saved_idle_calls; - - sg_cpu->saved_idle_calls = idle_calls; - return ret; - } - #else --static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } -+static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } - #endif /* CONFIG_NO_HZ_COMMON */ - - /* -@@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, - return; - - next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); -- /* -- * Do not reduce the frequency if the CPU has not been idle -- * recently, as the reduction is likely to be premature then. -- * -- * Except when the rq is capped by uclamp_max. -- */ -- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && -- sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && -+ -+ if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && - !sg_policy->need_freq_update) { - next_f = sg_policy->next_freq; - -@@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) - return; - -- /* -- * Do not reduce the target performance level if the CPU has not been -- * idle recently, as the reduction is likely to be premature then. -- * -- * Except when the rq is capped by uclamp_max. -- */ -- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && -- sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) -+ if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) - sg_cpu->util = prev_util; - - cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index c1eb9a1afd13..c057ef46c5f8 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -1090,6 +1090,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - P(dl.runtime); - P(dl.deadline); - } -+#ifdef CONFIG_SCHED_CLASS_EXT -+ __PS("ext.enabled", task_on_scx(p)); -+#endif - #undef PN_SCHEDSTAT - #undef P_SCHEDSTAT - -diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c -new file mode 100644 -index 000000000000..93e041e2f8d7 ---- /dev/null -+++ b/kernel/sched/ext.c -@@ -0,0 +1,6973 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) -+ -+enum scx_consts { -+ SCX_DSP_DFL_MAX_BATCH = 32, -+ SCX_DSP_MAX_LOOPS = 32, -+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, -+ -+ SCX_EXIT_BT_LEN = 64, -+ SCX_EXIT_MSG_LEN = 1024, -+ SCX_EXIT_DUMP_DFL_LEN = 32768, -+ -+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, -+}; -+ -+enum scx_exit_kind { -+ SCX_EXIT_NONE, -+ SCX_EXIT_DONE, -+ -+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ -+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ -+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ -+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ -+ -+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ -+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ -+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -+}; -+ -+/* -+ * An exit code can be specified when exiting with scx_bpf_exit() or -+ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN -+ * respectively. 
The codes are 64bit of the format: -+ * -+ * Bits: [63 .. 48 47 .. 32 31 .. 0] -+ * [ SYS ACT ] [ SYS RSN ] [ USR ] -+ * -+ * SYS ACT: System-defined exit actions -+ * SYS RSN: System-defined exit reasons -+ * USR : User-defined exit codes and reasons -+ * -+ * Using the above, users may communicate intention and context by ORing system -+ * actions and/or system reasons with a user-defined exit code. -+ */ -+enum scx_exit_code { -+ /* Reasons */ -+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, -+ -+ /* Actions */ -+ SCX_ECODE_ACT_RESTART = 1LLU << 48, -+}; -+ -+/* -+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is -+ * being disabled. -+ */ -+struct scx_exit_info { -+ /* %SCX_EXIT_* - broad category of the exit reason */ -+ enum scx_exit_kind kind; -+ -+ /* exit code if gracefully exiting */ -+ s64 exit_code; -+ -+ /* textual representation of the above */ -+ const char *reason; -+ -+ /* backtrace if exiting due to an error */ -+ unsigned long *bt; -+ u32 bt_len; -+ -+ /* informational message */ -+ char *msg; -+ -+ /* debug dump */ -+ char *dump; -+}; -+ -+/* sched_ext_ops.flags */ -+enum scx_ops_flags { -+ /* -+ * Keep built-in idle tracking even if ops.update_idle() is implemented. -+ */ -+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, -+ -+ /* -+ * By default, if there are no other task to run on the CPU, ext core -+ * keeps running the current task even after its slice expires. If this -+ * flag is specified, such tasks are passed to ops.enqueue() with -+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. -+ */ -+ SCX_OPS_ENQ_LAST = 1LLU << 1, -+ -+ /* -+ * An exiting task may schedule after PF_EXITING is set. In such cases, -+ * bpf_task_from_pid() may not be able to find the task and if the BPF -+ * scheduler depends on pid lookup for dispatching, the task will be -+ * lost leading to various issues including RCU grace period stalls. -+ * -+ * To mask this problem, by default, unhashed tasks are automatically -+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't -+ * depend on pid lookups and wants to handle these tasks directly, the -+ * following flag can be used. -+ */ -+ SCX_OPS_ENQ_EXITING = 1LLU << 2, -+ -+ /* -+ * If set, only tasks with policy set to SCHED_EXT are attached to -+ * sched_ext. If clear, SCHED_NORMAL tasks are also included. -+ */ -+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, -+ -+ /* -+ * CPU cgroup knob enable flags -+ */ -+ SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */ -+ -+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | -+ SCX_OPS_ENQ_LAST | -+ SCX_OPS_ENQ_EXITING | -+ SCX_OPS_SWITCH_PARTIAL | -+ SCX_OPS_CGROUP_KNOB_WEIGHT, -+}; -+ -+/* argument container for ops.init_task() */ -+struct scx_init_task_args { -+ /* -+ * Set if ops.init_task() is being invoked on the fork path, as opposed -+ * to the scheduler transition path. -+ */ -+ bool fork; -+#ifdef CONFIG_EXT_GROUP_SCHED -+ /* the cgroup the task is joining */ -+ struct cgroup *cgroup; -+#endif -+}; -+ -+/* argument container for ops.exit_task() */ -+struct scx_exit_task_args { -+ /* Whether the task exited before running on sched_ext. 
*/ -+ bool cancelled; -+}; -+ -+/* argument container for ops->cgroup_init() */ -+struct scx_cgroup_init_args { -+ /* the weight of the cgroup [1..10000] */ -+ u32 weight; -+}; -+ -+enum scx_cpu_preempt_reason { -+ /* next task is being scheduled by &sched_class_rt */ -+ SCX_CPU_PREEMPT_RT, -+ /* next task is being scheduled by &sched_class_dl */ -+ SCX_CPU_PREEMPT_DL, -+ /* next task is being scheduled by &sched_class_stop */ -+ SCX_CPU_PREEMPT_STOP, -+ /* unknown reason for SCX being preempted */ -+ SCX_CPU_PREEMPT_UNKNOWN, -+}; -+ -+/* -+ * Argument container for ops->cpu_acquire(). Currently empty, but may be -+ * expanded in the future. -+ */ -+struct scx_cpu_acquire_args {}; -+ -+/* argument container for ops->cpu_release() */ -+struct scx_cpu_release_args { -+ /* the reason the CPU was preempted */ -+ enum scx_cpu_preempt_reason reason; -+ -+ /* the task that's going to be scheduled on the CPU */ -+ struct task_struct *task; -+}; -+ -+/* -+ * Informational context provided to dump operations. -+ */ -+struct scx_dump_ctx { -+ enum scx_exit_kind kind; -+ s64 exit_code; -+ const char *reason; -+ u64 at_ns; -+ u64 at_jiffies; -+}; -+ -+/** -+ * struct sched_ext_ops - Operation table for BPF scheduler implementation -+ * -+ * Userland can implement an arbitrary scheduling policy by implementing and -+ * loading operations in this table. -+ */ -+struct sched_ext_ops { -+ /** -+ * select_cpu - Pick the target CPU for a task which is being woken up -+ * @p: task being woken up -+ * @prev_cpu: the cpu @p was on before sleeping -+ * @wake_flags: SCX_WAKE_* -+ * -+ * Decision made here isn't final. @p may be moved to any CPU while it -+ * is getting dispatched for execution later. However, as @p is not on -+ * the rq at this point, getting the eventual execution CPU right here -+ * saves a small bit of overhead down the line. -+ * -+ * If an idle CPU is returned, the CPU is kicked and will try to -+ * dispatch. While an explicit custom mechanism can be added, -+ * select_cpu() serves as the default way to wake up idle CPUs. -+ * -+ * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p -+ * is dispatched, the ops.enqueue() callback will be skipped. Finally, -+ * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the -+ * local DSQ of whatever CPU is returned by this callback. -+ */ -+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); -+ -+ /** -+ * enqueue - Enqueue a task on the BPF scheduler -+ * @p: task being enqueued -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() -+ * or enqueue on the BPF scheduler. If not directly dispatched, the bpf -+ * scheduler owns @p and if it fails to dispatch @p, the task will -+ * stall. -+ * -+ * If @p was dispatched from ops.select_cpu(), this callback is -+ * skipped. -+ */ -+ void (*enqueue)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * dequeue - Remove a task from the BPF scheduler -+ * @p: task being dequeued -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * Remove @p from the BPF scheduler. This is usually called to isolate -+ * the task while updating its scheduling properties (e.g. priority). -+ * -+ * The ext core keeps track of whether the BPF side owns a given task or -+ * not and can gracefully ignore spurious dispatches from BPF side, -+ * which makes it safe to not implement this method. However, depending -+ * on the scheduling logic, this can lead to confusing behaviors - e.g. 
-+ * scheduling position not being updated across a priority change. -+ */ -+ void (*dequeue)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs -+ * @cpu: CPU to dispatch tasks for -+ * @prev: previous task being switched out -+ * -+ * Called when a CPU's local dsq is empty. The operation should dispatch -+ * one or more tasks from the BPF scheduler into the DSQs using -+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using -+ * scx_bpf_consume(). -+ * -+ * The maximum number of times scx_bpf_dispatch() can be called without -+ * an intervening scx_bpf_consume() is specified by -+ * ops.dispatch_max_batch. See the comments on top of the two functions -+ * for more details. -+ * -+ * When not %NULL, @prev is an SCX task with its slice depleted. If -+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in -+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after -+ * ops.dispatch() returns. To keep executing @prev, return without -+ * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. -+ */ -+ void (*dispatch)(s32 cpu, struct task_struct *prev); -+ -+ /** -+ * tick - Periodic tick -+ * @p: task running currently -+ * -+ * This operation is called every 1/HZ seconds on CPUs which are -+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an -+ * immediate dispatch cycle on the CPU. -+ */ -+ void (*tick)(struct task_struct *p); -+ -+ /** -+ * runnable - A task is becoming runnable on its associated CPU -+ * @p: task becoming runnable -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * This and the following three functions can be used to track a task's -+ * execution state transitions. A task becomes ->runnable() on a CPU, -+ * and then goes through one or more ->running() and ->stopping() pairs -+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's -+ * done running on the CPU. -+ * -+ * @p is becoming runnable on the CPU because it's -+ * -+ * - waking up (%SCX_ENQ_WAKEUP) -+ * - being moved from another CPU -+ * - being restored after temporarily taken off the queue for an -+ * attribute change. -+ * -+ * This and ->enqueue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be followed by ->enqueue() -+ * e.g. when @p is being dispatched to a remote CPU, or when @p is -+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a -+ * task may be ->enqueue()'d without being preceded by this operation -+ * e.g. after exhausting its slice. -+ */ -+ void (*runnable)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * running - A task is starting to run on its associated CPU -+ * @p: task starting to run -+ * -+ * See ->runnable() for explanation on the task state notifiers. -+ */ -+ void (*running)(struct task_struct *p); -+ -+ /** -+ * stopping - A task is stopping execution -+ * @p: task stopping to run -+ * @runnable: is task @p still runnable? -+ * -+ * See ->runnable() for explanation on the task state notifiers. If -+ * !@runnable, ->quiescent() will be invoked after this operation -+ * returns. -+ */ -+ void (*stopping)(struct task_struct *p, bool runnable); -+ -+ /** -+ * quiescent - A task is becoming not runnable on its associated CPU -+ * @p: task becoming not runnable -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * See ->runnable() for explanation on the task state notifiers. 
-+ * -+ * @p is becoming quiescent on the CPU because it's -+ * -+ * - sleeping (%SCX_DEQ_SLEEP) -+ * - being moved to another CPU -+ * - being temporarily taken off the queue for an attribute change -+ * (%SCX_DEQ_SAVE) -+ * -+ * This and ->dequeue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be preceded by ->dequeue() -+ * e.g. when @p is being dispatched to a remote CPU. -+ */ -+ void (*quiescent)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * yield - Yield CPU -+ * @from: yielding task -+ * @to: optional yield target task -+ * -+ * If @to is NULL, @from is yielding the CPU to other runnable tasks. -+ * The BPF scheduler should ensure that other available tasks are -+ * dispatched before the yielding task. Return value is ignored in this -+ * case. -+ * -+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf -+ * scheduler can implement the request, return %true; otherwise, %false. -+ */ -+ bool (*yield)(struct task_struct *from, struct task_struct *to); -+ -+ /** -+ * core_sched_before - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Used by core-sched to determine the ordering between two tasks. See -+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on -+ * core-sched. -+ * -+ * Both @a and @b are runnable and may or may not currently be queued on -+ * the BPF scheduler. Should return %true if @a should run before @b. -+ * %false if there's no required ordering or @b should run before @a. -+ * -+ * If not specified, the default is ordering them according to when they -+ * became runnable. -+ */ -+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); -+ -+ /** -+ * set_weight - Set task weight -+ * @p: task to set weight for -+ * @weight: new weight [1..10000] -+ * -+ * Update @p's weight to @weight. -+ */ -+ void (*set_weight)(struct task_struct *p, u32 weight); -+ -+ /** -+ * set_cpumask - Set CPU affinity -+ * @p: task to set CPU affinity for -+ * @cpumask: cpumask of cpus that @p can run on -+ * -+ * Update @p's CPU affinity to @cpumask. -+ */ -+ void (*set_cpumask)(struct task_struct *p, -+ const struct cpumask *cpumask); -+ -+ /** -+ * update_idle - Update the idle state of a CPU -+ * @cpu: CPU to update the idle state for -+ * @idle: whether entering or exiting the idle state -+ * -+ * This operation is called when @rq's CPU goes or leaves the idle -+ * state. By default, implementing this operation disables the built-in -+ * idle CPU tracking and the following helpers become unavailable: -+ * -+ * - scx_bpf_select_cpu_dfl() -+ * - scx_bpf_test_and_clear_cpu_idle() -+ * - scx_bpf_pick_idle_cpu() -+ * -+ * The user also must implement ops.select_cpu() as the default -+ * implementation relies on scx_bpf_select_cpu_dfl(). -+ * -+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle -+ * tracking. -+ */ -+ void (*update_idle)(s32 cpu, bool idle); -+ -+ /** -+ * cpu_acquire - A CPU is becoming available to the BPF scheduler -+ * @cpu: The CPU being acquired by the BPF scheduler. -+ * @args: Acquire arguments, see the struct definition. -+ * -+ * A CPU that was previously released from the BPF scheduler is now once -+ * again under its control. -+ */ -+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); -+ -+ /** -+ * cpu_release - A CPU is taken away from the BPF scheduler -+ * @cpu: The CPU being released by the BPF scheduler. -+ * @args: Release arguments, see the struct definition.
-+ * -+ * The specified CPU is no longer under the control of the BPF -+ * scheduler. This could be because it was preempted by a higher -+ * priority sched_class, though there may be other reasons as well. The -+ * caller should consult @args->reason to determine the cause. -+ */ -+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); -+ -+ /** -+ * init_task - Initialize a task to run in a BPF scheduler -+ * @p: task to initialize for BPF scheduling -+ * @args: init arguments, see the struct definition -+ * -+ * Either we're loading a BPF scheduler or a new task is being forked. -+ * Initialize @p for BPF scheduling. This operation may block and can -+ * be used for allocations, and is called exactly once for a task. -+ * -+ * Return 0 for success, -errno for failure. An error return while -+ * loading will abort loading of the BPF scheduler. During a fork, it -+ * will abort that specific fork. -+ */ -+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); -+ -+ /** -+ * exit_task - Exit a previously-running task from the system -+ * @p: task to exit -+ * -+ * @p is exiting or the BPF scheduler is being unloaded. Perform any -+ * necessary cleanup for @p. -+ */ -+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); -+ -+ /** -+ * enable - Enable BPF scheduling for a task -+ * @p: task to enable BPF scheduling for -+ * -+ * Enable @p for BPF scheduling. enable() is called on @p any time it -+ * enters SCX, and is always paired with a matching disable(). -+ */ -+ void (*enable)(struct task_struct *p); -+ -+ /** -+ * disable - Disable BPF scheduling for a task -+ * @p: task to disable BPF scheduling for -+ * -+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. -+ * Disable BPF scheduling for @p. A disable() call is always matched -+ * with a prior enable() call. -+ */ -+ void (*disable)(struct task_struct *p); -+ -+ /** -+ * dump - Dump BPF scheduler state on error -+ * @ctx: debug dump context -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. -+ */ -+ void (*dump)(struct scx_dump_ctx *ctx); -+ -+ /** -+ * dump_cpu - Dump BPF scheduler state for a CPU on error -+ * @ctx: debug dump context -+ * @cpu: CPU to generate debug dump for -+ * @idle: @cpu is currently idle without any runnable tasks -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for -+ * @cpu. If @idle is %true and this operation doesn't produce any -+ * output, @cpu is skipped for dump. -+ */ -+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); -+ -+ /** -+ * dump_task - Dump BPF scheduler state for a runnable task on error -+ * @ctx: debug dump context -+ * @p: runnable task to generate debug dump for -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for -+ * @p. -+ */ -+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ /** -+ * cgroup_init - Initialize a cgroup -+ * @cgrp: cgroup being initialized -+ * @args: init arguments, see the struct definition -+ * -+ * Either the BPF scheduler is being loaded or @cgrp created, initialize -+ * @cgrp for sched_ext. This operation may block. -+ * -+ * Return 0 for success, -errno for failure. An error return while -+ * loading will abort loading of the BPF scheduler. During cgroup -+ * creation, it will abort the specific cgroup creation. 
-+ */ -+ s32 (*cgroup_init)(struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args); -+ -+ /** -+ * cgroup_exit - Exit a cgroup -+ * @cgrp: cgroup being exited -+ * -+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit -+ * @cgrp for sched_ext. This operation may block. -+ */ -+ void (*cgroup_exit)(struct cgroup *cgrp); -+ -+ /** -+ * cgroup_prep_move - Prepare a task to be moved to a different cgroup -+ * @p: task being moved -+ * @from: cgroup @p is being moved from -+ * @to: cgroup @p is being moved to -+ * -+ * Prepare @p for move from cgroup @from to @to. This operation may -+ * block and can be used for allocations. -+ * -+ * Return 0 for success, -errno for failure. An error return aborts the -+ * migration. -+ */ -+ s32 (*cgroup_prep_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_move - Commit cgroup move -+ * @p: task being moved -+ * @from: cgroup @p is being moved from -+ * @to: cgroup @p is being moved to -+ * -+ * Commit the move. @p is dequeued during this operation. -+ */ -+ void (*cgroup_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_cancel_move - Cancel cgroup move -+ * @p: task whose cgroup move is being canceled -+ * @from: cgroup @p was being moved from -+ * @to: cgroup @p was being moved to -+ * -+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). -+ * Undo the preparation. -+ */ -+ void (*cgroup_cancel_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_set_weight - A cgroup's weight is being changed -+ * @cgrp: cgroup whose weight is being updated -+ * @weight: new weight [1..10000] -+ * -+ * Update @cgrp's weight to @weight. -+ */ -+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+ /* -+ * All online ops must come before ops.cpu_online(). -+ */ -+ -+ /** -+ * cpu_online - A CPU became online -+ * @cpu: CPU which just came up -+ * -+ * @cpu just came online. @cpu will not call ops.enqueue() or -+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand. -+ */ -+ void (*cpu_online)(s32 cpu); -+ -+ /** -+ * cpu_offline - A CPU is going offline -+ * @cpu: CPU which is going offline -+ * -+ * @cpu is going offline. @cpu will not call ops.enqueue() or -+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards. -+ */ -+ void (*cpu_offline)(s32 cpu); -+ -+ /* -+ * All CPU hotplug ops must come before ops.init(). -+ */ -+ -+ /** -+ * init - Initialize the BPF scheduler -+ */ -+ s32 (*init)(void); -+ -+ /** -+ * exit - Clean up after the BPF scheduler -+ * @info: Exit info -+ */ -+ void (*exit)(struct scx_exit_info *info); -+ -+ /** -+ * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch -+ */ -+ u32 dispatch_max_batch; -+ -+ /** -+ * flags - %SCX_OPS_* flags -+ */ -+ u64 flags; -+ -+ /** -+ * timeout_ms - The maximum amount of time, in milliseconds, that a -+ * runnable task should be able to wait before being scheduled. The -+ * maximum timeout may not exceed the default timeout of 30 seconds. -+ * -+ * Defaults to the maximum allowed timeout value of 30 seconds. -+ */ -+ u32 timeout_ms; -+ -+ /** -+ * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default -+ * value of 32768 is used. -+ */ -+ u32 exit_dump_len; -+ -+ /** -+ * hotplug_seq - A sequence number that may be set by the scheduler to -+ * detect when a hotplug event has occurred during the loading process. -+ * If 0, no detection occurs.
Otherwise, the scheduler will fail to -+ * load if the sequence number does not match @scx_hotplug_seq on the -+ * enable path. -+ */ -+ u64 hotplug_seq; -+ -+ /** -+ * name - BPF scheduler's name -+ * -+ * Must be a non-zero valid BPF object name including only isalnum(), -+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the -+ * BPF scheduler is enabled. -+ */ -+ char name[SCX_OPS_NAME_LEN]; -+}; -+ -+enum scx_opi { -+ SCX_OPI_BEGIN = 0, -+ SCX_OPI_NORMAL_BEGIN = 0, -+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), -+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), -+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), -+ SCX_OPI_END = SCX_OP_IDX(init), -+}; -+ -+enum scx_wake_flags { -+ /* expose select WF_* flags as enums */ -+ SCX_WAKE_FORK = WF_FORK, -+ SCX_WAKE_TTWU = WF_TTWU, -+ SCX_WAKE_SYNC = WF_SYNC, -+}; -+ -+enum scx_enq_flags { -+ /* expose select ENQUEUE_* flags as enums */ -+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, -+ SCX_ENQ_HEAD = ENQUEUE_HEAD, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * Set the following to trigger preemption when calling -+ * scx_bpf_dispatch() with a local dsq as the target. The slice of the -+ * current task is cleared to zero and the CPU is kicked into the -+ * scheduling path. Implies %SCX_ENQ_HEAD. -+ */ -+ SCX_ENQ_PREEMPT = 1LLU << 32, -+ -+ /* -+ * The task being enqueued was previously enqueued on the current CPU's -+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the -+ * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was -+ * invoked in a ->cpu_release() callback, and the task is again -+ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the -+ * task will not be scheduled on the CPU until at least the next invocation -+ * of the ->cpu_acquire() callback. -+ */ -+ SCX_ENQ_REENQ = 1LLU << 40, -+ -+ /* -+ * The task being enqueued is the only task available for the cpu. By -+ * default, ext core keeps executing such tasks but when -+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the -+ * %SCX_ENQ_LAST flag set. -+ * -+ * If the BPF scheduler wants to continue executing the task, -+ * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. -+ * If the task gets queued on a different dsq or the BPF side, the BPF -+ * scheduler is responsible for triggering a follow-up scheduling event. -+ * Otherwise, Execution may stall. -+ */ -+ SCX_ENQ_LAST = 1LLU << 41, -+ -+ /* high 8 bits are internal */ -+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, -+ -+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56, -+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -+}; -+ -+enum scx_deq_flags { -+ /* expose select DEQUEUE_* flags as enums */ -+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * The generic core-sched layer decided to execute the task even though -+ * it hasn't been dispatched yet. Dequeue from the BPF side. -+ */ -+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -+}; -+ -+enum scx_pick_idle_cpu_flags { -+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ -+}; -+ -+enum scx_kick_flags { -+ /* -+ * Kick the target CPU if idle. Guarantees that the target CPU goes -+ * through at least one full scheduling cycle before going idle. If the -+ * target CPU can be determined to be currently not idle and going to go -+ * through a scheduling cycle before going idle, noop. -+ */ -+ SCX_KICK_IDLE = 1LLU << 0, -+ -+ /* -+ * Preempt the current task and execute the dispatch path. 
If the -+ * current task of the target CPU is an SCX task, its ->scx.slice is -+ * cleared to zero before the scheduling path is invoked so that the -+ * task expires and the dispatch path is invoked. -+ */ -+ SCX_KICK_PREEMPT = 1LLU << 1, -+ -+ /* -+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will -+ * return after the target CPU finishes picking the next task. -+ */ -+ SCX_KICK_WAIT = 1LLU << 2, -+}; -+ -+enum scx_tg_flags { -+ SCX_TG_ONLINE = 1U << 0, -+ SCX_TG_INITED = 1U << 1, -+}; -+ -+enum scx_ops_enable_state { -+ SCX_OPS_PREPPING, -+ SCX_OPS_ENABLING, -+ SCX_OPS_ENABLED, -+ SCX_OPS_DISABLING, -+ SCX_OPS_DISABLED, -+}; -+ -+static const char *scx_ops_enable_state_str[] = { -+ [SCX_OPS_PREPPING] = "prepping", -+ [SCX_OPS_ENABLING] = "enabling", -+ [SCX_OPS_ENABLED] = "enabled", -+ [SCX_OPS_DISABLING] = "disabling", -+ [SCX_OPS_DISABLED] = "disabled", -+}; -+ -+/* -+ * sched_ext_entity->ops_state -+ * -+ * Used to track the task ownership between the SCX core and the BPF scheduler. -+ * State transitions look as follows: -+ * -+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING -+ * ^ | | -+ * | v v -+ * \-------------------------------/ -+ * -+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call -+ * sites for explanations on the conditions being waited upon and why they are -+ * safe. Transitions out of them into NONE or QUEUED must store_release and the -+ * waiters should load_acquire. -+ * -+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether -+ * any given task can be dispatched by the BPF scheduler at all times and thus -+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler -+ * to try to dispatch any task anytime regardless of its state as the SCX core -+ * can safely reject invalid dispatches. -+ */ -+enum scx_ops_state { -+ SCX_OPSS_NONE, /* owned by the SCX core */ -+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ -+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ -+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ -+ -+ /* -+ * QSEQ brands each QUEUED instance so that, when dispatch races -+ * dequeue/requeue, the dispatcher can tell whether it still has a claim -+ * on the task being dispatched. -+ * -+ * As some 32bit archs can't do 64bit store_release/load_acquire, -+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on -+ * 32bit machines. The dispatch race window QSEQ protects is very narrow -+ * and runs with IRQ disabled. 30 bits should be sufficient. -+ */ -+ SCX_OPSS_QSEQ_SHIFT = 2, -+}; -+ -+/* Use macros to ensure that the type is unsigned long for the masks */ -+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) -+ -+/* -+ * During exit, a task may schedule after losing its PIDs. When disabling the -+ * BPF scheduler, we need to be able to iterate tasks in every state to -+ * guarantee system safety. Maintain a dedicated task list which contains every -+ * task between its fork and eventual free. 
-+ */ -+static DEFINE_SPINLOCK(scx_tasks_lock); -+static LIST_HEAD(scx_tasks); -+ -+/* ops enable/disable */ -+static struct kthread_worker *scx_ops_helper; -+static DEFINE_MUTEX(scx_ops_enable_mutex); -+DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); -+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -+static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); -+static bool scx_switching_all; -+DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -+ -+static struct sched_ext_ops scx_ops; -+static bool scx_warned_zero_slice; -+ -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); -+DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); -+ -+struct static_key_false scx_has_op[SCX_OPI_END] = -+ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; -+ -+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -+static struct scx_exit_info *scx_exit_info; -+ -+static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); -+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); -+ -+/* -+ * The maximum amount of time in jiffies that a task may be runnable without -+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger -+ * scx_ops_error(). -+ */ -+static unsigned long scx_watchdog_timeout; -+ -+/* -+ * The last time the delayed work was run. This delayed work relies on -+ * ksoftirqd being able to run to service timer interrupts, so it's possible -+ * that this work itself could get wedged. To account for this, we check that -+ * it's not stalled in the timer tick, and trigger an error if it is. -+ */ -+static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; -+ -+static struct delayed_work scx_watchdog_work; -+ -+/* idle tracking */ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_CPUMASK_OFFSTACK -+#define CL_ALIGNED_IF_ONSTACK -+#else -+#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -+#endif -+ -+static struct { -+ cpumask_var_t cpu; -+ cpumask_var_t smt; -+} idle_masks CL_ALIGNED_IF_ONSTACK; -+ -+#endif /* CONFIG_SMP */ -+ -+/* for %SCX_KICK_WAIT */ -+static unsigned long __percpu *scx_kick_cpus_pnt_seqs; -+ -+/* -+ * Direct dispatch marker. -+ * -+ * Non-NULL values are used for direct dispatch from enqueue path. A valid -+ * pointer points to the task currently being enqueued. An ERR_PTR value is used -+ * to indicate that direct dispatch has already happened. 
-+ */ -+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -+ -+/* dispatch queues */ -+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; -+ -+static const struct rhashtable_params dsq_hash_params = { -+ .key_len = 8, -+ .key_offset = offsetof(struct scx_dispatch_q, id), -+ .head_offset = offsetof(struct scx_dispatch_q, hash_node), -+}; -+ -+static struct rhashtable dsq_hash; -+static LLIST_HEAD(dsqs_to_free); -+ -+/* dispatch buf */ -+struct scx_dsp_buf_ent { -+ struct task_struct *task; -+ unsigned long qseq; -+ u64 dsq_id; -+ u64 enq_flags; -+}; -+ -+static u32 scx_dsp_max_batch; -+ -+struct scx_dsp_ctx { -+ struct rq *rq; -+ struct rq_flags *rf; -+ u32 cursor; -+ u32 nr_tasks; -+ struct scx_dsp_buf_ent buf[]; -+}; -+ -+static struct scx_dsp_ctx __percpu *scx_dsp_ctx; -+ -+/* string formatting from BPF */ -+struct scx_bstr_buf { -+ u64 data[MAX_BPRINTF_VARARGS]; -+ char line[SCX_EXIT_MSG_LEN]; -+}; -+ -+static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); -+static struct scx_bstr_buf scx_exit_bstr_buf; -+ -+/* ops debug dump */ -+struct scx_dump_data { -+ s32 cpu; -+ bool first; -+ s32 cursor; -+ struct seq_buf *s; -+ const char *prefix; -+ struct scx_bstr_buf buf; -+}; -+ -+struct scx_dump_data scx_dump_data = { -+ .cpu = -1, -+}; -+ -+/* /sys/kernel/sched_ext interface */ -+static struct kset *scx_kset; -+static struct kobject *scx_root_kobj; -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+static void scx_bpf_kick_cpu(s32 cpu, u64 flags); -+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, -+ s64 exit_code, -+ const char *fmt, ...); -+ -+#define scx_ops_error_kind(err, fmt, args...) \ -+ scx_ops_exit_kind((err), 0, fmt, ##args) -+ -+#define scx_ops_exit(code, fmt, args...) \ -+ scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) -+ -+#define scx_ops_error(fmt, args...) \ -+ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) -+ -+#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) -+ -+static long jiffies_delta_msecs(unsigned long at, unsigned long now) -+{ -+ if (time_after(at, now)) -+ return jiffies_to_msecs(at - now); -+ else -+ return -(long)jiffies_to_msecs(now - at); -+} -+ -+/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ -+static u32 higher_bits(u32 flags) -+{ -+ return ~((1 << fls(flags)) - 1); -+} -+ -+/* return the mask with only the highest bit set */ -+static u32 highest_bit(u32 flags) -+{ -+ int bit = fls(flags); -+ return ((u64) 1 << bit) >> 1; -+} -+ -+/* -+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX -+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate -+ * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check -+ * whether it's running from an allowed context. -+ * -+ * @mask is constant, always inline to cull the mask calculations. -+ */ -+static __always_inline void scx_kf_allow(u32 mask) -+{ -+ /* nesting is allowed only in increasing scx_kf_mask order */ -+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, -+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", -+ current->scx.kf_mask, mask); -+ current->scx.kf_mask |= mask; -+ barrier(); -+} -+ -+static void scx_kf_disallow(u32 mask) -+{ -+ barrier(); -+ current->scx.kf_mask &= ~mask; -+} -+ -+#define SCX_CALL_OP(mask, op, args...) 
\ -+do { \ -+ if (mask) { \ -+ scx_kf_allow(mask); \ -+ scx_ops.op(args); \ -+ scx_kf_disallow(mask); \ -+ } else { \ -+ scx_ops.op(args); \ -+ } \ -+} while (0) -+ -+#define SCX_CALL_OP_RET(mask, op, args...) \ -+({ \ -+ __typeof__(scx_ops.op(args)) __ret; \ -+ if (mask) { \ -+ scx_kf_allow(mask); \ -+ __ret = scx_ops.op(args); \ -+ scx_kf_disallow(mask); \ -+ } else { \ -+ __ret = scx_ops.op(args); \ -+ } \ -+ __ret; \ -+}) -+ -+/* -+ * Some kfuncs are allowed only on the tasks that are subjects of the -+ * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such -+ * restrictions, the following SCX_CALL_OP_*() variants should be used when -+ * invoking scx_ops operations that take task arguments. These can only be used -+ * for non-nesting operations due to the way the tasks are tracked. -+ * -+ * kfuncs which can only operate on such tasks can in turn use -+ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on -+ * the specific task. -+ */ -+#define SCX_CALL_OP_TASK(mask, op, task, args...) \ -+do { \ -+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task; \ -+ SCX_CALL_OP(mask, op, task, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+} while (0) -+ -+#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ -+({ \ -+ __typeof__(scx_ops.op(task, ##args)) __ret; \ -+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ __ret; \ -+}) -+ -+#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ -+({ \ -+ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ -+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task0; \ -+ current->scx.kf_tasks[1] = task1; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ current->scx.kf_tasks[1] = NULL; \ -+ __ret; \ -+}) -+ -+/* @mask is constant, always inline to cull unnecessary branches */ -+static __always_inline bool scx_kf_allowed(u32 mask) -+{ -+ if (unlikely(!(current->scx.kf_mask & mask))) { -+ scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", -+ mask, current->scx.kf_mask); -+ return false; -+ } -+ -+ if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) { -+ scx_ops_error("sleepable kfunc called from non-sleepable context"); -+ return false; -+ } -+ -+ /* -+ * Enforce nesting boundaries. e.g. A kfunc which can be called from -+ * DISPATCH must not be called if we're running DEQUEUE which is nested -+ * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE -+ * boundary thanks to the above in_interrupt() check. 
-+ */ -+ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && -+ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { -+ scx_ops_error("cpu_release kfunc called from a nested operation"); -+ return false; -+ } -+ -+ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && -+ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { -+ scx_ops_error("dispatch kfunc called from a nested operation"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/* see SCX_CALL_OP_TASK() */ -+static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, -+ struct task_struct *p) -+{ -+ if (!scx_kf_allowed(mask)) -+ return false; -+ -+ if (unlikely((p != current->scx.kf_tasks[0] && -+ p != current->scx.kf_tasks[1]))) { -+ scx_ops_error("called on a task not being operated on"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/** -+ * nldsq_next_task - Iterate to the next task in a non-local DSQ -+ * @dsq: user dsq being iterated -+ * @cur: current position, %NULL to start iteration -+ * @rev: walk backwards -+ * -+ * Returns %NULL when iteration is finished. -+ */ -+static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, -+ struct task_struct *cur, bool rev) -+{ -+ struct list_head *list_node; -+ struct scx_dsq_node *dsq_node; -+ -+ lockdep_assert_held(&dsq->lock); -+ -+ if (cur) -+ list_node = &cur->scx.dsq_node.list; -+ else -+ list_node = &dsq->list; -+ -+ /* find the next task, need to skip BPF iteration cursors */ -+ do { -+ if (rev) -+ list_node = list_node->prev; -+ else -+ list_node = list_node->next; -+ -+ if (list_node == &dsq->list) -+ return NULL; -+ -+ dsq_node = container_of(list_node, struct scx_dsq_node, list); -+ } while (dsq_node->flags & SCX_TASK_DSQ_CURSOR); -+ -+ return container_of(dsq_node, struct task_struct, scx.dsq_node); -+} -+ -+#define nldsq_for_each_task(p, dsq) \ -+ for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ -+ (p) = nldsq_next_task((dsq), (p), false)) -+ -+ -+/* -+ * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] -+ * dispatch order. BPF-visible iterator is opaque and larger to allow future -+ * changes without breaking backward compatibility. Can be used with -+ * bpf_for_each(). See bpf_iter_scx_dsq_*(). -+ */ -+enum scx_dsq_iter_flags { -+ /* iterate in the reverse dispatch order */ -+ SCX_DSQ_ITER_REV = 1LLU << 0, -+ -+ __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV, -+}; -+ -+struct bpf_iter_scx_dsq_kern { -+ /* -+ * Must be the first field. Used to work around BPF restriction and pass -+ * in the iterator pointer to scx_bpf_consume_task(). -+ */ -+ struct bpf_iter_scx_dsq_kern *self; -+ -+ struct scx_dsq_node cursor; -+ struct scx_dispatch_q *dsq; -+ u64 dsq_seq; -+ u64 flags; -+} __attribute__((aligned(8))); -+ -+struct bpf_iter_scx_dsq { -+ u64 __opaque[12]; -+} __attribute__((aligned(8))); -+ -+ -+/* -+ * SCX task iterator. -+ */ -+struct scx_task_iter { -+ struct sched_ext_entity cursor; -+ struct task_struct *locked; -+ struct rq *rq; -+ struct rq_flags rf; -+}; -+ -+/** -+ * scx_task_iter_init - Initialize a task iterator -+ * @iter: iterator to init -+ * -+ * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, -+ * @iter must eventually be exited with scx_task_iter_exit(). -+ * -+ * scx_tasks_lock may be released between this and the first next() call or -+ * between any two next() calls.
If scx_tasks_lock is released between two -+ * next() calls, the caller is responsible for ensuring that the task being -+ * iterated remains accessible either through RCU read lock or obtaining a -+ * reference count. -+ * -+ * All tasks which existed when the iteration started are guaranteed to be -+ * visited as long as they still exist. -+ */ -+static void scx_task_iter_init(struct scx_task_iter *iter) -+{ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; -+ list_add(&iter->cursor.tasks_node, &scx_tasks); -+ iter->locked = NULL; -+} -+ -+/** -+ * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator -+ * @iter: iterator to unlock rq for -+ * -+ * If @iter is in the middle of a locked iteration, it may be locking the rq of -+ * the task currently being visited. Unlock the rq if so. This function can be -+ * safely called anytime during an iteration. -+ * -+ * Returns %true if the rq @iter was locking is unlocked. %false if @iter was -+ * not locking an rq. -+ */ -+static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) -+{ -+ if (iter->locked) { -+ task_rq_unlock(iter->rq, iter->locked, &iter->rf); -+ iter->locked = NULL; -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+/** -+ * scx_task_iter_exit - Exit a task iterator -+ * @iter: iterator to exit -+ * -+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. -+ * If the iterator holds a task's rq lock, that rq lock is released. See -+ * scx_task_iter_init() for details. -+ */ -+static void scx_task_iter_exit(struct scx_task_iter *iter) -+{ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ scx_task_iter_rq_unlock(iter); -+ list_del_init(&iter->cursor.tasks_node); -+} -+ -+/** -+ * scx_task_iter_next - Next task -+ * @iter: iterator to walk -+ * -+ * Visit the next task. See scx_task_iter_init() for details. -+ */ -+static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) -+{ -+ struct list_head *cursor = &iter->cursor.tasks_node; -+ struct sched_ext_entity *pos; -+ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ list_for_each_entry(pos, cursor, tasks_node) { -+ if (&pos->tasks_node == &scx_tasks) -+ return NULL; -+ if (!(pos->flags & SCX_TASK_CURSOR)) { -+ list_move(cursor, &pos->tasks_node); -+ return container_of(pos, struct task_struct, scx); -+ } -+ } -+ -+ /* can't happen, should always terminate at scx_tasks above */ -+ BUG(); -+} -+ -+/** -+ * scx_task_iter_next_locked - Next non-idle task with its rq locked -+ * @iter: iterator to walk -+ * @include_dead: Whether we should include dead tasks in the iteration -+ * -+ * Visit the non-idle task with its rq lock held. Allows callers to specify -+ * whether they would like to filter out dead tasks. See scx_task_iter_init() -+ * for details. -+ */ -+static struct task_struct * -+scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead) -+{ -+ struct task_struct *p; -+retry: -+ scx_task_iter_rq_unlock(iter); -+ -+ while ((p = scx_task_iter_next(iter))) { -+ /* -+ * is_idle_task() tests %PF_IDLE which may not be set for CPUs -+ * which haven't yet been onlined. Test sched_class directly. -+ */ -+ if (p->sched_class != &idle_sched_class) -+ break; -+ } -+ if (!p) -+ return NULL; -+ -+ iter->rq = task_rq_lock(p, &iter->rf); -+ iter->locked = p; -+ -+ /* -+ * If we see %TASK_DEAD, @p already disabled preemption, is about to do -+ * the final __schedule(), won't ever need to be scheduled again and can -+ * thus be safely ignored. 
If we don't see %TASK_DEAD, @p can't enter -+ * the final __schedle() while we're locking its rq and thus will stay -+ * alive until the rq is unlocked. -+ */ -+ if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD) -+ goto retry; -+ -+ return p; -+} -+ -+static enum scx_ops_enable_state scx_ops_enable_state(void) -+{ -+ return atomic_read(&scx_ops_enable_state_var); -+} -+ -+static enum scx_ops_enable_state -+scx_ops_set_enable_state(enum scx_ops_enable_state to) -+{ -+ return atomic_xchg(&scx_ops_enable_state_var, to); -+} -+ -+static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, -+ enum scx_ops_enable_state from) -+{ -+ int from_v = from; -+ -+ return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); -+} -+ -+static bool scx_ops_bypassing(void) -+{ -+ return unlikely(atomic_read(&scx_ops_bypass_depth)); -+} -+ -+/** -+ * wait_ops_state - Busy-wait the specified ops state to end -+ * @p: target task -+ * @opss: state to wait the end of -+ * -+ * Busy-wait for @p to transition out of @opss. This can only be used when the -+ * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also -+ * has load_acquire semantics to ensure that the caller can see the updates made -+ * in the enqueueing and dispatching paths. -+ */ -+static void wait_ops_state(struct task_struct *p, unsigned long opss) -+{ -+ do { -+ cpu_relax(); -+ } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); -+} -+ -+/** -+ * ops_cpu_valid - Verify a cpu number -+ * @cpu: cpu number which came from a BPF ops -+ * @where: extra information reported on error -+ * -+ * @cpu is a cpu number which came from the BPF scheduler and can be any value. -+ * Verify that it is in range and one of the possible cpus. If invalid, trigger -+ * an ops error. -+ */ -+static bool ops_cpu_valid(s32 cpu, const char *where) -+{ -+ if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { -+ return true; -+ } else { -+ scx_ops_error("invalid CPU %d%s%s", cpu, -+ where ? " " : "", where ?: ""); -+ return false; -+ } -+} -+ -+/** -+ * ops_sanitize_err - Sanitize a -errno value -+ * @ops_name: operation to blame on failure -+ * @err: -errno value to sanitize -+ * -+ * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return -+ * -%EPROTO. This is necessary because returning a rogue -errno up the chain can -+ * cause misbehaviors. For an example, a large negative return from -+ * ops.init_task() triggers an oops when passed up the call chain because the -+ * value fails IS_ERR() test after being encoded with ERR_PTR() and then is -+ * handled as a pointer. -+ */ -+static int ops_sanitize_err(const char *ops_name, s32 err) -+{ -+ if (err < 0 && err >= -MAX_ERRNO) -+ return err; -+ -+ scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); -+ return -EPROTO; -+} -+ -+/** -+ * touch_core_sched - Update timestamp used for core-sched task ordering -+ * @rq: rq to read clock from, must be locked -+ * @p: task to update the timestamp for -+ * -+ * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to -+ * implement global or local-DSQ FIFO ordering for core-sched. Should be called -+ * when a task becomes runnable and its turn on the CPU ends (e.g. slice -+ * exhaustion). -+ */ -+static void touch_core_sched(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SCHED_CORE -+ /* -+ * It's okay to update the timestamp spuriously. Use -+ * sched_core_disabled() which is cheaper than enabled(). 
-+ */ -+ if (!sched_core_disabled()) -+ p->scx.core_sched_at = rq_clock_task(rq); -+#endif -+} -+ -+/** -+ * touch_core_sched_dispatch - Update core-sched timestamp on dispatch -+ * @rq: rq to read clock from, must be locked -+ * @p: task being dispatched -+ * -+ * If the BPF scheduler implements custom core-sched ordering via -+ * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO -+ * ordering within each local DSQ. This function is called from dispatch paths -+ * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. -+ */ -+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ assert_clock_updated(rq); -+ -+#ifdef CONFIG_SCHED_CORE -+ if (SCX_HAS_OP(core_sched_before)) -+ touch_core_sched(rq, p); -+#endif -+} -+ -+static void update_curr_scx(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ u64 now = rq_clock_task(rq); -+ u64 delta_exec; -+ -+ if (time_before_eq64(now, curr->se.exec_start)) -+ return; -+ -+ delta_exec = now - curr->se.exec_start; -+ curr->se.exec_start = now; -+ curr->se.sum_exec_runtime += delta_exec; -+ account_group_exec_runtime(curr, delta_exec); -+ cgroup_account_cputime(curr, delta_exec); -+ -+ if (curr->scx.slice != SCX_SLICE_INF) { -+ curr->scx.slice -= min(curr->scx.slice, delta_exec); -+ if (!curr->scx.slice) -+ touch_core_sched(rq, curr); -+ } -+} -+ -+static bool scx_dsq_priq_less(struct rb_node *node_a, -+ const struct rb_node *node_b) -+{ -+ const struct task_struct *a = -+ container_of(node_a, struct task_struct, scx.dsq_node.priq); -+ const struct task_struct *b = -+ container_of(node_b, struct task_struct, scx.dsq_node.priq); -+ -+ return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); -+} -+ -+static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) -+{ -+ /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ -+ WRITE_ONCE(dsq->nr, dsq->nr + delta); -+} -+ -+static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, -+ u64 enq_flags) -+{ -+ bool is_local = dsq->id == SCX_DSQ_LOCAL; -+ -+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.list)); -+ WARN_ON_ONCE((p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) || -+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); -+ -+ if (!is_local) { -+ raw_spin_lock(&dsq->lock); -+ if (unlikely(dsq->id == SCX_DSQ_INVALID)) { -+ scx_ops_error("attempting to dispatch to a destroyed dsq"); -+ /* fall back to the global dsq */ -+ raw_spin_unlock(&dsq->lock); -+ dsq = &scx_dsq_global; -+ raw_spin_lock(&dsq->lock); -+ } -+ } -+ -+ if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && -+ (enq_flags & SCX_ENQ_DSQ_PRIQ))) { -+ /* -+ * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from -+ * their FIFO queues. To avoid confusion and accidentally -+ * starving vtime-dispatched tasks by FIFO-dispatched tasks, we -+ * disallow any internal DSQ from doing vtime ordering of -+ * tasks. -+ */ -+ scx_ops_error("cannot use vtime ordering for built-in DSQs"); -+ enq_flags &= ~SCX_ENQ_DSQ_PRIQ; -+ } -+ -+ if (enq_flags & SCX_ENQ_DSQ_PRIQ) { -+ struct rb_node *rbp; -+ -+ /* -+ * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are -+ * linked to both the rbtree and list on PRIQs, this can only be -+ * tested easily when adding the first task. 
-+ */ -+ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && -+ nldsq_next_task(dsq, NULL, false))) -+ scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", -+ dsq->id); -+ -+ p->scx.dsq_node.flags |= SCX_TASK_DSQ_ON_PRIQ; -+ rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); -+ -+ /* -+ * Find the previous task and insert after it on the list so -+ * that @dsq->list is vtime ordered. -+ */ -+ rbp = rb_prev(&p->scx.dsq_node.priq); -+ if (rbp) { -+ struct task_struct *prev = -+ container_of(rbp, struct task_struct, -+ scx.dsq_node.priq); -+ list_add(&p->scx.dsq_node.list, &prev->scx.dsq_node.list); -+ } else { -+ list_add(&p->scx.dsq_node.list, &dsq->list); -+ } -+ } else { -+ /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ -+ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) -+ scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", -+ dsq->id); -+ -+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) -+ list_add(&p->scx.dsq_node.list, &dsq->list); -+ else -+ list_add_tail(&p->scx.dsq_node.list, &dsq->list); -+ } -+ -+ /* seq records the order tasks are queued, used by BPF DSQ iterator */ -+ dsq->seq++; -+ p->scx.dsq_seq = dsq->seq; -+ -+ dsq_mod_nr(dsq, 1); -+ WRITE_ONCE(p->scx.dsq, dsq); -+ -+ /* -+ * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the -+ * direct dispatch path, but we clear them here because the direct -+ * dispatch verdict may be overridden on the enqueue path during e.g. -+ * bypass. -+ */ -+ p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; -+ p->scx.ddsp_enq_flags = 0; -+ -+ /* -+ * We're transitioning out of QUEUEING or DISPATCHING. store_release to -+ * match waiters' load_acquire. -+ */ -+ if (enq_flags & SCX_ENQ_CLEAR_OPSS) -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ if (is_local) { -+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); -+ bool preempt = false; -+ -+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && -+ rq->curr->sched_class == &ext_sched_class) { -+ rq->curr->scx.slice = 0; -+ preempt = true; -+ } -+ -+ if (preempt || sched_class_above(&ext_sched_class, -+ rq->curr->sched_class)) -+ resched_curr(rq); -+ } else { -+ raw_spin_unlock(&dsq->lock); -+ } -+} -+ -+static void task_unlink_from_dsq(struct task_struct *p, -+ struct scx_dispatch_q *dsq) -+{ -+ if (p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) { -+ rb_erase(&p->scx.dsq_node.priq, &dsq->priq); -+ RB_CLEAR_NODE(&p->scx.dsq_node.priq); -+ p->scx.dsq_node.flags &= ~SCX_TASK_DSQ_ON_PRIQ; -+ } -+ -+ list_del_init(&p->scx.dsq_node.list); -+} -+ -+static bool task_linked_on_dsq(struct task_struct *p) -+{ -+ return !list_empty(&p->scx.dsq_node.list); -+} -+ -+static void dispatch_dequeue(struct rq *rq, struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq = p->scx.dsq; -+ bool is_local = dsq == &rq->scx.local_dsq; -+ -+ if (!dsq) { -+ WARN_ON_ONCE(task_linked_on_dsq(p)); -+ /* -+ * When dispatching directly from the BPF scheduler to a local -+ * DSQ, the task isn't associated with any DSQ but -+ * @p->scx.holding_cpu may be set under the protection of -+ * %SCX_OPSS_DISPATCHING. -+ */ -+ if (p->scx.holding_cpu >= 0) -+ p->scx.holding_cpu = -1; -+ return; -+ } -+ -+ if (!is_local) -+ raw_spin_lock(&dsq->lock); -+ -+ /* -+ * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node -+ * can't change underneath us. 
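/*
 * Illustrative aside (not kernel code): a user-space sketch of the invariant
 * dispatch_enqueue() maintains for SCX_ENQ_DSQ_PRIQ above - the task list of a
 * priority DSQ stays sorted by dsq_vtime. The kernel additionally keeps an
 * rbtree for O(log n) insertion; this sketch only shows the ordered-list idea.
 * Names are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_task {
    uint64_t vtime;
    struct fake_task *next;
};

/* insert @p before the first task with a later vtime, keeping the list sorted */
static void priq_insert(struct fake_task **head, struct fake_task *p)
{
    struct fake_task **pos = head;

    while (*pos && (*pos)->vtime <= p->vtime)
        pos = &(*pos)->next;
    p->next = *pos;
    *pos = p;
}

int main(void)
{
    struct fake_task a = { .vtime = 300 }, b = { .vtime = 100 }, c = { .vtime = 200 };
    struct fake_task *head = NULL, *p;

    priq_insert(&head, &a);
    priq_insert(&head, &b);
    priq_insert(&head, &c);
    for (p = head; p; p = p->next)
        printf("vtime %llu\n", (unsigned long long)p->vtime);   /* 100, 200, 300 */
    return 0;
}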
-+ */ -+ if (p->scx.holding_cpu < 0) { -+ /* @p must still be on @dsq, dequeue */ -+ WARN_ON_ONCE(!task_linked_on_dsq(p)); -+ task_unlink_from_dsq(p, dsq); -+ dsq_mod_nr(dsq, -1); -+ } else { -+ /* -+ * We're racing against dispatch_to_local_dsq() which already -+ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the -+ * holding_cpu which tells dispatch_to_local_dsq() that it lost -+ * the race. -+ */ -+ WARN_ON_ONCE(task_linked_on_dsq(p)); -+ p->scx.holding_cpu = -1; -+ } -+ WRITE_ONCE(p->scx.dsq, NULL); -+ -+ if (!is_local) -+ raw_spin_unlock(&dsq->lock); -+} -+ -+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) -+{ -+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -+} -+ -+static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -+{ -+ lockdep_assert(rcu_read_lock_any_held()); -+ -+ if (dsq_id == SCX_DSQ_GLOBAL) -+ return &scx_dsq_global; -+ else -+ return find_user_dsq(dsq_id); -+} -+ -+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, -+ struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ if (dsq_id == SCX_DSQ_LOCAL) -+ return &rq->scx.local_dsq; -+ -+ dsq = find_non_local_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", -+ dsq_id, p->comm, p->pid); -+ return &scx_dsq_global; -+ } -+ -+ return dsq; -+} -+ -+static void mark_direct_dispatch(struct task_struct *ddsp_task, -+ struct task_struct *p, u64 dsq_id, -+ u64 enq_flags) -+{ -+ /* -+ * Mark that dispatch already happened from ops.select_cpu() or -+ * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value -+ * which can never match a valid task pointer. -+ */ -+ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); -+ -+ /* @p must match the task on the enqueue path */ -+ if (unlikely(p != ddsp_task)) { -+ if (IS_ERR(ddsp_task)) -+ scx_ops_error("%s[%d] already direct-dispatched", -+ p->comm, p->pid); -+ else -+ scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", -+ ddsp_task->comm, ddsp_task->pid, -+ p->comm, p->pid); -+ return; -+ } -+ -+ /* -+ * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because -+ * dispatching to the local DSQ of a different CPU requires unlocking -+ * the current rq which isn't allowed in the enqueue path. Use -+ * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. -+ */ -+ if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { -+ scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); -+ return; -+ } -+ -+ WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); -+ WARN_ON_ONCE(p->scx.ddsp_enq_flags); -+ -+ p->scx.ddsp_dsq_id = dsq_id; -+ p->scx.ddsp_enq_flags = enq_flags; -+} -+ -+static void direct_dispatch(struct task_struct *p, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ touch_core_sched_dispatch(task_rq(p), p); -+ -+ enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); -+ dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); -+ dispatch_enqueue(dsq, p, enq_flags); -+} -+ -+static bool scx_rq_online(struct rq *rq) -+{ -+ return likely(rq->scx.flags & SCX_RQ_ONLINE); -+} -+ -+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, -+ int sticky_cpu) -+{ -+ struct task_struct **ddsp_taskp; -+ unsigned long qseq; -+ -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ /* rq migration */ -+ if (sticky_cpu == cpu_of(rq)) -+ goto local_norefill; -+ -+ /* -+ * If !rq->online, we already told the BPF scheduler that the CPU is -+ * offline. 
We're just trying to on/offline the CPU. Don't bother the -+ * BPF scheduler. -+ */ -+ if (!scx_rq_online(rq)) -+ goto local; -+ -+ if (scx_ops_bypassing()) { -+ if (enq_flags & SCX_ENQ_LAST) -+ goto local; -+ else -+ goto global; -+ } -+ -+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) -+ goto direct; -+ -+ /* see %SCX_OPS_ENQ_EXITING */ -+ if (!static_branch_unlikely(&scx_ops_enq_exiting) && -+ unlikely(p->flags & PF_EXITING)) -+ goto local; -+ -+ /* see %SCX_OPS_ENQ_LAST */ -+ if (!static_branch_unlikely(&scx_ops_enq_last) && -+ (enq_flags & SCX_ENQ_LAST)) -+ goto local; -+ -+ if (!SCX_HAS_OP(enqueue)) -+ goto global; -+ -+ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ -+ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; -+ -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); -+ -+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); -+ WARN_ON_ONCE(*ddsp_taskp); -+ *ddsp_taskp = p; -+ -+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); -+ -+ *ddsp_taskp = NULL; -+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) -+ goto direct; -+ -+ /* -+ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or -+ * dequeue may be waiting. The store_release matches their load_acquire. -+ */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); -+ return; -+ -+direct: -+ direct_dispatch(p, enq_flags); -+ return; -+ -+local: -+ /* -+ * For task-ordering, slice refill must be treated as implying the end -+ * of the current slice. Otherwise, the longer @p stays on the CPU, the -+ * higher priority it becomes from scx_prio_less()'s POV. -+ */ -+ touch_core_sched(rq, p); -+ p->scx.slice = SCX_SLICE_DFL; -+local_norefill: -+ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); -+ return; -+ -+global: -+ touch_core_sched(rq, p); /* see the comment in local: */ -+ p->scx.slice = SCX_SLICE_DFL; -+ dispatch_enqueue(&scx_dsq_global, p, enq_flags); -+} -+ -+static bool task_runnable(const struct task_struct *p) -+{ -+ return !list_empty(&p->scx.runnable_node); -+} -+ -+static void set_task_runnable(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ -+ if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { -+ p->scx.runnable_at = jiffies; -+ p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; -+ } -+ -+ /* -+ * list_add_tail() must be used. scx_ops_bypass() depends on tasks being -+ * appened to the runnable_list. -+ */ -+ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); -+} -+ -+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) -+{ -+ list_del_init(&p->scx.runnable_node); -+ if (reset_runnable_at) -+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; -+} -+ -+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) -+{ -+ int sticky_cpu = p->scx.sticky_cpu; -+ -+ enq_flags |= rq->scx.extra_enq_flags; -+ -+ if (sticky_cpu >= 0) -+ p->scx.sticky_cpu = -1; -+ -+ /* -+ * Restoring a running task will be immediately followed by -+ * set_next_task_scx() which expects the task to not be on the BPF -+ * scheduler as tasks can only start running through local DSQs. Force -+ * direct-dispatch into the local DSQ by setting the sticky_cpu. 
-+ */ -+ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) -+ sticky_cpu = cpu_of(rq); -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ WARN_ON_ONCE(!task_runnable(p)); -+ return; -+ } -+ -+ set_task_runnable(rq, p); -+ p->scx.flags |= SCX_TASK_QUEUED; -+ rq->scx.nr_running++; -+ add_nr_running(rq, 1); -+ -+ if (SCX_HAS_OP(runnable)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); -+ -+ if (enq_flags & SCX_ENQ_WAKEUP) -+ touch_core_sched(rq, p); -+ -+ do_enqueue_task(rq, p, enq_flags, sticky_cpu); -+} -+ -+static void ops_dequeue(struct task_struct *p, u64 deq_flags) -+{ -+ unsigned long opss; -+ -+ /* dequeue is always temporary, don't reset runnable_at */ -+ clr_task_runnable(p, false); -+ -+ /* acquire ensures that we see the preceding updates on QUEUED */ -+ opss = atomic_long_read_acquire(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_NONE: -+ break; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * QUEUEING is started and finished while holding @p's rq lock. -+ * As we're holding the rq lock now, we shouldn't see QUEUEING. -+ */ -+ BUG(); -+ case SCX_OPSS_QUEUED: -+ if (SCX_HAS_OP(dequeue)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); -+ -+ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_NONE)) -+ break; -+ fallthrough; -+ case SCX_OPSS_DISPATCHING: -+ /* -+ * If @p is being dispatched from the BPF scheduler to a DSQ, -+ * wait for the transfer to complete so that @p doesn't get -+ * added to its DSQ after dequeueing is complete. -+ * -+ * As we're waiting on DISPATCHING with the rq locked, the -+ * dispatching side shouldn't try to lock the rq while -+ * DISPATCHING is set. See dispatch_to_local_dsq(). -+ * -+ * DISPATCHING shouldn't have qseq set and control can reach -+ * here with NONE @opss from the above QUEUED case block. -+ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. -+ */ -+ wait_ops_state(p, SCX_OPSS_DISPATCHING); -+ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ break; -+ } -+} -+ -+static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) -+{ -+ if (!(p->scx.flags & SCX_TASK_QUEUED)) { -+ WARN_ON_ONCE(task_runnable(p)); -+ return; -+ } -+ -+ ops_dequeue(p, deq_flags); -+ -+ /* -+ * A currently running task which is going off @rq first gets dequeued -+ * and then stops running. As we want running <-> stopping transitions -+ * to be contained within runnable <-> quiescent transitions, trigger -+ * ->stopping() early here instead of in put_prev_task_scx(). -+ * -+ * @p may go through multiple stopping <-> running transitions between -+ * here and put_prev_task_scx() if task attribute changes occur while -+ * balance_scx() leaves @rq unlocked. However, they don't contain any -+ * information meaningful to the BPF scheduler and can be suppressed by -+ * skipping the callbacks if the task is !QUEUED. 
-+ */ -+ if (SCX_HAS_OP(stopping) && task_current(rq, p)) { -+ update_curr_scx(rq); -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); -+ } -+ -+ if (SCX_HAS_OP(quiescent)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); -+ -+ if (deq_flags & SCX_DEQ_SLEEP) -+ p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; -+ else -+ p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; -+ -+ p->scx.flags &= ~SCX_TASK_QUEUED; -+ rq->scx.nr_running--; -+ sub_nr_running(rq, 1); -+ -+ dispatch_dequeue(rq, p); -+} -+ -+static void yield_task_scx(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (SCX_HAS_OP(yield)) -+ SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); -+ else -+ p->scx.slice = 0; -+} -+ -+static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) -+{ -+ struct task_struct *from = rq->curr; -+ -+ if (SCX_HAS_OP(yield)) -+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); -+ else -+ return false; -+} -+ -+#ifdef CONFIG_SMP -+/** -+ * move_task_to_local_dsq - Move a task from a different rq to a local DSQ -+ * @rq: rq to move the task into, currently locked -+ * @p: task to move -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * Move @p which is currently on a different rq to @rq's local DSQ. The caller -+ * must: -+ * -+ * 1. Start with exclusive access to @p either through its DSQ lock or -+ * %SCX_OPSS_DISPATCHING flag. -+ * -+ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). -+ * -+ * 3. Remember task_rq(@p). Release the exclusive access so that we don't -+ * deadlock with dequeue. -+ * -+ * 4. Lock @rq and the task_rq from #3. -+ * -+ * 5. Call this function. -+ * -+ * Returns %true if @p was successfully moved. %false after racing dequeue and -+ * losing. -+ */ -+static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, -+ u64 enq_flags) -+{ -+ struct rq *task_rq; -+ -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * If dequeue got to @p while we were trying to lock both rq's, it'd -+ * have cleared @p->scx.holding_cpu to -1. While other cpus may have -+ * updated it to different values afterwards, as this operation can't be -+ * preempted or recurse, @p->scx.holding_cpu can never become -+ * raw_smp_processor_id() again before we're done. Thus, we can tell -+ * whether we lost to dequeue by testing whether @p->scx.holding_cpu is -+ * still raw_smp_processor_id(). -+ * -+ * See dispatch_dequeue() for the counterpart. -+ */ -+ if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) -+ return false; -+ -+ /* @p->rq couldn't have changed if we're still the holding cpu */ -+ task_rq = task_rq(p); -+ lockdep_assert_rq_held(task_rq); -+ -+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); -+ deactivate_task(task_rq, p, 0); -+ set_task_cpu(p, cpu_of(rq)); -+ p->scx.sticky_cpu = cpu_of(rq); -+ -+ /* -+ * We want to pass scx-specific enq_flags but activate_task() will -+ * truncate the upper 32 bit. As we own @rq, we can pass them through -+ * @rq->scx.extra_enq_flags instead. -+ */ -+ WARN_ON_ONCE(rq->scx.extra_enq_flags); -+ rq->scx.extra_enq_flags = enq_flags; -+ activate_task(rq, p, 0); -+ rq->scx.extra_enq_flags = 0; -+ -+ return true; -+} -+ -+/** -+ * dispatch_to_local_dsq_lock - Ensure source and desitnation rq's are locked -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to -+ * -+ * We're holding @rq lock and trying to dispatch a task from @src_rq to -+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. 
Whether -+ * @rq stays locked isn't important as long as the state is restored after -+ * dispatch_to_local_dsq_unlock(). -+ */ -+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ rq_unpin_lock(rq, rf); -+ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(rq); -+ raw_spin_rq_lock(dst_rq); -+ } else if (rq == src_rq) { -+ double_lock_balance(rq, dst_rq); -+ rq_repin_lock(rq, rf); -+ } else if (rq == dst_rq) { -+ double_lock_balance(rq, src_rq); -+ rq_repin_lock(rq, rf); -+ } else { -+ raw_spin_rq_unlock(rq); -+ double_rq_lock(src_rq, dst_rq); -+ } -+} -+ -+/** -+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to -+ * -+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. -+ */ -+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(dst_rq); -+ raw_spin_rq_lock(rq); -+ rq_repin_lock(rq, rf); -+ } else if (rq == src_rq) { -+ double_unlock_balance(rq, dst_rq); -+ } else if (rq == dst_rq) { -+ double_unlock_balance(rq, src_rq); -+ } else { -+ double_rq_unlock(src_rq, dst_rq); -+ raw_spin_rq_lock(rq); -+ rq_repin_lock(rq, rf); -+ } -+} -+#endif /* CONFIG_SMP */ -+ -+static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, -+ struct task_struct *p) -+{ -+ lockdep_assert_held(&dsq->lock); /* released on return */ -+ -+ /* @dsq is locked and @p is on this rq */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list); -+ dsq_mod_nr(dsq, -1); -+ dsq_mod_nr(&rq->scx.local_dsq, 1); -+ WRITE_ONCE(p->scx.dsq, &rq->scx.local_dsq); -+ raw_spin_unlock(&dsq->lock); -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p -+ * can be pulled to @rq. -+ */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ if (unlikely(is_migration_disabled(p))) -+ return false; -+ if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p))) -+ return false; -+ if (!scx_rq_online(rq)) -+ return false; -+ return true; -+} -+ -+static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq, -+ struct task_struct *p, struct rq *task_rq) -+{ -+ bool moved = false; -+ -+ lockdep_assert_held(&dsq->lock); /* released on return */ -+ -+ /* -+ * @dsq is locked and @p is on a remote rq. @p is currently protected by -+ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab -+ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the -+ * rq lock or fail, do a little dancing from our side. See -+ * move_task_to_local_dsq(). 
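/*
 * Illustrative aside (not kernel code): the generic idea behind the
 * double_rq_lock()/double_lock_balance() dances used above. When two runqueue
 * locks must be held at once, acquire them in one globally consistent order
 * (here: by address) so two CPUs doing the same thing cannot deadlock. This is
 * a hedged user-space sketch with invented names; the kernel helpers are more
 * involved and also handle re-pinning and lock re-acquisition.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct fake_rq {
    pthread_mutex_t lock;
    int nr_running;
};

static void fake_double_rq_lock(struct fake_rq *a, struct fake_rq *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
        return;
    }
    if ((uintptr_t)a > (uintptr_t)b) {  /* consistent order: lower address first */
        struct fake_rq *tmp = a; a = b; b = tmp;
    }
    pthread_mutex_lock(&a->lock);
    pthread_mutex_lock(&b->lock);
}

static void fake_double_rq_unlock(struct fake_rq *a, struct fake_rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
    struct fake_rq src = { PTHREAD_MUTEX_INITIALIZER, 1 };
    struct fake_rq dst = { PTHREAD_MUTEX_INITIALIZER, 0 };

    fake_double_rq_lock(&src, &dst);
    dst.nr_running++;                   /* "migrate" a task under both locks */
    src.nr_running--;
    fake_double_rq_unlock(&src, &dst);
    printf("src=%d dst=%d\n", src.nr_running, dst.nr_running);
    return 0;
}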
-+ */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ dsq_mod_nr(dsq, -1); -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ raw_spin_unlock(&dsq->lock); -+ -+ rq_unpin_lock(rq, rf); -+ double_lock_balance(rq, task_rq); -+ rq_repin_lock(rq, rf); -+ -+ moved = move_task_to_local_dsq(rq, p, 0); -+ -+ double_unlock_balance(rq, task_rq); -+ -+ return moved; -+} -+#else /* CONFIG_SMP */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } -+static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq, -+ struct task_struct *p, struct rq *task_rq) { return false; } -+#endif /* CONFIG_SMP */ -+ -+static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq) -+{ -+ struct task_struct *p; -+retry: -+ if (list_empty(&dsq->list)) -+ return false; -+ -+ raw_spin_lock(&dsq->lock); -+ -+ nldsq_for_each_task(p, dsq) { -+ struct rq *task_rq = task_rq(p); -+ -+ if (rq == task_rq) { -+ consume_local_task(rq, dsq, p); -+ return true; -+ } -+ -+ if (task_can_run_on_remote_rq(p, rq)) { -+ if (likely(consume_remote_task(rq, rf, dsq, p, task_rq))) -+ return true; -+ goto retry; -+ } -+ } -+ -+ raw_spin_unlock(&dsq->lock); -+ return false; -+} -+ -+enum dispatch_to_local_dsq_ret { -+ DTL_DISPATCHED, /* successfully dispatched */ -+ DTL_LOST, /* lost race to dequeue */ -+ DTL_NOT_LOCAL, /* destination is not a local DSQ */ -+ DTL_INVALID, /* invalid local dsq_id */ -+}; -+ -+/** -+ * dispatch_to_local_dsq - Dispatch a task to a local dsq -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @dsq_id: destination dsq ID -+ * @p: task to dispatch -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by -+ * @dsq_id. This function performs all the synchronization dancing needed -+ * because local DSQs are protected with rq locks. -+ * -+ * The caller must have exclusive ownership of @p (e.g. through -+ * %SCX_OPSS_DISPATCHING). -+ */ -+static enum dispatch_to_local_dsq_ret -+dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, -+ struct task_struct *p, u64 enq_flags) -+{ -+ struct rq *src_rq = task_rq(p); -+ struct rq *dst_rq; -+ -+ /* -+ * We're synchronized against dequeue through DISPATCHING. As @p can't -+ * be dequeued, its task_rq and cpus_allowed are stable too. -+ */ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ dst_rq = rq; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) -+ return DTL_INVALID; -+ dst_rq = cpu_rq(cpu); -+ } else { -+ return DTL_NOT_LOCAL; -+ } -+ -+ /* if dispatching to @rq that @p is already on, no lock dancing needed */ -+ if (rq == src_rq && rq == dst_rq) { -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags | SCX_ENQ_CLEAR_OPSS); -+ return DTL_DISPATCHED; -+ } -+ -+#ifdef CONFIG_SMP -+ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { -+ struct rq *locked_dst_rq = dst_rq; -+ bool dsp; -+ -+ /* -+ * @p is on a possibly remote @src_rq which we need to lock to -+ * move the task. If dequeue is in progress, it'd be locking -+ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq -+ * lock while holding DISPATCHING. 
-+ * -+ * As DISPATCHING guarantees that @p is wholly ours, we can -+ * pretend that we're moving from a DSQ and use the same -+ * mechanism - mark the task under transfer with holding_cpu, -+ * release DISPATCHING and then follow the same protocol. -+ */ -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ -+ /* store_release ensures that dequeue sees the above */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); -+ -+ /* -+ * We don't require the BPF scheduler to avoid dispatching to -+ * offline CPUs mostly for convenience but also because CPUs can -+ * go offline between scx_bpf_dispatch() calls and here. If @p -+ * is destined to an offline CPU, queue it on its current CPU -+ * instead, which should always be safe. As this is an allowed -+ * behavior, don't trigger an ops error. -+ */ -+ if (!scx_rq_online(dst_rq)) -+ dst_rq = src_rq; -+ -+ if (src_rq == dst_rq) { -+ /* -+ * As @p is staying on the same rq, there's no need to -+ * go through the full deactivate/activate cycle. -+ * Optimize by abbreviating the operations in -+ * move_task_to_local_dsq(). -+ */ -+ dsp = p->scx.holding_cpu == raw_smp_processor_id(); -+ if (likely(dsp)) { -+ p->scx.holding_cpu = -1; -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags); -+ } -+ } else { -+ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); -+ } -+ -+ /* if the destination CPU is idle, wake it up */ -+ if (dsp && p->sched_class < dst_rq->curr->sched_class) -+ resched_curr(dst_rq); -+ -+ dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); -+ -+ return dsp ? DTL_DISPATCHED : DTL_LOST; -+ } -+#endif /* CONFIG_SMP */ -+ -+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", -+ cpu_of(dst_rq), p->comm, p->pid); -+ return DTL_INVALID; -+} -+ -+/** -+ * finish_dispatch - Asynchronously finish dispatching a task -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @p: task to finish dispatching -+ * @qseq_at_dispatch: qseq when @p started getting dispatched -+ * @dsq_id: destination DSQ ID -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * Dispatching to local DSQs may need to wait for queueing to complete or -+ * require rq lock dancing. As we don't wanna do either while inside -+ * ops.dispatch() to avoid locking order inversion, we split dispatching into -+ * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the -+ * task and its qseq. Once ops.dispatch() returns, this function is called to -+ * finish up. -+ * -+ * There is no guarantee that @p is still valid for dispatching or even that it -+ * was valid in the first place. Make sure that the task is still owned by the -+ * BPF scheduler and claim the ownership before dispatching. -+ */ -+static void finish_dispatch(struct rq *rq, struct rq_flags *rf, -+ struct task_struct *p, -+ unsigned long qseq_at_dispatch, -+ u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long opss; -+ -+ touch_core_sched_dispatch(rq, p); -+retry: -+ /* -+ * No need for _acquire here. @p is accessed only after a successful -+ * try_cmpxchg to DISPATCHING. 
-+ */ -+ opss = atomic_long_read(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_DISPATCHING: -+ case SCX_OPSS_NONE: -+ /* someone else already got to it */ -+ return; -+ case SCX_OPSS_QUEUED: -+ /* -+ * If qseq doesn't match, @p has gone through at least one -+ * dispatch/dequeue and re-enqueue cycle between -+ * scx_bpf_dispatch() and here and we have no claim on it. -+ */ -+ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) -+ return; -+ -+ /* -+ * While we know @p is accessible, we don't yet have a claim on -+ * it - the BPF scheduler is allowed to dispatch tasks -+ * spuriously and there can be a racing dequeue attempt. Let's -+ * claim @p by atomically transitioning it from QUEUED to -+ * DISPATCHING. -+ */ -+ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_DISPATCHING))) -+ break; -+ goto retry; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * do_enqueue_task() is in the process of transferring the task -+ * to the BPF scheduler while holding @p's rq lock. As we aren't -+ * holding any kernel or BPF resource that the enqueue path may -+ * depend upon, it's safe to wait. -+ */ -+ wait_ops_state(p, opss); -+ goto retry; -+ } -+ -+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { -+ case DTL_DISPATCHED: -+ break; -+ case DTL_LOST: -+ break; -+ case DTL_INVALID: -+ dsq_id = SCX_DSQ_GLOBAL; -+ fallthrough; -+ case DTL_NOT_LOCAL: -+ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), -+ dsq_id, p); -+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ break; -+ } -+} -+ -+static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ u32 u; -+ -+ for (u = 0; u < dspc->cursor; u++) { -+ struct scx_dsp_buf_ent *ent = &dspc->buf[u]; -+ -+ finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, -+ ent->enq_flags); -+ } -+ -+ dspc->nr_tasks += dspc->cursor; -+ dspc->cursor = 0; -+} -+ -+static int balance_one(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf, bool local) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ bool prev_on_scx = prev->sched_class == &ext_sched_class; -+ int nr_loops = SCX_DSP_MAX_LOOPS; -+ bool has_tasks = false; -+ -+ lockdep_assert_rq_held(rq); -+ rq->scx.flags |= SCX_RQ_BALANCING; -+ -+ if (static_branch_unlikely(&scx_ops_cpu_preempt) && -+ unlikely(rq->scx.cpu_released)) { -+ /* -+ * If the previous sched_class for the current CPU was not SCX, -+ * notify the BPF scheduler that it again has control of the -+ * core. This callback complements ->cpu_release(), which is -+ * emitted in scx_next_task_picked(). -+ */ -+ if (SCX_HAS_OP(cpu_acquire)) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq), -+ NULL); -+ rq->scx.cpu_released = false; -+ } -+ -+ if (prev_on_scx) { -+ WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); -+ update_curr_scx(rq); -+ -+ /* -+ * If @prev is runnable & has slice left, it has priority and -+ * fetching more just increases latency for the fetched tasks. -+ * Tell put_prev_task_scx() to put @prev on local_dsq. If the -+ * BPF scheduler wants to handle this explicitly, it should -+ * implement ->cpu_released(). -+ * -+ * See scx_ops_disable_workfn() for the explanation on the -+ * bypassing test. -+ * -+ * When balancing a remote CPU for core-sched, there won't be a -+ * following put_prev_task_scx() call and we don't own -+ * %SCX_TASK_BAL_KEEP. 
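/*
 * Illustrative aside (not kernel code): the claim step performed by
 * finish_dispatch() above, sketched with C11 atomics in user space. The state
 * word packs a state in the low bits and a queue sequence number above them;
 * a dispatcher only owns the task if it can atomically flip QUEUED|qseq to
 * DISPATCHING. State values, widths, and names here are invented for the
 * sketch and do not match the real SCX_OPSS_* encoding.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ST_QUEUED       0x1ul
#define ST_DISPATCHING  0x2ul
#define QSEQ_SHIFT      2

static _Atomic unsigned long ops_state;

static bool try_claim(unsigned long qseq_at_dispatch)
{
    unsigned long old = ST_QUEUED | (qseq_at_dispatch << QSEQ_SHIFT);

    /* fails if the task was re-enqueued (qseq changed) or already claimed */
    return atomic_compare_exchange_strong(&ops_state, &old, ST_DISPATCHING);
}

int main(void)
{
    atomic_store(&ops_state, ST_QUEUED | (7ul << QSEQ_SHIFT));
    printf("claim with stale qseq: %d\n", try_claim(6));   /* 0 */
    printf("claim with right qseq: %d\n", try_claim(7));   /* 1 */
    printf("second claim:          %d\n", try_claim(7));   /* 0, already claimed */
    return 0;
}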
Instead, pick_task_scx() will test the -+ * same conditions later and pick @rq->curr accordingly. -+ */ -+ if ((prev->scx.flags & SCX_TASK_QUEUED) && -+ prev->scx.slice && !scx_ops_bypassing()) { -+ if (local) -+ prev->scx.flags |= SCX_TASK_BAL_KEEP; -+ goto has_tasks; -+ } -+ } -+ -+ /* if there already are tasks to run, nothing to do */ -+ if (rq->scx.local_dsq.nr) -+ goto has_tasks; -+ -+ if (consume_dispatch_q(rq, rf, &scx_dsq_global)) -+ goto has_tasks; -+ -+ if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) -+ goto out; -+ -+ dspc->rq = rq; -+ dspc->rf = rf; -+ -+ /* -+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, -+ * the local DSQ might still end up empty after a successful -+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() -+ * produced some tasks, retry. The BPF scheduler may depend on this -+ * looping behavior to simplify its implementation. -+ */ -+ do { -+ dspc->nr_tasks = 0; -+ -+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), -+ prev_on_scx ? prev : NULL); -+ -+ flush_dispatch_buf(rq, rf); -+ -+ if (rq->scx.local_dsq.nr) -+ goto has_tasks; -+ if (consume_dispatch_q(rq, rf, &scx_dsq_global)) -+ goto has_tasks; -+ -+ /* -+ * ops.dispatch() can trap us in this loop by repeatedly -+ * dispatching ineligible tasks. Break out once in a while to -+ * allow the watchdog to run. As IRQ can't be enabled in -+ * balance(), we want to complete this scheduling cycle and then -+ * start a new one. IOW, we want to call resched_curr() on the -+ * next, most likely idle, task, not the current one. Use -+ * scx_bpf_kick_cpu() for deferred kicking. -+ */ -+ if (unlikely(!--nr_loops)) { -+ scx_bpf_kick_cpu(cpu_of(rq), 0); -+ break; -+ } -+ } while (dspc->nr_tasks); -+ -+ goto out; -+ -+has_tasks: -+ has_tasks = true; -+out: -+ rq->scx.flags &= ~SCX_RQ_BALANCING; -+ return has_tasks; -+} -+ -+static int balance_scx(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf) -+{ -+ int ret; -+ -+ ret = balance_one(rq, prev, rf, true); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When core-sched is enabled, this ops.balance() call will be followed -+ * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() -+ * on the SMT siblings. Balance the siblings too. -+ */ -+ if (sched_core_enabled(rq)) { -+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); -+ int scpu; -+ -+ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { -+ struct rq *srq = cpu_rq(scpu); -+ struct rq_flags srf; -+ struct task_struct *sprev = srq->curr; -+ -+ /* -+ * While core-scheduling, rq lock is shared among -+ * siblings but the debug annotations and rq clock -+ * aren't. Do pinning dance to transfer the ownership. -+ */ -+ WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); -+ rq_unpin_lock(rq, rf); -+ rq_pin_lock(srq, &srf); -+ -+ update_rq_clock(srq); -+ balance_one(srq, sprev, &srf, false); -+ -+ rq_unpin_lock(srq, &srf); -+ rq_repin_lock(rq, rf); -+ } -+ } -+#endif -+ return ret; -+} -+ -+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) -+{ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ /* -+ * Core-sched might decide to execute @p before it is -+ * dispatched. Call ops_dequeue() to notify the BPF scheduler. 
-+ */ -+ ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); -+ dispatch_dequeue(rq, p); -+ } -+ -+ p->se.exec_start = rq_clock_task(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, running, p); -+ -+ clr_task_runnable(p, true); -+ -+ /* -+ * @p is getting newly scheduled or got kicked after someone updated its -+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). -+ */ -+ if ((p->scx.slice == SCX_SLICE_INF) != -+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { -+ if (p->scx.slice == SCX_SLICE_INF) -+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; -+ else -+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; -+ -+ sched_update_tick_dependency(rq); -+ -+ /* -+ * For now, let's refresh the load_avgs just when transitioning -+ * in and out of nohz. In the future, we might want to add a -+ * mechanism which calls the following periodically on -+ * tick-stopped CPUs. -+ */ -+ update_other_load_avgs(rq); -+ } -+} -+ -+static void put_prev_task_scx(struct rq *rq, struct task_struct *p) -+{ -+#ifndef CONFIG_SMP -+ /* -+ * UP workaround. -+ * -+ * Because SCX may transfer tasks across CPUs during dispatch, dispatch -+ * is performed from its balance operation which isn't called in UP. -+ * Let's work around by calling it from the operations which come right -+ * after. -+ * -+ * 1. If the prev task is on SCX, pick_next_task() calls -+ * .put_prev_task() right after. As .put_prev_task() is also called -+ * from other places, we need to distinguish the calls which can be -+ * done by looking at the previous task's state - if still queued or -+ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). -+ * This case is handled here. -+ * -+ * 2. If the prev task is not on SCX, the first following call into SCX -+ * will be .pick_next_task(), which is covered by calling -+ * balance_scx() from pick_next_task_scx(). -+ * -+ * Note that we can't merge the first case into the second as -+ * balance_scx() must be called before the previous SCX task goes -+ * through put_prev_task_scx(). -+ * -+ * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. -+ * Pass in %NULL. -+ */ -+ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) -+ balance_scx(rq, p, NULL); -+#endif -+ -+ update_curr_scx(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); -+ -+ /* -+ * If we're being called from put_prev_task_balance(), balance_scx() may -+ * have decided that @p should keep running. -+ */ -+ if (p->scx.flags & SCX_TASK_BAL_KEEP) { -+ p->scx.flags &= ~SCX_TASK_BAL_KEEP; -+ set_task_runnable(rq, p); -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ set_task_runnable(rq, p); -+ -+ /* -+ * If @p has slice left and balance_scx() didn't tag it for -+ * keeping, @p is getting preempted by a higher priority -+ * scheduler class or core-sched forcing a different task. Leave -+ * it at the head of the local DSQ. -+ */ -+ if (p->scx.slice && !scx_ops_bypassing()) { -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ /* -+ * If we're in the pick_next_task path, balance_scx() should -+ * have already populated the local DSQ if there are any other -+ * available tasks. If empty, tell ops.enqueue() that @p is the -+ * only one available for this cpu. 
ops.enqueue() should put it -+ * on the local DSQ so that the subsequent pick_next_task_scx() -+ * can find the task unless it wants to trigger a separate -+ * follow-up scheduling event. -+ */ -+ if (list_empty(&rq->scx.local_dsq.list)) -+ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); -+ else -+ do_enqueue_task(rq, p, 0, -1); -+ } -+} -+ -+static struct task_struct *first_local_task(struct rq *rq) -+{ -+ return list_first_entry_or_null(&rq->scx.local_dsq.list, -+ struct task_struct, scx.dsq_node.list); -+} -+ -+static struct task_struct *pick_next_task_scx(struct rq *rq) -+{ -+ struct task_struct *p; -+ -+#ifndef CONFIG_SMP -+ /* UP workaround - see the comment at the head of put_prev_task_scx() */ -+ if (unlikely(rq->curr->sched_class != &ext_sched_class)) -+ balance_scx(rq, rq->curr, NULL); -+#endif -+ -+ p = first_local_task(rq); -+ if (!p) -+ return NULL; -+ -+ set_next_task_scx(rq, p, true); -+ -+ if (unlikely(!p->scx.slice)) { -+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) { -+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", -+ p->comm, p->pid); -+ scx_warned_zero_slice = true; -+ } -+ p->scx.slice = SCX_SLICE_DFL; -+ } -+ -+ return p; -+} -+ -+#ifdef CONFIG_SCHED_CORE -+/** -+ * scx_prio_less - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Core-sched is implemented as an additional scheduling layer on top of the -+ * usual sched_class'es and needs to find out the expected task ordering. For -+ * SCX, core-sched calls this function to interrogate the task ordering. -+ * -+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used -+ * to implement the default task ordering. The older the timestamp, the higher -+ * prority the task - the global FIFO ordering matching the default scheduling -+ * behavior. -+ * -+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to -+ * implement FIFO ordering within each local DSQ. See pick_task_scx(). -+ */ -+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, -+ bool in_fi) -+{ -+ /* -+ * The const qualifiers are dropped from task_struct pointers when -+ * calling ops.core_sched_before(). Accesses are controlled by the -+ * verifier. -+ */ -+ if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) -+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, -+ (struct task_struct *)a, -+ (struct task_struct *)b); -+ else -+ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); -+} -+ -+/** -+ * pick_task_scx - Pick a candidate task for core-sched -+ * @rq: rq to pick the candidate task from -+ * -+ * Core-sched calls this function on each SMT sibling to determine the next -+ * tasks to run on the SMT siblings. balance_one() has been called on all -+ * siblings and put_prev_task_scx() has been called only for the current CPU. -+ * -+ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look -+ * at the first task in the local dsq. @rq->curr has to be considered explicitly -+ * to mimic %SCX_TASK_BAL_KEEP. -+ */ -+static struct task_struct *pick_task_scx(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ struct task_struct *first = first_local_task(rq); -+ -+ if (curr->scx.flags & SCX_TASK_QUEUED) { -+ /* is curr the only runnable task? */ -+ if (!first) -+ return curr; -+ -+ /* -+ * Does curr trump first? 
We can always go by core_sched_at for -+ * this comparison as it represents global FIFO ordering when -+ * the default core-sched ordering is used and local-DSQ FIFO -+ * ordering otherwise. -+ * -+ * We can have a task with an earlier timestamp on the DSQ. For -+ * example, when a current task is preempted by a sibling -+ * picking a different cookie, the task would be requeued at the -+ * head of the local DSQ with an earlier timestamp than the -+ * core-sched picked next task. Besides, the BPF scheduler may -+ * dispatch any tasks to the local DSQ anytime. -+ */ -+ if (curr->scx.slice && time_before64(curr->scx.core_sched_at, -+ first->scx.core_sched_at)) -+ return curr; -+ } -+ -+ return first; /* this may be %NULL */ -+} -+#endif /* CONFIG_SCHED_CORE */ -+ -+static enum scx_cpu_preempt_reason -+preempt_reason_from_class(const struct sched_class *class) -+{ -+#ifdef CONFIG_SMP -+ if (class == &stop_sched_class) -+ return SCX_CPU_PREEMPT_STOP; -+#endif -+ if (class == &dl_sched_class) -+ return SCX_CPU_PREEMPT_DL; -+ if (class == &rt_sched_class) -+ return SCX_CPU_PREEMPT_RT; -+ return SCX_CPU_PREEMPT_UNKNOWN; -+} -+ -+void scx_next_task_picked(struct rq *rq, struct task_struct *p, -+ const struct sched_class *active) -+{ -+ lockdep_assert_rq_held(rq); -+ -+ if (!scx_enabled()) -+ return; -+#ifdef CONFIG_SMP -+ /* -+ * Pairs with the smp_load_acquire() issued by a CPU in -+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a -+ * resched. -+ */ -+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -+#endif -+ if (!static_branch_unlikely(&scx_ops_cpu_preempt)) -+ return; -+ -+ /* -+ * The callback is conceptually meant to convey that the CPU is no -+ * longer under the control of SCX. Therefore, don't invoke the -+ * callback if the CPU is is staying on SCX, or going idle (in which -+ * case the SCX scheduler has actively decided not to schedule any -+ * tasks on the CPU). -+ */ -+ if (likely(active >= &ext_sched_class)) -+ return; -+ -+ /* -+ * At this point we know that SCX was preempted by a higher priority -+ * sched_class, so invoke the ->cpu_release() callback if we have not -+ * done so already. We only send the callback once between SCX being -+ * preempted, and it regaining control of the CPU. -+ * -+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the -+ * next time that balance_scx() is invoked. -+ */ -+ if (!rq->scx.cpu_released) { -+ if (SCX_HAS_OP(cpu_release)) { -+ struct scx_cpu_release_args args = { -+ .reason = preempt_reason_from_class(active), -+ .task = p, -+ }; -+ -+ SCX_CALL_OP(SCX_KF_CPU_RELEASE, -+ cpu_release, cpu_of(rq), &args); -+ } -+ rq->scx.cpu_released = true; -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+static bool test_and_clear_cpu_idle(int cpu) -+{ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT -+ * cluster is not wholly idle either way. This also prevents -+ * scx_pick_idle_cpu() from getting caught in an infinite loop. -+ */ -+ if (sched_smt_active()) { -+ const struct cpumask *smt = cpu_smt_mask(cpu); -+ -+ /* -+ * If offline, @cpu is not its own sibling and -+ * scx_pick_idle_cpu() can get caught in an infinite loop as -+ * @cpu is never cleared from idle_masks.smt. Ensure that @cpu -+ * is eventually cleared. 
-+ */ -+ if (cpumask_intersects(smt, idle_masks.smt)) -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ else if (cpumask_test_cpu(cpu, idle_masks.smt)) -+ __cpumask_clear_cpu(cpu, idle_masks.smt); -+ } -+#endif -+ return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -+} -+ -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ int cpu; -+ -+retry: -+ if (sched_smt_active()) { -+ cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ goto found; -+ -+ if (flags & SCX_PICK_IDLE_CORE) -+ return -EBUSY; -+ } -+ -+ cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); -+ if (cpu >= nr_cpu_ids) -+ return -EBUSY; -+ -+found: -+ if (test_and_clear_cpu_idle(cpu)) -+ return cpu; -+ else -+ goto retry; -+} -+ -+static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags, bool *found) -+{ -+ s32 cpu; -+ -+ *found = false; -+ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return prev_cpu; -+ } -+ -+ /* -+ * If WAKE_SYNC, the waker's local DSQ is empty, and the system is -+ * under utilized, wake up @p to the local DSQ of the waker. Checking -+ * only for an empty local DSQ is insufficient as it could give the -+ * wakee an unfair advantage when the system is oversaturated. -+ * Checking only for the presence of idle CPUs is also insufficient as -+ * the local DSQ of the waker could have tasks piled up on it even if -+ * there is an idle core elsewhere on the system. -+ */ -+ cpu = smp_processor_id(); -+ if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && -+ !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && -+ cpu_rq(cpu)->scx.local_dsq.nr == 0) { -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) -+ goto cpu_found; -+ } -+ -+ if (p->nr_cpus_allowed == 1) { -+ if (test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } else { -+ return prev_cpu; -+ } -+ } -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if (sched_smt_active()) { -+ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && -+ test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) -+ goto cpu_found; -+ } -+ -+ if (test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ goto cpu_found; -+ -+ return prev_cpu; -+ -+cpu_found: -+ *found = true; -+ return cpu; -+} -+ -+static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) -+{ -+ /* -+ * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it -+ * can be a good migration opportunity with low cache and memory -+ * footprint. Returning a CPU different than @prev_cpu triggers -+ * immediate rq migration. However, for SCX, as the current rq -+ * association doesn't dictate where the task is going to run, this -+ * doesn't fit well. If necessary, we can later add a dedicated method -+ * which can decide to preempt self to force it through the regular -+ * scheduling path. 
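/*
 * Illustrative aside (not kernel code): the preference cascade applied by
 * scx_select_cpu_dfl() above when SMT is active, sketched as a user-space
 * decision function. The idle_view fields are hypothetical stand-ins for the
 * idle_masks queries; WAKE_SYNC handling and affinity checks are omitted. Only
 * the ordering is the point: fully idle previous core, then any fully idle
 * core, then the idle previous CPU, then any idle CPU, else stay on prev_cpu.
 */
#include <stdbool.h>
#include <stdio.h>

struct idle_view {
    bool prev_core_idle;    /* prev_cpu's whole SMT core is idle */
    bool any_core_idle;     /* some allowed CPU has a fully idle core */
    bool prev_cpu_idle;     /* prev_cpu itself is idle */
    bool any_cpu_idle;      /* some allowed CPU is idle */
};

static const char *select_cpu_dfl(const struct idle_view *v)
{
    if (v->prev_core_idle)
        return "prev_cpu (whole core idle)";
    if (v->any_core_idle)
        return "some fully idle core";
    if (v->prev_cpu_idle)
        return "prev_cpu (sibling busy)";
    if (v->any_cpu_idle)
        return "some idle CPU";
    return "prev_cpu (nothing idle, no forced migration)";
}

int main(void)
{
    struct idle_view v = { .any_core_idle = true, .prev_cpu_idle = true };

    puts(select_cpu_dfl(&v));   /* prefers a fully idle core over idle prev_cpu */
    return 0;
}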
-+ */ -+ if (unlikely(wake_flags & WF_EXEC)) -+ return prev_cpu; -+ -+ if (SCX_HAS_OP(select_cpu)) { -+ s32 cpu; -+ struct task_struct **ddsp_taskp; -+ -+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); -+ WARN_ON_ONCE(*ddsp_taskp); -+ *ddsp_taskp = p; -+ -+ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, -+ select_cpu, p, prev_cpu, wake_flags); -+ *ddsp_taskp = NULL; -+ if (ops_cpu_valid(cpu, "from ops.select_cpu()")) -+ return cpu; -+ else -+ return prev_cpu; -+ } else { -+ bool found; -+ s32 cpu; -+ -+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); -+ if (found) { -+ p->scx.slice = SCX_SLICE_DFL; -+ p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; -+ } -+ return cpu; -+ } -+} -+ -+static void set_cpus_allowed_scx(struct task_struct *p, -+ struct affinity_context *ac) -+{ -+ set_cpus_allowed_common(p, ac); -+ -+ /* -+ * The effective cpumask is stored in @p->cpus_ptr which may temporarily -+ * differ from the configured one in @p->cpus_mask. Always tell the bpf -+ * scheduler the effective one. -+ * -+ * Fine-grained memory write control is enforced by BPF making the const -+ * designation pointless. Cast it away when calling the operation. -+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void reset_idle_masks(void) -+{ -+ /* -+ * Consider all online cpus idle. Should converge to the actual state -+ * quickly. -+ */ -+ cpumask_copy(idle_masks.cpu, cpu_online_mask); -+ cpumask_copy(idle_masks.smt, cpu_online_mask); -+} -+ -+void __scx_update_idle(struct rq *rq, bool idle) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (SCX_HAS_OP(update_idle)) { -+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); -+ if (!static_branch_unlikely(&scx_builtin_idle_enabled)) -+ return; -+ } -+ -+ if (idle) -+ cpumask_set_cpu(cpu, idle_masks.cpu); -+ else -+ cpumask_clear_cpu(cpu, idle_masks.cpu); -+ -+#ifdef CONFIG_SCHED_SMT -+ if (sched_smt_active()) { -+ const struct cpumask *smt = cpu_smt_mask(cpu); -+ -+ if (idle) { -+ /* -+ * idle_masks.smt handling is racy but that's fine as -+ * it's only for optimization and self-correcting. -+ */ -+ for_each_cpu(cpu, smt) { -+ if (!cpumask_test_cpu(cpu, idle_masks.cpu)) -+ return; -+ } -+ cpumask_or(idle_masks.smt, idle_masks.smt, smt); -+ } else { -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ } -+ } -+#endif -+} -+ -+static void handle_hotplug(struct rq *rq, bool online) -+{ -+ int cpu = cpu_of(rq); -+ -+ atomic_long_inc(&scx_hotplug_seq); -+ -+ if (online && SCX_HAS_OP(cpu_online)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu); -+ else if (!online && SCX_HAS_OP(cpu_offline)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu); -+ else -+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, -+ "cpu %d going %s, exiting scheduler", cpu, -+ online ? 
"online" : "offline"); -+} -+ -+static void rq_online_scx(struct rq *rq) -+{ -+ rq->scx.flags |= SCX_RQ_ONLINE; -+} -+ -+static void rq_offline_scx(struct rq *rq) -+{ -+ rq->scx.flags &= ~SCX_RQ_ONLINE; -+} -+ -+void scx_rq_activate(struct rq *rq) -+{ -+ handle_hotplug(rq, true); -+} -+ -+void scx_rq_deactivate(struct rq *rq) -+{ -+ handle_hotplug(rq, false); -+} -+ -+#else /* CONFIG_SMP */ -+ -+static bool test_and_clear_cpu_idle(int cpu) { return false; } -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -+static void reset_idle_masks(void) {} -+ -+#endif /* CONFIG_SMP */ -+ -+static bool check_rq_for_timeouts(struct rq *rq) -+{ -+ struct task_struct *p; -+ struct rq_flags rf; -+ bool timed_out = false; -+ -+ rq_lock_irqsave(rq, &rf); -+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { -+ unsigned long last_runnable = p->scx.runnable_at; -+ -+ if (unlikely(time_after(jiffies, -+ last_runnable + scx_watchdog_timeout))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "%s[%d] failed to run for %u.%03us", -+ p->comm, p->pid, -+ dur_ms / 1000, dur_ms % 1000); -+ timed_out = true; -+ break; -+ } -+ } -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return timed_out; -+} -+ -+static void scx_watchdog_workfn(struct work_struct *work) -+{ -+ int cpu; -+ -+ WRITE_ONCE(scx_watchdog_timestamp, jiffies); -+ -+ for_each_online_cpu(cpu) { -+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) -+ break; -+ -+ cond_resched(); -+ } -+ queue_delayed_work(system_unbound_wq, to_delayed_work(work), -+ scx_watchdog_timeout / 2); -+} -+ -+void scx_tick(struct rq *rq) -+{ -+ unsigned long last_check; -+ -+ if (!scx_enabled()) -+ return; -+ -+ last_check = READ_ONCE(scx_watchdog_timestamp); -+ if (unlikely(time_after(jiffies, -+ last_check + READ_ONCE(scx_watchdog_timeout)))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "watchdog failed to check in for %u.%03us", -+ dur_ms / 1000, dur_ms % 1000); -+ } -+ -+ update_other_load_avgs(rq); -+} -+ -+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) -+{ -+ update_curr_scx(rq); -+ -+ /* -+ * While disabling, always resched and refresh core-sched timestamp as -+ * we can't trust the slice management or ops.core_sched_before(). -+ */ -+ if (scx_ops_bypassing()) { -+ curr->scx.slice = 0; -+ touch_core_sched(rq, curr); -+ } else if (SCX_HAS_OP(tick)) { -+ SCX_CALL_OP(SCX_KF_REST, tick, curr); -+ } -+ -+ if (!curr->scx.slice) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static struct cgroup *tg_cgrp(struct task_group *tg) -+{ -+ /* -+ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, -+ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the -+ * root cgroup. 
-+ */ -+ if (tg && tg->css.cgroup) -+ return tg->css.cgroup; -+ else -+ return &cgrp_dfl_root.cgrp; -+} -+ -+#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), -+ -+#else /* CONFIG_EXT_GROUP_SCHED */ -+ -+#define SCX_INIT_TASK_ARGS_CGROUP(tg) -+ -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+static enum scx_task_state scx_get_task_state(const struct task_struct *p) -+{ -+ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; -+} -+ -+static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) -+{ -+ enum scx_task_state prev_state = scx_get_task_state(p); -+ bool warn = false; -+ -+ BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); -+ -+ switch (state) { -+ case SCX_TASK_NONE: -+ break; -+ case SCX_TASK_INIT: -+ warn = prev_state != SCX_TASK_NONE; -+ break; -+ case SCX_TASK_READY: -+ warn = prev_state == SCX_TASK_NONE; -+ break; -+ case SCX_TASK_ENABLED: -+ warn = prev_state != SCX_TASK_READY; -+ break; -+ default: -+ warn = true; -+ return; -+ } -+ -+ WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", -+ prev_state, state, p->comm, p->pid); -+ -+ p->scx.flags &= ~SCX_TASK_STATE_MASK; -+ p->scx.flags |= state << SCX_TASK_STATE_SHIFT; -+} -+ -+static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) -+{ -+ int ret; -+ -+ p->scx.disallow = false; -+ -+ if (SCX_HAS_OP(init_task)) { -+ struct scx_init_task_args args = { -+ SCX_INIT_TASK_ARGS_CGROUP(tg) -+ .fork = fork, -+ }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); -+ if (unlikely(ret)) { -+ ret = ops_sanitize_err("init_task", ret); -+ return ret; -+ } -+ } -+ -+ scx_set_task_state(p, SCX_TASK_INIT); -+ -+ if (p->scx.disallow) { -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ rq = task_rq_lock(p, &rf); -+ -+ /* -+ * We're either in fork or load path and @p->policy will be -+ * applied right after. Reverting @p->policy here and rejecting -+ * %SCHED_EXT transitions from scx_check_setscheduler() -+ * guarantees that if ops.init_task() sets @p->disallow, @p can -+ * never be in SCX. -+ */ -+ if (p->policy == SCHED_EXT) { -+ p->policy = SCHED_NORMAL; -+ atomic_long_inc(&scx_nr_rejected); -+ } -+ -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; -+ return 0; -+} -+ -+static void set_task_scx_weight(struct task_struct *p) -+{ -+ u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; -+ -+ p->scx.weight = sched_weight_to_cgroup(weight); -+} -+ -+static void scx_ops_enable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ /* -+ * Set the weight before calling ops.enable() so that the scheduler -+ * doesn't see a stale value if they inspect the task struct. 
-+ */ -+ set_task_scx_weight(p); -+ if (SCX_HAS_OP(enable)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); -+ scx_set_task_state(p, SCX_TASK_ENABLED); -+ -+ if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); -+} -+ -+static void scx_ops_disable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); -+ -+ if (SCX_HAS_OP(disable)) -+ SCX_CALL_OP(SCX_KF_REST, disable, p); -+ scx_set_task_state(p, SCX_TASK_READY); -+} -+ -+static void scx_ops_exit_task(struct task_struct *p) -+{ -+ struct scx_exit_task_args args = { -+ .cancelled = false, -+ }; -+ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ switch (scx_get_task_state(p)) { -+ case SCX_TASK_NONE: -+ return; -+ case SCX_TASK_INIT: -+ args.cancelled = true; -+ break; -+ case SCX_TASK_READY: -+ break; -+ case SCX_TASK_ENABLED: -+ scx_ops_disable_task(p); -+ break; -+ default: -+ WARN_ON_ONCE(true); -+ return; -+ } -+ -+ if (SCX_HAS_OP(exit_task)) -+ SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); -+ scx_set_task_state(p, SCX_TASK_NONE); -+} -+ -+void init_scx_entity(struct sched_ext_entity *scx) -+{ -+ /* -+ * init_idle() calls this function again after fork sequence is -+ * complete. Don't touch ->tasks_node as it's already linked. -+ */ -+ memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); -+ -+ INIT_LIST_HEAD(&scx->dsq_node.list); -+ RB_CLEAR_NODE(&scx->dsq_node.priq); -+ scx->sticky_cpu = -1; -+ scx->holding_cpu = -1; -+ INIT_LIST_HEAD(&scx->runnable_node); -+ scx->runnable_at = jiffies; -+ scx->ddsp_dsq_id = SCX_DSQ_INVALID; -+ scx->slice = SCX_SLICE_DFL; -+} -+ -+void scx_pre_fork(struct task_struct *p) -+{ -+ /* -+ * BPF scheduler enable/disable paths want to be able to iterate and -+ * update all tasks which can become complex when racing forks. As -+ * enable/disable are very cold paths, let's use a percpu_rwsem to -+ * exclude forks. -+ */ -+ percpu_down_read(&scx_fork_rwsem); -+} -+ -+int scx_fork(struct task_struct *p) -+{ -+ percpu_rwsem_assert_held(&scx_fork_rwsem); -+ -+ if (scx_enabled()) -+ return scx_ops_init_task(p, task_group(p), true); -+ else -+ return 0; -+} -+ -+void scx_post_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) { -+ scx_set_task_state(p, SCX_TASK_READY); -+ -+ /* -+ * Enable the task immediately if it's running on sched_ext. -+ * Otherwise, it'll be enabled in switching_to_scx() if and -+ * when it's ever configured to run with a SCHED_EXT policy. -+ */ -+ if (p->sched_class == &ext_sched_class) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ scx_ops_enable_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+ } -+ -+ spin_lock_irq(&scx_tasks_lock); -+ list_add_tail(&p->scx.tasks_node, &scx_tasks); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void scx_cancel_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) { -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ rq = task_rq_lock(p, &rf); -+ WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); -+ scx_ops_exit_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void sched_ext_free(struct task_struct *p) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&scx_tasks_lock, flags); -+ list_del_init(&p->scx.tasks_node); -+ spin_unlock_irqrestore(&scx_tasks_lock, flags); -+ -+ /* -+ * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> -+ * ENABLED transitions can't race us. Disable ops for @p. 
-+ */ -+ if (scx_get_task_state(p) != SCX_TASK_NONE) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ scx_ops_exit_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+} -+ -+static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ set_task_scx_weight(p); -+ if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); -+} -+ -+static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) -+{ -+} -+ -+static void switching_to_scx(struct rq *rq, struct task_struct *p) -+{ -+ scx_ops_enable_task(p); -+ -+ /* -+ * set_cpus_allowed_scx() is not called while @p is associated with a -+ * different scheduler class. Keep the BPF scheduler up-to-date. -+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void switched_from_scx(struct rq *rq, struct task_struct *p) -+{ -+ scx_ops_disable_task(p); -+} -+ -+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} -+static void switched_to_scx(struct rq *rq, struct task_struct *p) {} -+ -+int scx_check_setscheduler(struct task_struct *p, int policy) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ /* if disallow, reject transitioning into SCX */ -+ if (scx_enabled() && READ_ONCE(p->scx.disallow) && -+ p->policy != policy && policy == SCHED_EXT) -+ return -EACCES; -+ -+ return 0; -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+bool scx_can_stop_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (scx_ops_bypassing()) -+ return false; -+ -+ if (p->sched_class != &ext_sched_class) -+ return true; -+ -+ /* -+ * @rq can dispatch from different DSQs, so we can't tell whether it -+ * needs the tick or not by looking at nr_running. Allow stopping ticks -+ * iff the BPF scheduler indicated so. See set_next_task_scx(). 
-+ */ -+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; -+} -+#endif -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ -+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); -+ -+int scx_tg_online(struct task_group *tg) -+{ -+ int ret = 0; -+ -+ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_init)) { -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, -+ tg->css.cgroup, &args); -+ if (!ret) -+ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; -+ else -+ ret = ops_sanitize_err("cgroup_init", ret); -+ } else { -+ tg->scx_flags |= SCX_TG_ONLINE; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ret; -+} -+ -+void scx_tg_offline(struct task_group *tg) -+{ -+ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_exit, tg->css.cgroup); -+ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+int scx_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *p; -+ int ret; -+ -+ /* released in scx_finish/cancel_attach() */ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (!scx_enabled()) -+ return 0; -+ -+ cgroup_taskset_for_each(p, css, tset) { -+ struct cgroup *from = tg_cgrp(task_group(p)); -+ struct cgroup *to = tg_cgrp(css_tg(css)); -+ -+ WARN_ON_ONCE(p->scx.cgrp_moving_from); -+ -+ /* -+ * sched_move_task() omits identity migrations. Let's match the -+ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() -+ * always match one-to-one. -+ */ -+ if (from == to) -+ continue; -+ -+ if (SCX_HAS_OP(cgroup_prep_move)) { -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_prep_move, -+ p, from, css->cgroup); -+ if (ret) -+ goto err; -+ } -+ -+ p->scx.cgrp_moving_from = from; -+ } -+ -+ return 0; -+ -+err: -+ cgroup_taskset_for_each(p, css, tset) { -+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, -+ p->scx.cgrp_moving_from, css->cgroup); -+ p->scx.cgrp_moving_from = NULL; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ops_sanitize_err("cgroup_prep_move", ret); -+} -+ -+void scx_move_task(struct task_struct *p) -+{ -+ /* -+ * We're called from sched_move_task() which handles both cgroup and -+ * autogroup moves. Ignore the latter. -+ * -+ * Also ignore exiting tasks, because in the exit path tasks transition -+ * from the autogroup to the root group, so task_group_is_autogroup() -+ * alone isn't able to catch exiting autogroup tasks. This is safe for -+ * cgroup_move(), because cgroup migrations never happen for PF_EXITING -+ * tasks. -+ */ -+ if (p->flags & PF_EXITING || task_group_is_autogroup(task_group(p))) -+ return; -+ -+ if (!scx_enabled()) -+ return; -+ -+ /* -+ * @p must have ops.cgroup_prep_move() called on it and thus -+ * cgrp_moving_from set. 
-+ */ -+ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) -+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, -+ p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); -+ p->scx.cgrp_moving_from = NULL; -+} -+ -+void scx_cgroup_finish_attach(void) -+{ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *p; -+ -+ if (!scx_enabled()) -+ goto out_unlock; -+ -+ cgroup_taskset_for_each(p, css, tset) { -+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, -+ p->scx.cgrp_moving_from, css->cgroup); -+ p->scx.cgrp_moving_from = NULL; -+ } -+out_unlock: -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+void scx_group_set_weight(struct task_group *tg, unsigned long weight) -+{ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (tg->scx_weight != weight) { -+ if (SCX_HAS_OP(cgroup_set_weight)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_set_weight, -+ tg_cgrp(tg), weight); -+ tg->scx_weight = weight; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+static void scx_cgroup_lock(void) -+{ -+ percpu_down_write(&scx_cgroup_rwsem); -+} -+ -+static void scx_cgroup_unlock(void) -+{ -+ percpu_up_write(&scx_cgroup_rwsem); -+} -+ -+#else /* CONFIG_EXT_GROUP_SCHED */ -+ -+static inline void scx_cgroup_lock(void) {} -+static inline void scx_cgroup_unlock(void) {} -+ -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+/* -+ * Omitted operations: -+ * -+ * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task -+ * isn't tied to the CPU at that point. Preemption is implemented by resetting -+ * the victim task's slice to 0 and triggering reschedule on the target CPU. -+ * -+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. -+ * -+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of -+ * their current sched_class. Call them directly from sched core instead. -+ * -+ * - task_woken: Unnecessary. 
-+ */ -+DEFINE_SCHED_CLASS(ext) = { -+ .enqueue_task = enqueue_task_scx, -+ .dequeue_task = dequeue_task_scx, -+ .yield_task = yield_task_scx, -+ .yield_to_task = yield_to_task_scx, -+ -+ .wakeup_preempt = wakeup_preempt_scx, -+ -+ .pick_next_task = pick_next_task_scx, -+ -+ .put_prev_task = put_prev_task_scx, -+ .set_next_task = set_next_task_scx, -+ -+#ifdef CONFIG_SMP -+ .balance = balance_scx, -+ .select_task_rq = select_task_rq_scx, -+ .set_cpus_allowed = set_cpus_allowed_scx, -+ -+ .rq_online = rq_online_scx, -+ .rq_offline = rq_offline_scx, -+#endif -+ -+#ifdef CONFIG_SCHED_CORE -+ .pick_task = pick_task_scx, -+#endif -+ -+ .task_tick = task_tick_scx, -+ -+ .switching_to = switching_to_scx, -+ .switched_from = switched_from_scx, -+ .switched_to = switched_to_scx, -+ .reweight_task = reweight_task_scx, -+ .prio_changed = prio_changed_scx, -+ -+ .update_curr = update_curr_scx, -+ -+#ifdef CONFIG_UCLAMP_TASK -+ .uclamp_enabled = 1, -+#endif -+}; -+ -+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) -+{ -+ memset(dsq, 0, sizeof(*dsq)); -+ -+ raw_spin_lock_init(&dsq->lock); -+ INIT_LIST_HEAD(&dsq->list); -+ dsq->id = dsq_id; -+} -+ -+static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -+{ -+ struct scx_dispatch_q *dsq; -+ int ret; -+ -+ if (dsq_id & SCX_DSQ_FLAG_BUILTIN) -+ return ERR_PTR(-EINVAL); -+ -+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); -+ if (!dsq) -+ return ERR_PTR(-ENOMEM); -+ -+ init_dsq(dsq, dsq_id); -+ -+ ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, -+ dsq_hash_params); -+ if (ret) { -+ kfree(dsq); -+ return ERR_PTR(ret); -+ } -+ return dsq; -+} -+ -+static void free_dsq_irq_workfn(struct irq_work *irq_work) -+{ -+ struct llist_node *to_free = llist_del_all(&dsqs_to_free); -+ struct scx_dispatch_q *dsq, *tmp_dsq; -+ -+ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) -+ kfree_rcu(dsq, rcu); -+} -+ -+static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); -+ -+static void destroy_dsq(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ dsq = find_user_dsq(dsq_id); -+ if (!dsq) -+ goto out_unlock_rcu; -+ -+ raw_spin_lock_irqsave(&dsq->lock, flags); -+ -+ if (dsq->nr) { -+ scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", -+ dsq->id, dsq->nr); -+ goto out_unlock_dsq; -+ } -+ -+ if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) -+ goto out_unlock_dsq; -+ -+ /* -+ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from -+ * queueing more tasks. As this function can be called from anywhere, -+ * freeing is bounced through an irq work to avoid nesting RCU -+ * operations inside scheduler locks. -+ */ -+ dsq->id = SCX_DSQ_INVALID; -+ llist_add(&dsq->free_node, &dsqs_to_free); -+ irq_work_queue(&free_dsq_irq_work); -+ -+out_unlock_dsq: -+ raw_spin_unlock_irqrestore(&dsq->lock, flags); -+out_unlock_rcu: -+ rcu_read_unlock(); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void scx_cgroup_exit(void) -+{ -+ struct cgroup_subsys_state *css; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ /* -+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk -+ * cgroups and exit all the inited ones, all online cgroups are exited. 
-+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_post(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ -+ if (!(tg->scx_flags & SCX_TG_INITED)) -+ continue; -+ tg->scx_flags &= ~SCX_TG_INITED; -+ -+ if (!scx_ops.cgroup_exit) -+ continue; -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+} -+ -+static int scx_cgroup_init(void) -+{ -+ struct cgroup_subsys_state *css; -+ int ret; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ /* -+ * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk -+ * cgroups and init, all online cgroups are initialized. -+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_pre(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ if ((tg->scx_flags & -+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) -+ continue; -+ -+ if (!scx_ops.cgroup_init) { -+ tg->scx_flags |= SCX_TG_INITED; -+ continue; -+ } -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, -+ css->cgroup, &args); -+ if (ret) { -+ css_put(css); -+ return ret; -+ } -+ tg->scx_flags |= SCX_TG_INITED; -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+ -+ return 0; -+} -+ -+static void scx_cgroup_config_knobs(void) -+{ -+ static DEFINE_MUTEX(cgintf_mutex); -+ DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; -+ u64 knob_flags; -+ int i; -+ -+ /* -+ * Called from both class switch and ops enable/disable paths, -+ * synchronize internally. -+ */ -+ mutex_lock(&cgintf_mutex); -+ -+ /* if fair is in use, all knobs should be shown */ -+ if (!scx_switched_all()) { -+ bitmap_fill(mask, CPU_CFTYPE_CNT); -+ goto apply; -+ } -+ -+ /* -+ * On ext, only show the supported knobs. Otherwise, show all possible -+ * knobs so that configuration attempts succeed and the states are -+ * remembered while ops is not loaded. -+ */ -+ if (scx_enabled()) -+ knob_flags = scx_ops.flags; -+ else -+ knob_flags = SCX_OPS_ALL_FLAGS; -+ -+ if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) { -+ __set_bit(CPU_CFTYPE_WEIGHT, mask); -+ __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask); -+ } -+apply: -+ for (i = 0; i < CPU_CFTYPE_CNT; i++) -+ cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask)); -+ -+ mutex_unlock(&cgintf_mutex); -+} -+ -+#else -+static void scx_cgroup_exit(void) {} -+static int scx_cgroup_init(void) { return 0; } -+static void scx_cgroup_config_knobs(void) {} -+#endif -+ -+ -+/******************************************************************************** -+ * Sysfs interface and ops enable/disable. 
-+ */ -+ -+#define SCX_ATTR(_name) \ -+ static struct kobj_attribute scx_attr_##_name = { \ -+ .attr = { .name = __stringify(_name), .mode = 0444 }, \ -+ .show = scx_attr_##_name##_show, \ -+ } -+ -+static ssize_t scx_attr_state_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", -+ scx_ops_enable_state_str[scx_ops_enable_state()]); -+} -+SCX_ATTR(state); -+ -+static ssize_t scx_attr_switch_all_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); -+} -+SCX_ATTR(switch_all); -+ -+static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); -+} -+SCX_ATTR(nr_rejected); -+ -+static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); -+} -+SCX_ATTR(hotplug_seq); -+ -+static struct attribute *scx_global_attrs[] = { -+ &scx_attr_state.attr, -+ &scx_attr_switch_all.attr, -+ &scx_attr_nr_rejected.attr, -+ &scx_attr_hotplug_seq.attr, -+ NULL, -+}; -+ -+static const struct attribute_group scx_global_attr_group = { -+ .attrs = scx_global_attrs, -+}; -+ -+static void scx_kobj_release(struct kobject *kobj) -+{ -+ kfree(kobj); -+} -+ -+static ssize_t scx_attr_ops_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", scx_ops.name); -+} -+SCX_ATTR(ops); -+ -+static struct attribute *scx_sched_attrs[] = { -+ &scx_attr_ops.attr, -+ NULL, -+}; -+ATTRIBUTE_GROUPS(scx_sched); -+ -+static const struct kobj_type scx_ktype = { -+ .release = scx_kobj_release, -+ .sysfs_ops = &kobj_sysfs_ops, -+ .default_groups = scx_sched_groups, -+}; -+ -+static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) -+{ -+ return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); -+} -+ -+static const struct kset_uevent_ops scx_uevent_ops = { -+ .uevent = scx_uevent, -+}; -+ -+/* -+ * Used by sched_fork() and __setscheduler_prio() to pick the matching -+ * sched_class. dl/rt are already handled. -+ */ -+bool task_should_scx(struct task_struct *p) -+{ -+ if (!scx_enabled() || -+ unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) -+ return false; -+ if (READ_ONCE(scx_switching_all)) -+ return true; -+ return p->policy == SCHED_EXT; -+} -+ -+/** -+ * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress -+ * -+ * Bypassing guarantees that all runnable tasks make forward progress without -+ * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might -+ * be held by tasks that the BPF scheduler is forgetting to run, which -+ * unfortunately also excludes toggling the static branches. -+ * -+ * Let's work around by overriding a couple ops and modifying behaviors based on -+ * the DISABLING state and then cycling the queued tasks through dequeue/enqueue -+ * to force global FIFO scheduling. -+ * -+ * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. -+ * -+ * b. ops.dispatch() is ignored. -+ * -+ * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be -+ * trusted. Whenever a tick triggers, the running task is rotated to the tail -+ * of the queue with core_sched_at touched. -+ * -+ * d. pick_next_task() suppresses zero slice warning. -+ * -+ * e. 
scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM -+ * operations. -+ * -+ * f. scx_prio_less() reverts to the default core_sched_at order. -+ */ -+static void scx_ops_bypass(bool bypass) -+{ -+ int depth, cpu; -+ -+ if (bypass) { -+ depth = atomic_inc_return(&scx_ops_bypass_depth); -+ WARN_ON_ONCE(depth <= 0); -+ if (depth != 1) -+ return; -+ } else { -+ depth = atomic_dec_return(&scx_ops_bypass_depth); -+ WARN_ON_ONCE(depth < 0); -+ if (depth != 0) -+ return; -+ } -+ -+ /* -+ * We need to guarantee that no tasks are on the BPF scheduler while -+ * bypassing. Either we see enabled or the enable path sees the -+ * increased bypass_depth before moving tasks to SCX. -+ */ -+ if (!scx_enabled()) -+ return; -+ -+ /* -+ * No task property is changing. We just need to make sure all currently -+ * queued tasks are re-queued according to the new scx_ops_bypassing() -+ * state. As an optimization, walk each rq's runnable_list instead of -+ * the scx_tasks list. -+ * -+ * This function can't trust the scheduler and thus can't use -+ * cpus_read_lock(). Walk all possible CPUs instead of online. -+ */ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ struct task_struct *p, *n; -+ -+ rq_lock_irqsave(rq, &rf); -+ -+ /* -+ * The use of list_for_each_entry_safe_reverse() is required -+ * because each task is going to be removed from and added back -+ * to the runnable_list during iteration. Because they're added -+ * to the tail of the list, safe reverse iteration can still -+ * visit all nodes. -+ */ -+ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, -+ scx.runnable_node) { -+ struct sched_enq_and_set_ctx ctx; -+ -+ /* cycling deq/enq is enough, see the function comment */ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ sched_enq_and_set_task(&ctx); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+ -+ /* kick to restore ticks */ -+ resched_cpu(cpu); -+ } -+} -+ -+static void free_exit_info(struct scx_exit_info *ei) -+{ -+ kfree(ei->dump); -+ kfree(ei->msg); -+ kfree(ei->bt); -+ kfree(ei); -+} -+ -+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) -+{ -+ struct scx_exit_info *ei; -+ -+ ei = kzalloc(sizeof(*ei), GFP_KERNEL); -+ if (!ei) -+ return NULL; -+ -+ ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); -+ ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); -+ ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); -+ -+ if (!ei->bt || !ei->msg || !ei->dump) { -+ free_exit_info(ei); -+ return NULL; -+ } -+ -+ return ei; -+} -+ -+static const char *scx_exit_reason(enum scx_exit_kind kind) -+{ -+ switch (kind) { -+ case SCX_EXIT_UNREG: -+ return "Scheduler unregistered from user space"; -+ case SCX_EXIT_UNREG_BPF: -+ return "Scheduler unregistered from BPF"; -+ case SCX_EXIT_UNREG_KERN: -+ return "Scheduler unregistered from the main kernel"; -+ case SCX_EXIT_SYSRQ: -+ return "disabled by sysrq-S"; -+ case SCX_EXIT_ERROR: -+ return "runtime error"; -+ case SCX_EXIT_ERROR_BPF: -+ return "scx_bpf_error"; -+ case SCX_EXIT_ERROR_STALL: -+ return "runnable task stall"; -+ default: -+ return ""; -+ } -+} -+ -+static void scx_ops_disable_workfn(struct kthread_work *work) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ struct rhashtable_iter rht_iter; -+ struct scx_dispatch_q *dsq; -+ int i, kind; -+ -+ kind = atomic_read(&scx_exit_kind); -+ while (true) { -+ /* -+ * NONE indicates that a new scx_ops has been registered since -+ * disable was 
scheduled - don't kill the new ops. DONE -+ * indicates that the ops has already been disabled. -+ */ -+ if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) -+ return; -+ if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) -+ break; -+ } -+ ei->kind = kind; -+ ei->reason = scx_exit_reason(ei->kind); -+ -+ /* guarantee forward progress by bypassing scx_ops */ -+ scx_ops_bypass(true); -+ -+ switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { -+ case SCX_OPS_DISABLING: -+ WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); -+ break; -+ case SCX_OPS_DISABLED: -+ pr_warn("sched_ext: ops error detected without ops (%s)\n", -+ scx_exit_info->msg); -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ goto done; -+ default: -+ break; -+ } -+ -+ /* -+ * Here, every runnable task is guaranteed to make forward progress and -+ * we can safely use blocking synchronization constructs. Actually -+ * disable ops. -+ */ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ static_branch_disable(&__scx_switched_all); -+ WRITE_ONCE(scx_switching_all, false); -+ -+ /* -+ * Avoid racing against fork and cgroup changes. See scx_ops_enable() -+ * for explanation on the locking order. -+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ cpus_read_lock(); -+ scx_cgroup_lock(); -+ -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_init(&sti); -+ /* -+ * Invoke scx_ops_exit_task() on all non-idle tasks, including -+ * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount, -+ * we may not have invoked sched_ext_free() on them by the time a -+ * scheduler is disabled. We must therefore exit the task here, or we'd -+ * fail to invoke ops.exit_task(), as the scheduler will have been -+ * unloaded by the time the task is subsequently exited on the -+ * sched_ext_free() path. 
-+ */ -+ while ((p = scx_task_iter_next_locked(&sti, true))) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ -+ if (READ_ONCE(p->__state) != TASK_DEAD) { -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, -+ &ctx); -+ -+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } -+ scx_ops_exit_task(p); -+ } -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ /* no task is on scx, turn off all the switches and flush in-progress calls */ -+ static_branch_disable_cpuslocked(&__scx_ops_enabled); -+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) -+ static_branch_disable_cpuslocked(&scx_has_op[i]); -+ static_branch_disable_cpuslocked(&scx_ops_enq_last); -+ static_branch_disable_cpuslocked(&scx_ops_enq_exiting); -+ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); -+ synchronize_rcu(); -+ -+ scx_cgroup_exit(); -+ -+ scx_cgroup_unlock(); -+ cpus_read_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ if (ei->kind >= SCX_EXIT_ERROR) { -+ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); -+ -+ if (ei->msg[0] == '\0') -+ printk(KERN_ERR "sched_ext: %s\n", ei->reason); -+ else -+ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); -+ -+ stack_trace_print(ei->bt, ei->bt_len, 2); -+ } -+ -+ if (scx_ops.exit) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); -+ -+ cancel_delayed_work_sync(&scx_watchdog_work); -+ -+ /* -+ * Delete the kobject from the hierarchy eagerly in addition to just -+ * dropping a reference. Otherwise, if the object is deleted -+ * asynchronously, sysfs could observe an object of the same name still -+ * in the hierarchy when another scheduler is loaded. -+ */ -+ kobject_del(scx_root_kobj); -+ kobject_put(scx_root_kobj); -+ scx_root_kobj = NULL; -+ -+ memset(&scx_ops, 0, sizeof(scx_ops)); -+ -+ rhashtable_walk_enter(&dsq_hash, &rht_iter); -+ do { -+ rhashtable_walk_start(&rht_iter); -+ -+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) -+ destroy_dsq(dsq->id); -+ -+ rhashtable_walk_stop(&rht_iter); -+ } while (dsq == ERR_PTR(-EAGAIN)); -+ rhashtable_walk_exit(&rht_iter); -+ -+ free_percpu(scx_dsp_ctx); -+ scx_dsp_ctx = NULL; -+ scx_dsp_max_batch = 0; -+ -+ free_exit_info(scx_exit_info); -+ scx_exit_info = NULL; -+ -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ -+ scx_cgroup_config_knobs(); -+done: -+ scx_ops_bypass(false); -+} -+ -+static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); -+ -+static void schedule_scx_ops_disable_work(void) -+{ -+ struct kthread_worker *helper = READ_ONCE(scx_ops_helper); -+ -+ /* -+ * We may be called spuriously before the first bpf_sched_ext_reg(). If -+ * scx_ops_helper isn't set up yet, there's nothing to do. 
-+ */ -+ if (helper) -+ kthread_queue_work(helper, &scx_ops_disable_work); -+} -+ -+static void scx_ops_disable(enum scx_exit_kind kind) -+{ -+ int none = SCX_EXIT_NONE; -+ -+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) -+ kind = SCX_EXIT_ERROR; -+ -+ atomic_try_cmpxchg(&scx_exit_kind, &none, kind); -+ -+ schedule_scx_ops_disable_work(); -+} -+ -+static void dump_newline(struct seq_buf *s) -+{ -+ trace_sched_ext_dump(""); -+ -+ /* @s may be zero sized and seq_buf triggers WARN if so */ -+ if (s->size) -+ seq_buf_putc(s, '\n'); -+} -+ -+static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) -+{ -+ va_list args; -+ -+#ifdef CONFIG_TRACEPOINTS -+ if (trace_sched_ext_dump_enabled()) { -+ /* protected by scx_dump_state()::dump_lock */ -+ static char line_buf[SCX_EXIT_MSG_LEN]; -+ -+ va_start(args, fmt); -+ vscnprintf(line_buf, sizeof(line_buf), fmt, args); -+ va_end(args); -+ -+ trace_sched_ext_dump(line_buf); -+ } -+#endif -+ /* @s may be zero sized and seq_buf triggers WARN if so */ -+ if (s->size) { -+ va_start(args, fmt); -+ seq_buf_vprintf(s, fmt, args); -+ va_end(args); -+ -+ seq_buf_putc(s, '\n'); -+ } -+} -+ -+static void dump_stack_trace(struct seq_buf *s, const char *prefix, -+ const unsigned long *bt, unsigned int len) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < len; i++) -+ dump_line(s, "%s%pS", prefix, (void *)bt[i]); -+} -+ -+static void ops_dump_init(struct seq_buf *s, const char *prefix) -+{ -+ struct scx_dump_data *dd = &scx_dump_data; -+ -+ lockdep_assert_irqs_disabled(); -+ -+ dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ -+ dd->first = true; -+ dd->cursor = 0; -+ dd->s = s; -+ dd->prefix = prefix; -+} -+ -+static void ops_dump_flush(void) -+{ -+ struct scx_dump_data *dd = &scx_dump_data; -+ char *line = dd->buf.line; -+ -+ if (!dd->cursor) -+ return; -+ -+ /* -+ * There's something to flush and this is the first line. Insert a blank -+ * line to distinguish ops dump. -+ */ -+ if (dd->first) { -+ dump_newline(dd->s); -+ dd->first = false; -+ } -+ -+ /* -+ * There may be multiple lines in $line. Scan and emit each line -+ * separately. -+ */ -+ while (true) { -+ char *end = line; -+ char c; -+ -+ while (*end != '\n' && *end != '\0') -+ end++; -+ -+ /* -+ * If $line overflowed, it may not have newline at the end. -+ * Always emit with a newline. 
-+ */ -+ c = *end; -+ *end = '\0'; -+ dump_line(dd->s, "%s%s", dd->prefix, line); -+ if (c == '\0') -+ break; -+ -+ /* move to the next line */ -+ end++; -+ if (*end == '\0') -+ break; -+ line = end; -+ } -+ -+ dd->cursor = 0; -+} -+ -+static void ops_dump_exit(void) -+{ -+ ops_dump_flush(); -+ scx_dump_data.cpu = -1; -+} -+ -+static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, -+ struct task_struct *p, char marker) -+{ -+ static unsigned long bt[SCX_EXIT_BT_LEN]; -+ char dsq_id_buf[19] = "(n/a)"; -+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state); -+ unsigned int bt_len; -+ -+ if (p->scx.dsq) -+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", -+ (unsigned long long)p->scx.dsq->id); -+ -+ dump_newline(s); -+ dump_line(s, " %c%c %s[%d] %+ldms", -+ marker, task_state_to_char(p), p->comm, p->pid, -+ jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); -+ dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", -+ scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, -+ p->scx.dsq_node.flags, ops_state & SCX_OPSS_STATE_MASK, -+ ops_state >> SCX_OPSS_QSEQ_SHIFT); -+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", -+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, -+ p->scx.dsq_vtime); -+ dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); -+ -+ if (SCX_HAS_OP(dump_task)) { -+ ops_dump_init(s, " "); -+ SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); -+ ops_dump_exit(); -+ } -+ -+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); -+ if (bt_len) { -+ dump_newline(s); -+ dump_stack_trace(s, " ", bt, bt_len); -+ } -+} -+ -+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) -+{ -+ static DEFINE_SPINLOCK(dump_lock); -+ static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; -+ struct scx_dump_ctx dctx = { -+ .kind = ei->kind, -+ .exit_code = ei->exit_code, -+ .reason = ei->reason, -+ .at_ns = ktime_get_ns(), -+ .at_jiffies = jiffies, -+ }; -+ struct seq_buf s; -+ unsigned long flags; -+ char *buf; -+ int cpu; -+ -+ spin_lock_irqsave(&dump_lock, flags); -+ -+ seq_buf_init(&s, ei->dump, dump_len); -+ -+ if (ei->kind == SCX_EXIT_NONE) { -+ dump_line(&s, "Debug dump triggered by %s", ei->reason); -+ } else { -+ dump_line(&s, "%s[%d] triggered exit kind %d:", -+ current->comm, current->pid, ei->kind); -+ dump_line(&s, " %s (%s)", ei->reason, ei->msg); -+ dump_newline(&s); -+ dump_line(&s, "Backtrace:"); -+ dump_stack_trace(&s, " ", ei->bt, ei->bt_len); -+ } -+ -+ if (SCX_HAS_OP(dump)) { -+ ops_dump_init(&s, ""); -+ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); -+ ops_dump_exit(); -+ } -+ -+ dump_newline(&s); -+ dump_line(&s, "CPU states"); -+ dump_line(&s, "----------"); -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ struct task_struct *p; -+ struct seq_buf ns; -+ size_t avail, used; -+ bool idle; -+ -+ rq_lock(rq, &rf); -+ -+ idle = list_empty(&rq->scx.runnable_list) && -+ rq->curr->sched_class == &idle_sched_class; -+ -+ if (idle && !SCX_HAS_OP(dump_cpu)) -+ goto next; -+ -+ /* -+ * We don't yet know whether ops.dump_cpu() will produce output -+ * and we may want to skip the default CPU dump if it doesn't. -+ * Use a nested seq_buf to generate the standard dump so that we -+ * can decide whether to commit later. 
-+ */ -+ avail = seq_buf_get_buf(&s, &buf); -+ seq_buf_init(&ns, buf, avail); -+ -+ dump_newline(&ns); -+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", -+ cpu, rq->scx.nr_running, rq->scx.flags, -+ rq->scx.cpu_released, rq->scx.ops_qseq, -+ rq->scx.pnt_seq); -+ dump_line(&ns, " curr=%s[%d] class=%ps", -+ rq->curr->comm, rq->curr->pid, -+ rq->curr->sched_class); -+ if (!cpumask_empty(rq->scx.cpus_to_kick)) -+ dump_line(&ns, " cpus_to_kick : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_kick)); -+ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) -+ dump_line(&ns, " idle_to_kick : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); -+ if (!cpumask_empty(rq->scx.cpus_to_preempt)) -+ dump_line(&ns, " cpus_to_preempt: %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_preempt)); -+ if (!cpumask_empty(rq->scx.cpus_to_wait)) -+ dump_line(&ns, " cpus_to_wait : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_wait)); -+ -+ used = seq_buf_used(&ns); -+ if (SCX_HAS_OP(dump_cpu)) { -+ ops_dump_init(&ns, " "); -+ SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); -+ ops_dump_exit(); -+ } -+ -+ /* -+ * If idle && nothing generated by ops.dump_cpu(), there's -+ * nothing interesting. Skip. -+ */ -+ if (idle && used == seq_buf_used(&ns)) -+ goto next; -+ -+ /* -+ * $s may already have overflowed when $ns was created. If so, -+ * calling commit on it will trigger BUG. -+ */ -+ if (avail) { -+ seq_buf_commit(&s, seq_buf_used(&ns)); -+ if (seq_buf_has_overflowed(&ns)) -+ seq_buf_set_overflow(&s); -+ } -+ -+ if (rq->curr->sched_class == &ext_sched_class) -+ scx_dump_task(&s, &dctx, rq->curr, '*'); -+ -+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) -+ scx_dump_task(&s, &dctx, p, ' '); -+ next: -+ rq_unlock(rq, &rf); -+ } -+ -+ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) -+ memcpy(ei->dump + dump_len - sizeof(trunc_marker), -+ trunc_marker, sizeof(trunc_marker)); -+ -+ spin_unlock_irqrestore(&dump_lock, flags); -+} -+ -+static void scx_ops_error_irq_workfn(struct irq_work *irq_work) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ -+ if (ei->kind >= SCX_EXIT_ERROR) -+ scx_dump_state(ei, scx_ops.exit_dump_len); -+ -+ schedule_scx_ops_disable_work(); -+} -+ -+static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); -+ -+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, -+ s64 exit_code, -+ const char *fmt, ...) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ int none = SCX_EXIT_NONE; -+ va_list args; -+ -+ if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) -+ return; -+ -+ ei->exit_code = exit_code; -+ -+ if (kind >= SCX_EXIT_ERROR) -+ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); -+ -+ va_start(args, fmt); -+ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); -+ va_end(args); -+ -+ /* -+ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again -+ * in scx_ops_disable_workfn(). 
-+ */ -+ ei->kind = kind; -+ ei->reason = scx_exit_reason(ei->kind); -+ -+ irq_work_queue(&scx_ops_error_irq_work); -+} -+ -+static struct kthread_worker *scx_create_rt_helper(const char *name) -+{ -+ struct kthread_worker *helper; -+ -+ helper = kthread_create_worker(0, name); -+ if (helper) -+ sched_set_fifo(helper->task); -+ return helper; -+} -+ -+static void check_hotplug_seq(const struct sched_ext_ops *ops) -+{ -+ unsigned long long global_hotplug_seq; -+ -+ /* -+ * If a hotplug event has occurred between when a scheduler was -+ * initialized, and when we were able to attach, exit and notify user -+ * space about it. -+ */ -+ if (ops->hotplug_seq) { -+ global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); -+ if (ops->hotplug_seq != global_hotplug_seq) { -+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, -+ "expected hotplug seq %llu did not match actual %llu", -+ ops->hotplug_seq, global_hotplug_seq); -+ } -+ } -+} -+ -+static int validate_ops(const struct sched_ext_ops *ops) -+{ -+ /* -+ * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the -+ * ops.enqueue() callback isn't implemented. -+ */ -+ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { -+ scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) -+{ -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ unsigned long timeout; -+ int i, cpu, ret; -+ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ if (!scx_ops_helper) { -+ WRITE_ONCE(scx_ops_helper, -+ scx_create_rt_helper("sched_ext_ops_helper")); -+ if (!scx_ops_helper) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ } -+ -+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) { -+ ret = -EBUSY; -+ goto err_unlock; -+ } -+ -+ scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); -+ if (!scx_root_kobj) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ -+ scx_root_kobj->kset = scx_kset; -+ ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); -+ if (ret < 0) -+ goto err; -+ -+ scx_exit_info = alloc_exit_info(ops->exit_dump_len); -+ if (!scx_exit_info) { -+ ret = -ENOMEM; -+ goto err_del; -+ } -+ -+ /* -+ * Set scx_ops, transition to PREPPING and clear exit info to arm the -+ * disable path. Failure triggers full disabling from here on. -+ */ -+ scx_ops = *ops; -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != -+ SCX_OPS_DISABLED); -+ -+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE); -+ scx_warned_zero_slice = false; -+ -+ atomic_long_set(&scx_nr_rejected, 0); -+ -+ for_each_possible_cpu(cpu) -+ cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; -+ -+ /* -+ * Keep CPUs stable during enable so that the BPF scheduler can track -+ * online CPUs by watching ->on/offline_cpu() after ->init(). 
-+ */ -+ cpus_read_lock(); -+ -+ if (scx_ops.init) { -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init); -+ if (ret) { -+ ret = ops_sanitize_err("init", ret); -+ goto err_disable_unlock_cpus; -+ } -+ } -+ -+ for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) -+ if (((void (**)(void))ops)[i]) -+ static_branch_enable_cpuslocked(&scx_has_op[i]); -+ -+ cpus_read_unlock(); -+ -+ ret = validate_ops(ops); -+ if (ret) -+ goto err_disable; -+ -+ WARN_ON_ONCE(scx_dsp_ctx); -+ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; -+ scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, -+ scx_dsp_max_batch), -+ __alignof__(struct scx_dsp_ctx)); -+ if (!scx_dsp_ctx) { -+ ret = -ENOMEM; -+ goto err_disable; -+ } -+ -+ if (ops->timeout_ms) -+ timeout = msecs_to_jiffies(ops->timeout_ms); -+ else -+ timeout = SCX_WATCHDOG_MAX_TIMEOUT; -+ -+ WRITE_ONCE(scx_watchdog_timeout, timeout); -+ WRITE_ONCE(scx_watchdog_timestamp, jiffies); -+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work, -+ scx_watchdog_timeout / 2); -+ -+ /* -+ * Lock out forks, cgroup on/offlining and moves before opening the -+ * floodgate so that they don't wander into the operations prematurely. -+ * -+ * We don't need to keep the CPUs stable but static_branch_*() requires -+ * cpus_read_lock() and scx_cgroup_rwsem must nest inside -+ * cpu_hotplug_lock because of the following dependency chain: -+ * -+ * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem -+ * -+ * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use -+ * static_branch_*_cpuslocked(). -+ * -+ * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the -+ * following dependency chain: -+ * -+ * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock -+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ cpus_read_lock(); -+ scx_cgroup_lock(); -+ -+ check_hotplug_seq(ops); -+ -+ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) -+ if (((void (**)(void))ops)[i]) -+ static_branch_enable_cpuslocked(&scx_has_op[i]); -+ -+ if (ops->flags & SCX_OPS_ENQ_LAST) -+ static_branch_enable_cpuslocked(&scx_ops_enq_last); -+ -+ if (ops->flags & SCX_OPS_ENQ_EXITING) -+ static_branch_enable_cpuslocked(&scx_ops_enq_exiting); -+ if (scx_ops.cpu_acquire || scx_ops.cpu_release) -+ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); -+ -+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { -+ reset_idle_masks(); -+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); -+ } else { -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); -+ } -+ -+ /* -+ * All cgroups should be initialized before letting in tasks. cgroup -+ * on/offlining and task migrations are already locked out. -+ */ -+ ret = scx_cgroup_init(); -+ if (ret) -+ goto err_disable_unlock_all; -+ -+ static_branch_enable_cpuslocked(&__scx_ops_enabled); -+ -+ /* -+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem -+ * preventing new tasks from being added. No need to exclude tasks -+ * leaving as sched_ext_free() can handle both prepped and enabled -+ * tasks. Prep all tasks first and then enable them with preemption -+ * disabled. 
-+ */ -+ spin_lock_irq(&scx_tasks_lock); -+ -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_locked(&sti, false))) { -+ get_task_struct(p); -+ scx_task_iter_rq_unlock(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ ret = scx_ops_init_task(p, task_group(p), false); -+ if (ret) { -+ put_task_struct(p); -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", -+ ret, p->comm, p->pid); -+ goto err_disable_unlock_all; -+ } -+ -+ put_task_struct(p); -+ spin_lock_irq(&scx_tasks_lock); -+ } -+ scx_task_iter_exit(&sti); -+ -+ /* -+ * All tasks are prepped but are still ops-disabled. Ensure that -+ * %current can't be scheduled out and switch everyone. -+ * preempt_disable() is necessary because we can't guarantee that -+ * %current won't be starved if scheduled out while switching. -+ */ -+ preempt_disable(); -+ -+ /* -+ * From here on, the disable path must assume that tasks have ops -+ * enabled and need to be recovered. -+ * -+ * Transition to ENABLING fails iff the BPF scheduler has already -+ * triggered scx_bpf_error(). Returning an error code here would lose -+ * the recorded error information. Exit indicating success so that the -+ * error is notified through ops.exit() with all the details. -+ */ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { -+ preempt_enable(); -+ spin_unlock_irq(&scx_tasks_lock); -+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); -+ ret = 0; -+ goto err_disable_unlock_all; -+ } -+ -+ /* -+ * We're fully committed and can't fail. The PREPPED -> ENABLED -+ * transitions here are synchronized against sched_ext_free() through -+ * scx_tasks_lock. -+ */ -+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); -+ -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_locked(&sti, false))) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ -+ scx_set_task_state(p, SCX_TASK_READY); -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } -+ scx_task_iter_exit(&sti); -+ -+ spin_unlock_irq(&scx_tasks_lock); -+ preempt_enable(); -+ scx_cgroup_unlock(); -+ cpus_read_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ /* see above ENABLING transition for the explanation on exiting with 0 */ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { -+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); -+ ret = 0; -+ goto err_disable; -+ } -+ -+ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) -+ static_branch_enable(&__scx_switched_all); -+ -+ kobject_uevent(scx_root_kobj, KOBJ_ADD); -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ scx_cgroup_config_knobs(); -+ -+ return 0; -+ -+err_del: -+ kobject_del(scx_root_kobj); -+err: -+ kobject_put(scx_root_kobj); -+ scx_root_kobj = NULL; -+ if (scx_exit_info) { -+ free_exit_info(scx_exit_info); -+ scx_exit_info = NULL; -+ } -+err_unlock: -+ mutex_unlock(&scx_ops_enable_mutex); -+ return ret; -+ -+err_disable_unlock_all: -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+err_disable_unlock_cpus: -+ cpus_read_unlock(); -+err_disable: -+ mutex_unlock(&scx_ops_enable_mutex); -+ /* must be fully disabled before returning */ -+ scx_ops_disable(SCX_EXIT_ERROR); -+ 
kthread_flush_work(&scx_ops_disable_work); -+ return ret; -+} -+ -+ -+/******************************************************************************** -+ * bpf_struct_ops plumbing. -+ */ -+#include -+#include -+#include -+ -+extern struct btf *btf_vmlinux; -+static const struct btf_type *task_struct_type; -+static u32 task_struct_type_id; -+ -+/* Make the 2nd argument of .dispatch a pointer that can be NULL. */ -+static bool promote_dispatch_2nd_arg(int off, int size, -+ enum bpf_access_type type, -+ const struct bpf_prog *prog, -+ struct bpf_insn_access_aux *info) -+{ -+ struct btf *btf = bpf_get_btf_vmlinux(); -+ const struct bpf_struct_ops_desc *st_ops_desc; -+ const struct btf_member *member; -+ const struct btf_type *t; -+ u32 btf_id, member_idx; -+ const char *mname; -+ -+ /* btf_id should be the type id of struct sched_ext_ops */ -+ btf_id = prog->aux->attach_btf_id; -+ st_ops_desc = bpf_struct_ops_find(btf, btf_id); -+ if (!st_ops_desc) -+ return false; -+ -+ /* BTF type of struct sched_ext_ops */ -+ t = st_ops_desc->type; -+ -+ member_idx = prog->expected_attach_type; -+ if (member_idx >= btf_type_vlen(t)) -+ return false; -+ -+ /* -+ * Get the member name of this struct_ops program, which corresponds to -+ * a field in struct sched_ext_ops. For example, the member name of the -+ * dispatch struct_ops program (callback) is "dispatch". -+ */ -+ member = &btf_type_member(t)[member_idx]; -+ mname = btf_name_by_offset(btf_vmlinux, member->name_off); -+ -+ /* -+ * Check if it is the second argument of the function pointer at -+ * "dispatch" in struct sched_ext_ops. The arguments of struct_ops -+ * operators are sequential and 64-bit, so the second argument is at -+ * offset sizeof(__u64). -+ */ -+ if (strcmp(mname, "dispatch") == 0 && -+ off == sizeof(__u64)) { -+ /* -+ * The value is a pointer to a type (struct task_struct) given -+ * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), -+ * however, can be a NULL (PTR_MAYBE_NULL). The BPF program -+ * should check the pointer to make sure it is not NULL before -+ * using it, or the verifier will reject the program. -+ * -+ * Longer term, this is something that should be addressed by -+ * BTF, and be fully contained within the verifier. 
-+ */ -+ info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; -+ info->btf = btf_vmlinux; -+ info->btf_id = task_struct_type_id; -+ -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool bpf_scx_is_valid_access(int off, int size, -+ enum bpf_access_type type, -+ const struct bpf_prog *prog, -+ struct bpf_insn_access_aux *info) -+{ -+ if (type != BPF_READ) -+ return false; -+ if (promote_dispatch_2nd_arg(off, size, type, prog, info)) -+ return true; -+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) -+ return false; -+ if (off % size != 0) -+ return false; -+ -+ return btf_ctx_access(off, size, type, prog, info); -+} -+ -+static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, -+ const struct bpf_reg_state *reg, int off, -+ int size) -+{ -+ const struct btf_type *t; -+ -+ t = btf_type_by_id(reg->btf, reg->btf_id); -+ if (t == task_struct_type) { -+ if (off >= offsetof(struct task_struct, scx.slice) && -+ off + size <= offsetofend(struct task_struct, scx.slice)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.dsq_vtime) && -+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.disallow) && -+ off + size <= offsetofend(struct task_struct, scx.disallow)) -+ return SCALAR_VALUE; -+ } -+ -+ return -EACCES; -+} -+ -+static const struct bpf_func_proto * -+bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -+{ -+ switch (func_id) { -+ case BPF_FUNC_task_storage_get: -+ return &bpf_task_storage_get_proto; -+ case BPF_FUNC_task_storage_delete: -+ return &bpf_task_storage_delete_proto; -+ default: -+ return bpf_base_func_proto(func_id, prog); -+ } -+} -+ -+static const struct bpf_verifier_ops bpf_scx_verifier_ops = { -+ .get_func_proto = bpf_scx_get_func_proto, -+ .is_valid_access = bpf_scx_is_valid_access, -+ .btf_struct_access = bpf_scx_btf_struct_access, -+}; -+ -+static int bpf_scx_init_member(const struct btf_type *t, -+ const struct btf_member *member, -+ void *kdata, const void *udata) -+{ -+ const struct sched_ext_ops *uops = udata; -+ struct sched_ext_ops *ops = kdata; -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ int ret; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, dispatch_max_batch): -+ if (*(u32 *)(udata + moff) > INT_MAX) -+ return -E2BIG; -+ ops->dispatch_max_batch = *(u32 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, flags): -+ if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) -+ return -EINVAL; -+ ops->flags = *(u64 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, name): -+ ret = bpf_obj_name_cpy(ops->name, uops->name, -+ sizeof(ops->name)); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ return -EINVAL; -+ return 1; -+ case offsetof(struct sched_ext_ops, timeout_ms): -+ if (msecs_to_jiffies(*(u32 *)(udata + moff)) > -+ SCX_WATCHDOG_MAX_TIMEOUT) -+ return -E2BIG; -+ ops->timeout_ms = *(u32 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, exit_dump_len): -+ ops->exit_dump_len = -+ *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; -+ return 1; -+ case offsetof(struct sched_ext_ops, hotplug_seq): -+ ops->hotplug_seq = *(u64 *)(udata + moff); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static int bpf_scx_check_member(const struct btf_type *t, -+ const struct btf_member *member, -+ const struct bpf_prog *prog) -+{ -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, 
init_task): -+#ifdef CONFIG_EXT_GROUP_SCHED -+ case offsetof(struct sched_ext_ops, cgroup_init): -+ case offsetof(struct sched_ext_ops, cgroup_exit): -+ case offsetof(struct sched_ext_ops, cgroup_prep_move): -+#endif -+ case offsetof(struct sched_ext_ops, cpu_online): -+ case offsetof(struct sched_ext_ops, cpu_offline): -+ case offsetof(struct sched_ext_ops, init): -+ case offsetof(struct sched_ext_ops, exit): -+ break; -+ default: -+ if (prog->sleepable) -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int bpf_scx_reg(void *kdata, struct bpf_link *link) -+{ -+ return scx_ops_enable(kdata, link); -+} -+ -+static void bpf_scx_unreg(void *kdata, struct bpf_link *link) -+{ -+ scx_ops_disable(SCX_EXIT_UNREG); -+ kthread_flush_work(&scx_ops_disable_work); -+} -+ -+static int bpf_scx_init(struct btf *btf) -+{ -+ u32 type_id; -+ -+ type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); -+ if (type_id < 0) -+ return -EINVAL; -+ task_struct_type = btf_type_by_id(btf, type_id); -+ task_struct_type_id = type_id; -+ -+ return 0; -+} -+ -+static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) -+{ -+ /* -+ * sched_ext does not support updating the actively-loaded BPF -+ * scheduler, as registering a BPF scheduler can always fail if the -+ * scheduler returns an error code for e.g. ops.init(), ops.init_task(), -+ * etc. Similarly, we can always race with unregistration happening -+ * elsewhere, such as with sysrq. -+ */ -+ return -EOPNOTSUPP; -+} -+ -+static int bpf_scx_validate(void *kdata) -+{ -+ return 0; -+} -+ -+static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } -+static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} -+static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} -+static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} -+static void runnable_stub(struct task_struct *p, u64 enq_flags) {} -+static void running_stub(struct task_struct *p) {} -+static void stopping_stub(struct task_struct *p, bool runnable) {} -+static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} -+static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } -+static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } -+static void set_weight_stub(struct task_struct *p, u32 weight) {} -+static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} -+static void update_idle_stub(s32 cpu, bool idle) {} -+static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} -+static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} -+static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } -+static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} -+static void enable_stub(struct task_struct *p) {} -+static void disable_stub(struct task_struct *p) {} -+#ifdef CONFIG_EXT_GROUP_SCHED -+static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } -+static void cgroup_exit_stub(struct cgroup *cgrp) {} -+static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } -+static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -+static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -+static void cgroup_set_weight_stub(struct 
cgroup *cgrp, u32 weight) {} -+#endif -+static void cpu_online_stub(s32 cpu) {} -+static void cpu_offline_stub(s32 cpu) {} -+static s32 init_stub(void) { return -EINVAL; } -+static void exit_stub(struct scx_exit_info *info) {} -+ -+static struct sched_ext_ops __bpf_ops_sched_ext_ops = { -+ .select_cpu = select_cpu_stub, -+ .enqueue = enqueue_stub, -+ .dequeue = dequeue_stub, -+ .dispatch = dispatch_stub, -+ .runnable = runnable_stub, -+ .running = running_stub, -+ .stopping = stopping_stub, -+ .quiescent = quiescent_stub, -+ .yield = yield_stub, -+ .core_sched_before = core_sched_before_stub, -+ .set_weight = set_weight_stub, -+ .set_cpumask = set_cpumask_stub, -+ .update_idle = update_idle_stub, -+ .cpu_acquire = cpu_acquire_stub, -+ .cpu_release = cpu_release_stub, -+ .init_task = init_task_stub, -+ .exit_task = exit_task_stub, -+ .enable = enable_stub, -+ .disable = disable_stub, -+#ifdef CONFIG_EXT_GROUP_SCHED -+ .cgroup_init = cgroup_init_stub, -+ .cgroup_exit = cgroup_exit_stub, -+ .cgroup_prep_move = cgroup_prep_move_stub, -+ .cgroup_move = cgroup_move_stub, -+ .cgroup_cancel_move = cgroup_cancel_move_stub, -+ .cgroup_set_weight = cgroup_set_weight_stub, -+#endif -+ .cpu_online = cpu_online_stub, -+ .cpu_offline = cpu_offline_stub, -+ .init = init_stub, -+ .exit = exit_stub, -+}; -+ -+static struct bpf_struct_ops bpf_sched_ext_ops = { -+ .verifier_ops = &bpf_scx_verifier_ops, -+ .reg = bpf_scx_reg, -+ .unreg = bpf_scx_unreg, -+ .check_member = bpf_scx_check_member, -+ .init_member = bpf_scx_init_member, -+ .init = bpf_scx_init, -+ .update = bpf_scx_update, -+ .validate = bpf_scx_validate, -+ .name = "sched_ext_ops", -+ .owner = THIS_MODULE, -+ .cfi_stubs = &__bpf_ops_sched_ext_ops -+}; -+ -+ -+/******************************************************************************** -+ * System integration and init. -+ */ -+ -+static void sysrq_handle_sched_ext_reset(u8 key) -+{ -+ if (scx_ops_helper) -+ scx_ops_disable(SCX_EXIT_SYSRQ); -+ else -+ pr_info("sched_ext: BPF scheduler not yet used\n"); -+} -+ -+static const struct sysrq_key_op sysrq_sched_ext_reset_op = { -+ .handler = sysrq_handle_sched_ext_reset, -+ .help_msg = "reset-sched-ext(S)", -+ .action_msg = "Disable sched_ext and revert all tasks to CFS", -+ .enable_mask = SYSRQ_ENABLE_RTNICE, -+}; -+ -+static void sysrq_handle_sched_ext_dump(u8 key) -+{ -+ struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; -+ -+ if (scx_enabled()) -+ scx_dump_state(&ei, 0); -+} -+ -+static const struct sysrq_key_op sysrq_sched_ext_dump_op = { -+ .handler = sysrq_handle_sched_ext_dump, -+ .help_msg = "dump-sched-ext(D)", -+ .action_msg = "Trigger sched_ext debug dump", -+ .enable_mask = SYSRQ_ENABLE_RTNICE, -+}; -+ -+static bool can_skip_idle_kick(struct rq *rq) -+{ -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * We can skip idle kicking if @rq is going to go through at least one -+ * full SCX scheduling cycle before going idle. Just checking whether -+ * curr is not idle is insufficient because we could be racing -+ * balance_one() trying to pull the next task from a remote rq, which -+ * may fail, and @rq may become idle afterwards. -+ * -+ * The race window is small and we don't and can't guarantee that @rq is -+ * only kicked while idle anyway. Skip only when sure. 
-+ */ -+ return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING); -+} -+ -+static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct scx_rq *this_scx = &this_rq->scx; -+ bool should_wait = false; -+ unsigned long flags; -+ -+ raw_spin_rq_lock_irqsave(rq, flags); -+ -+ /* -+ * During CPU hotplug, a CPU may depend on kicking itself to make -+ * forward progress. Allow kicking self regardless of online state. -+ */ -+ if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { -+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { -+ if (rq->curr->sched_class == &ext_sched_class) -+ rq->curr->scx.slice = 0; -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); -+ } -+ -+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { -+ pseqs[cpu] = rq->scx.pnt_seq; -+ should_wait = true; -+ } -+ -+ resched_curr(rq); -+ } else { -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); -+ } -+ -+ raw_spin_rq_unlock_irqrestore(rq, flags); -+ -+ return should_wait; -+} -+ -+static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_rq_lock_irqsave(rq, flags); -+ -+ if (!can_skip_idle_kick(rq) && -+ (cpu_online(cpu) || cpu == cpu_of(this_rq))) -+ resched_curr(rq); -+ -+ raw_spin_rq_unlock_irqrestore(rq, flags); -+} -+ -+static void kick_cpus_irq_workfn(struct irq_work *irq_work) -+{ -+ struct rq *this_rq = this_rq(); -+ struct scx_rq *this_scx = &this_rq->scx; -+ unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); -+ bool should_wait = false; -+ s32 cpu; -+ -+ for_each_cpu(cpu, this_scx->cpus_to_kick) { -+ should_wait |= kick_one_cpu(cpu, this_rq, pseqs); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); -+ } -+ -+ for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { -+ kick_one_cpu_if_idle(cpu, this_rq); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); -+ } -+ -+ if (!should_wait) -+ return; -+ -+ for_each_cpu(cpu, this_scx->cpus_to_wait) { -+ unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; -+ -+ if (cpu != cpu_of(this_rq)) { -+ /* -+ * Pairs with smp_store_release() issued by this CPU in -+ * scx_next_task_picked() on the resched path. -+ * -+ * We busy-wait here to guarantee that no other task can -+ * be scheduled on our core before the target CPU has -+ * entered the resched path. -+ */ -+ while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) -+ cpu_relax(); -+ } -+ -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); -+ } -+} -+ -+/** -+ * print_scx_info - print out sched_ext scheduler state -+ * @log_lvl: the log level to use when printing -+ * @p: target task -+ * -+ * If a sched_ext scheduler is enabled, print the name and state of the -+ * scheduler. If @p is on sched_ext, print further information about the task. -+ * -+ * This function can be safely called on any task as long as the task_struct -+ * itself is accessible. While safe, this function isn't synchronized and may -+ * print out mixups or garbages of limited length. -+ */ -+void print_scx_info(const char *log_lvl, struct task_struct *p) -+{ -+ enum scx_ops_enable_state state = scx_ops_enable_state(); -+ const char *all = READ_ONCE(scx_switching_all) ? 
"+all" : ""; -+ char runnable_at_buf[22] = "?"; -+ struct sched_class *class; -+ unsigned long runnable_at; -+ -+ if (state == SCX_OPS_DISABLED) -+ return; -+ -+ /* -+ * Carefully check if the task was running on sched_ext, and then -+ * carefully copy the time it's been runnable, and its state. -+ */ -+ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || -+ class != &ext_sched_class) { -+ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, -+ scx_ops_enable_state_str[state], all); -+ return; -+ } -+ -+ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, -+ sizeof(runnable_at))) -+ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", -+ jiffies_delta_msecs(runnable_at, jiffies)); -+ -+ /* print everything onto one line to conserve console space */ -+ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", -+ log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, -+ runnable_at_buf); -+} -+ -+static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) -+{ -+ /* -+ * SCX schedulers often have userspace components which are sometimes -+ * involved in critial scheduling paths. PM operations involve freezing -+ * userspace which can lead to scheduling misbehaviors including stalls. -+ * Let's bypass while PM operations are in progress. -+ */ -+ switch (event) { -+ case PM_HIBERNATION_PREPARE: -+ case PM_SUSPEND_PREPARE: -+ case PM_RESTORE_PREPARE: -+ scx_ops_bypass(true); -+ break; -+ case PM_POST_HIBERNATION: -+ case PM_POST_SUSPEND: -+ case PM_POST_RESTORE: -+ scx_ops_bypass(false); -+ break; -+ } -+ -+ return NOTIFY_OK; -+} -+ -+static struct notifier_block scx_pm_notifier = { -+ .notifier_call = scx_pm_handler, -+}; -+ -+void __init init_sched_ext_class(void) -+{ -+ s32 cpu, v; -+ -+ /* -+ * The following is to prevent the compiler from optimizing out the enum -+ * definitions so that BPF scheduler implementations can use them -+ * through the generated vmlinux.h. -+ */ -+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | -+ SCX_TG_ONLINE); -+ -+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); -+#ifdef CONFIG_SMP -+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); -+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -+#endif -+ scx_kick_cpus_pnt_seqs = -+ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, -+ __alignof__(scx_kick_cpus_pnt_seqs[0])); -+ BUG_ON(!scx_kick_cpus_pnt_seqs); -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); -+ INIT_LIST_HEAD(&rq->scx.runnable_list); -+ -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); -+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); -+ -+ if (cpu_online(cpu)) -+ cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; -+ } -+ -+ register_sysrq_key('S', &sysrq_sched_ext_reset_op); -+ register_sysrq_key('D', &sysrq_sched_ext_dump_op); -+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); -+ scx_cgroup_config_knobs(); -+} -+ -+ -+/******************************************************************************** -+ * Helpers that can be called from the BPF scheduler. 
-+ */ -+#include -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_create_dsq - Create a custom DSQ -+ * @dsq_id: DSQ to create -+ * @node: NUMA node to allocate from -+ * -+ * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(), -+ * ops.init_task(), ops.cgroup_init() and ops.cgroup_prep_move(). -+ */ -+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) -+{ -+ if (!scx_kf_allowed(SCX_KF_SLEEPABLE)) -+ return -EINVAL; -+ -+ if (unlikely(node >= (int)nr_node_ids || -+ (node < 0 && node != NUMA_NO_NODE))) -+ return -EINVAL; -+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_sleepable) -+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -+BTF_KFUNCS_END(scx_kfunc_ids_sleepable) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_sleepable, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() -+ * @p: task_struct to select a CPU for -+ * @prev_cpu: CPU @p was on previously -+ * @wake_flags: %SCX_WAKE_* flags -+ * @is_idle: out parameter indicating whether the returned CPU is idle -+ * -+ * Can only be called from ops.select_cpu() if the built-in CPU selection is -+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. -+ * @p, @prev_cpu and @wake_flags match ops.select_cpu(). -+ * -+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is -+ * currently idle and thus a good candidate for direct dispatching. -+ */ -+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags, bool *is_idle) -+{ -+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { -+ *is_idle = false; -+ return prev_cpu; -+ } -+#ifdef CONFIG_SMP -+ return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -+#else -+ *is_idle = false; -+ return prev_cpu; -+#endif -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_select_cpu, -+}; -+ -+static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) -+{ -+ if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) -+ return false; -+ -+ lockdep_assert_irqs_disabled(); -+ -+ if (unlikely(!p)) { -+ scx_ops_error("called with NULL task"); -+ return false; -+ } -+ -+ if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { -+ scx_ops_error("invalid enq_flags 0x%llx", enq_flags); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ struct task_struct *ddsp_task; -+ -+ ddsp_task = __this_cpu_read(direct_dispatch_task); -+ if (ddsp_task) { -+ mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); -+ return; -+ } -+ -+ if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { -+ scx_ops_error("dispatch buffer overflow"); -+ return; -+ } -+ -+ dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ -+ .task = p, -+ .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, -+ .dsq_id = dsq_id, -+ .enq_flags = enq_flags, -+ }; -+} -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch 
to -+ * @slice: duration @p can run for in nsecs -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe -+ * to call this function spuriously. Can be called from ops.enqueue(), -+ * ops.select_cpu(), and ops.dispatch(). -+ * -+ * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch -+ * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be -+ * used to target the local DSQ of a CPU other than the enqueueing one. Use -+ * ops.select_cpu() to be on the target CPU in the first place. -+ * -+ * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p -+ * will be directly dispatched to the corresponding dispatch queue after -+ * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be -+ * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). -+ * @enq_flags are OR'd with the enqueue flags on the enqueue path before the -+ * task is dispatched. -+ * -+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id -+ * and this function can be called upto ops.dispatch_max_batch times to dispatch -+ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the -+ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. -+ * -+ * This function doesn't have any locking restrictions and may be called under -+ * BPF locks (in the future when BPF introduces more flexible locking). -+ * -+ * @p is allowed to run for @slice. The scheduling path is triggered on slice -+ * exhaustion. If zero, the current residual slice is maintained. If -+ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with -+ * scx_bpf_kick_cpu() to trigger scheduling. -+ */ -+__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags); -+} -+ -+/** -+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs -+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. -+ * Tasks queued into the priority queue are ordered by @vtime and always -+ * consumed after the tasks in the FIFO queue. All other aspects are identical -+ * to scx_bpf_dispatch(). -+ * -+ * @vtime ordering is according to time_before64() which considers wrapping. A -+ * numerically larger vtime may indicate an earlier position in the ordering and -+ * vice-versa. 
-+ */ -+__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, -+ u64 slice, u64 vtime, u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ p->scx.dsq_vtime = vtime; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) -+BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_enqueue_dispatch, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots -+ * -+ * Can only be called from ops.dispatch(). -+ */ -+__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) -+{ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return 0; -+ -+ return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); -+} -+ -+/** -+ * scx_bpf_dispatch_cancel - Cancel the latest dispatch -+ * -+ * Cancel the latest dispatch. Can be called multiple times to cancel further -+ * dispatches. Can only be called from ops.dispatch(). -+ */ -+__bpf_kfunc void scx_bpf_dispatch_cancel(void) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return; -+ -+ if (dspc->cursor > 0) -+ dspc->cursor--; -+ else -+ scx_ops_error("dispatch buffer underflow"); -+} -+ -+/** -+ * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ -+ * @dsq_id: DSQ to consume -+ * -+ * Consume a task from the non-local DSQ identified by @dsq_id and transfer it -+ * to the current CPU's local DSQ for execution. Can only be called from -+ * ops.dispatch(). -+ * -+ * This function flushes the in-flight dispatches from scx_bpf_dispatch() before -+ * trying to consume the specified DSQ. It may also grab rq locks and thus can't -+ * be called under any BPF locks. -+ * -+ * Returns %true if a task has been consumed, %false if there isn't any task to -+ * consume. -+ */ -+__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ struct scx_dispatch_q *dsq; -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return false; -+ -+ flush_dispatch_buf(dspc->rq, dspc->rf); -+ -+ dsq = find_non_local_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); -+ return false; -+ } -+ -+ if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { -+ /* -+ * A successfully consumed task can be dequeued before it starts -+ * running while the CPU is trying to migrate other dispatched -+ * tasks. Bump nr_tasks to tell balance_scx() to retry on empty -+ * local DSQ. -+ */ -+ dspc->nr_tasks++; -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+/** -+ * __scx_bpf_consume_task - Transfer a task from DSQ iteration to the local DSQ -+ * @it: DSQ iterator in progress -+ * @p: task to consume -+ * -+ * Transfer @p which is on the DSQ currently iterated by @it to the current -+ * CPU's local DSQ. For the transfer to be successful, @p must still be on the -+ * DSQ and have been queued before the DSQ iteration started. This function -+ * doesn't care whether @p was obtained from the DSQ iteration. @p just has to -+ * be on the DSQ and have been queued before the iteration started. 
-+ * -+ * Returns %true if @p has been consumed, %false if @p had already been consumed -+ * or dequeued. -+ */ -+__bpf_kfunc bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ struct scx_dispatch_q *dsq, *kit_dsq; -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ struct rq *task_rq; -+ u64 kit_dsq_seq; -+ -+ /* can't trust @kit, carefully fetch the values we need */ -+ if (get_kernel_nofault(kit_dsq, &kit->dsq) || -+ get_kernel_nofault(kit_dsq_seq, &kit->dsq_seq)) { -+ scx_ops_error("invalid @it 0x%lx", it); -+ return false; -+ } -+ -+ /* -+ * @kit can't be trusted and we can only get the DSQ from @p. As we -+ * don't know @p's rq is locked, use READ_ONCE() to access the field. -+ * Derefing is safe as DSQs are RCU protected. -+ */ -+ dsq = READ_ONCE(p->scx.dsq); -+ -+ if (unlikely(dsq->id == SCX_DSQ_LOCAL)) { -+ scx_ops_error("local DSQ not allowed"); -+ return false; -+ } -+ -+ if (unlikely(!dsq || dsq != kit_dsq)) -+ return false; -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return false; -+ -+ flush_dispatch_buf(dspc->rq, dspc->rf); -+ -+ raw_spin_lock(&dsq->lock); -+ -+ /* -+ * Did someone else get to it? @p could have already left $dsq, got -+ * re-enqueud, or be in the process of being consumed by someone else. -+ */ -+ if (unlikely(p->scx.dsq != dsq || -+ time_after64(p->scx.dsq_seq, kit_dsq_seq) || -+ p->scx.holding_cpu >= 0)) -+ goto out_unlock; -+ -+ task_rq = task_rq(p); -+ -+ if (dspc->rq == task_rq) { -+ consume_local_task(dspc->rq, dsq, p); -+ return true; -+ } -+ -+ if (task_can_run_on_remote_rq(p, dspc->rq)) -+ return consume_remote_task(dspc->rq, dspc->rf, dsq, p, task_rq); -+ -+out_unlock: -+ raw_spin_unlock(&dsq->lock); -+ return false; -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) -+BTF_ID_FLAGS(func, scx_bpf_consume) -+BTF_ID_FLAGS(func, __scx_bpf_consume_task) -+BTF_KFUNCS_END(scx_kfunc_ids_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_dispatch, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ -+ * -+ * Iterate over all of the tasks currently enqueued on the local DSQ of the -+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of -+ * processed tasks. Can only be called from ops.cpu_release(). -+ */ -+__bpf_kfunc u32 scx_bpf_reenqueue_local(void) -+{ -+ u32 nr_enqueued, i; -+ struct rq *rq; -+ -+ if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) -+ return 0; -+ -+ rq = cpu_rq(smp_processor_id()); -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * Get the number of tasks on the local DSQ before iterating over it to -+ * pull off tasks. The enqueue callback below can signal that it wants -+ * the task to stay on the local DSQ, and we want to prevent the BPF -+ * scheduler from causing us to loop indefinitely. 
-+ */ -+ nr_enqueued = rq->scx.local_dsq.nr; -+ for (i = 0; i < nr_enqueued; i++) { -+ struct task_struct *p; -+ -+ p = first_local_task(rq); -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != -+ SCX_OPSS_NONE); -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ WARN_ON_ONCE(p->scx.holding_cpu != -1); -+ dispatch_dequeue(rq, p); -+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); -+ } -+ -+ return nr_enqueued; -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) -+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) -+BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_cpu_release, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU -+ * @cpu: cpu to kick -+ * @flags: %SCX_KICK_* flags -+ * -+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or -+ * trigger rescheduling on a busy CPU. This can be called from any online -+ * scx_ops operation and the actual kicking is performed asynchronously through -+ * an irq work. -+ */ -+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) -+{ -+ struct rq *this_rq; -+ unsigned long irq_flags; -+ -+ if (!ops_cpu_valid(cpu, NULL)) -+ return; -+ -+ /* -+ * While bypassing for PM ops, IRQ handling may not be online which can -+ * lead to irq_work_queue() malfunction such as infinite busy wait for -+ * IRQ status update. Suppress kicking. -+ */ -+ if (scx_ops_bypassing()) -+ return; -+ -+ local_irq_save(irq_flags); -+ -+ this_rq = this_rq(); -+ -+ /* -+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting -+ * rq locks. We can probably be smarter and avoid bouncing if called -+ * from ops which don't hold a rq lock. -+ */ -+ if (flags & SCX_KICK_IDLE) { -+ struct rq *target_rq = cpu_rq(cpu); -+ -+ if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) -+ scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); -+ -+ if (raw_spin_rq_trylock(target_rq)) { -+ if (can_skip_idle_kick(target_rq)) { -+ raw_spin_rq_unlock(target_rq); -+ goto out; -+ } -+ raw_spin_rq_unlock(target_rq); -+ } -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); -+ } else { -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); -+ -+ if (flags & SCX_KICK_PREEMPT) -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); -+ if (flags & SCX_KICK_WAIT) -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); -+ } -+ -+ irq_work_queue(&this_rq->scx.kick_cpus_irq_work); -+out: -+ local_irq_restore(irq_flags); -+} -+ -+/** -+ * scx_bpf_dsq_nr_queued - Return the number of queued tasks -+ * @dsq_id: id of the DSQ -+ * -+ * Return the number of tasks in the DSQ matching @dsq_id. If not found, -+ * -%ENOENT is returned. 
-+ */ -+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ s32 ret; -+ -+ preempt_disable(); -+ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ ret = READ_ONCE(this_rq()->scx.local_dsq.nr); -+ goto out; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (ops_cpu_valid(cpu, NULL)) { -+ ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); -+ goto out; -+ } -+ } else { -+ dsq = find_non_local_dsq(dsq_id); -+ if (dsq) { -+ ret = READ_ONCE(dsq->nr); -+ goto out; -+ } -+ } -+ ret = -ENOENT; -+out: -+ preempt_enable(); -+ return ret; -+} -+ -+/** -+ * scx_bpf_destroy_dsq - Destroy a custom DSQ -+ * @dsq_id: DSQ to destroy -+ * -+ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with -+ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is -+ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ -+ * which doesn't exist. Can be called from any online scx_ops operations. -+ */ -+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) -+{ -+ destroy_dsq(dsq_id); -+} -+ -+/** -+ * bpf_iter_scx_dsq_new - Create a DSQ iterator -+ * @it: iterator to initialize -+ * @dsq_id: DSQ to iterate -+ * @flags: %SCX_DSQ_ITER_* -+ * -+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk -+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes -+ * tasks which are already queued when this function is invoked. -+ */ -+__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, -+ u64 flags) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ -+ BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > -+ sizeof(struct bpf_iter_scx_dsq)); -+ BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != -+ __alignof__(struct bpf_iter_scx_dsq)); -+ -+ if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS) -+ return -EINVAL; -+ -+ kit->dsq = find_non_local_dsq(dsq_id); -+ if (!kit->dsq) -+ return -ENOENT; -+ -+ INIT_LIST_HEAD(&kit->cursor.list); -+ RB_CLEAR_NODE(&kit->cursor.priq); -+ kit->cursor.flags = SCX_TASK_DSQ_CURSOR; -+ kit->self = kit; -+ kit->dsq_seq = READ_ONCE(kit->dsq->seq); -+ kit->flags = flags; -+ -+ return 0; -+} -+ -+/** -+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator -+ * @it: iterator to progress -+ * -+ * Return the next task. See bpf_iter_scx_dsq_new(). -+ */ -+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ bool rev = kit->flags & SCX_DSQ_ITER_REV; -+ struct task_struct *p; -+ unsigned long flags; -+ -+ if (!kit->dsq) -+ return NULL; -+ -+ raw_spin_lock_irqsave(&kit->dsq->lock, flags); -+ -+ if (list_empty(&kit->cursor.list)) -+ p = NULL; -+ else -+ p = container_of(&kit->cursor, struct task_struct, scx.dsq_node); -+ -+ /* -+ * Only tasks which were queued before the iteration started are -+ * visible. This bounds BPF iterations and guarantees that vtime never -+ * jumps in the other direction while iterating. 
-+ */ -+ do { -+ p = nldsq_next_task(kit->dsq, p, rev); -+ } while (p && unlikely(time_after64(p->scx.dsq_seq, kit->dsq_seq))); -+ -+ if (p) { -+ if (rev) -+ list_move_tail(&kit->cursor.list, &p->scx.dsq_node.list); -+ else -+ list_move(&kit->cursor.list, &p->scx.dsq_node.list); -+ } else { -+ list_del_init(&kit->cursor.list); -+ } -+ -+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); -+ -+ return p; -+} -+ -+/** -+ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator -+ * @it: iterator to destroy -+ * -+ * Undo scx_iter_scx_dsq_new(). -+ */ -+__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ -+ if (!kit->dsq) -+ return; -+ -+ if (!list_empty(&kit->cursor.list)) { -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&kit->dsq->lock, flags); -+ list_del_init(&kit->cursor.list); -+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); -+ } -+ kit->dsq = NULL; -+} -+ -+__bpf_kfunc_end_defs(); -+ -+static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, -+ char *fmt, unsigned long long *data, u32 data__sz) -+{ -+ struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; -+ s32 ret; -+ -+ if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || -+ (data__sz && !data)) { -+ scx_ops_error("invalid data=%p and data__sz=%u", -+ (void *)data, data__sz); -+ return -EINVAL; -+ } -+ -+ ret = copy_from_kernel_nofault(data_buf, data, data__sz); -+ if (ret < 0) { -+ scx_ops_error("failed to read data fields (%d)", ret); -+ return ret; -+ } -+ -+ ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, -+ &bprintf_data); -+ if (ret < 0) { -+ scx_ops_error("format preparation failed (%d)", ret); -+ return ret; -+ } -+ -+ ret = bstr_printf(line_buf, line_size, fmt, -+ bprintf_data.bin_args); -+ bpf_bprintf_cleanup(&bprintf_data); -+ if (ret < 0) { -+ scx_ops_error("(\"%s\", %p, %u) failed to format", -+ fmt, data, data__sz); -+ return ret; -+ } -+ -+ return ret; -+} -+ -+static s32 bstr_format(struct scx_bstr_buf *buf, -+ char *fmt, unsigned long long *data, u32 data__sz) -+{ -+ return __bstr_format(buf->data, buf->line, sizeof(buf->line), -+ fmt, data, data__sz); -+} -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. -+ * @exit_code: Exit value to pass to user space via struct scx_exit_info. -+ * @fmt: error message format string -+ * @data: format string parameters packaged using ___bpf_fill() macro -+ * @data__sz: @data len, must end in '__sz' for the verifier -+ * -+ * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops -+ * disabling. -+ */ -+__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, -+ unsigned long long *data, u32 data__sz) -+{ -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); -+ if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) -+ scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", -+ scx_exit_bstr_buf.line); -+ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); -+} -+ -+/** -+ * scx_bpf_error_bstr - Indicate fatal error -+ * @fmt: error message format string -+ * @data: format string parameters packaged using ___bpf_fill() macro -+ * @data__sz: @data len, must end in '__sz' for the verifier -+ * -+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops -+ * disabling. 
-+ */ -+__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, -+ u32 data__sz) -+{ -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); -+ if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) -+ scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", -+ scx_exit_bstr_buf.line); -+ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); -+} -+ -+/** -+ * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler -+ * @fmt: format string -+ * @data: format string parameters packaged using ___bpf_fill() macro -+ * @data__sz: @data len, must end in '__sz' for the verifier -+ * -+ * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and -+ * dump_task() to generate extra debug dump specific to the BPF scheduler. -+ * -+ * The extra dump may be multiple lines. A single line may be split over -+ * multiple calls. The last line is automatically terminated. -+ */ -+__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, -+ u32 data__sz) -+{ -+ struct scx_dump_data *dd = &scx_dump_data; -+ struct scx_bstr_buf *buf = &dd->buf; -+ s32 ret; -+ -+ if (raw_smp_processor_id() != dd->cpu) { -+ scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); -+ return; -+ } -+ -+ /* append the formatted string to the line buf */ -+ ret = __bstr_format(buf->data, buf->line + dd->cursor, -+ sizeof(buf->line) - dd->cursor, fmt, data, data__sz); -+ if (ret < 0) { -+ dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", -+ dd->prefix, fmt, data, data__sz, ret); -+ return; -+ } -+ -+ dd->cursor += ret; -+ dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); -+ -+ if (!dd->cursor) -+ return; -+ -+ /* -+ * If the line buf overflowed or ends in a newline, flush it into the -+ * dump. This is to allow the caller to generate a single line over -+ * multiple calls. As ops_dump_flush() can also handle multiple lines in -+ * the line buf, the only case which can lead to an unexpected -+ * truncation is when the caller keeps generating newlines in the middle -+ * instead of the end consecutively. Don't do that. -+ */ -+ if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') -+ ops_dump_flush(); -+} -+ -+/** -+ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU -+ * @cpu: CPU of interest -+ * -+ * Return the maximum relative capacity of @cpu in relation to the most -+ * performant CPU in the system. The return value is in the range [1, -+ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). -+ */ -+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) -+{ -+ if (ops_cpu_valid(cpu, NULL)) -+ return arch_scale_cpu_capacity(cpu); -+ else -+ return SCX_CPUPERF_ONE; -+} -+ -+/** -+ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU -+ * @cpu: CPU of interest -+ * -+ * Return the current relative performance of @cpu in relation to its maximum. -+ * The return value is in the range [1, %SCX_CPUPERF_ONE]. -+ * -+ * The current performance level of a CPU in relation to the maximum performance -+ * available in the system can be calculated as follows: -+ * -+ * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE -+ * -+ * The result is in the range [1, %SCX_CPUPERF_ONE]. 
-+ */ -+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) -+{ -+ if (ops_cpu_valid(cpu, NULL)) -+ return arch_scale_freq_capacity(cpu); -+ else -+ return SCX_CPUPERF_ONE; -+} -+ -+/** -+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU -+ * @cpu: CPU of interest -+ * @perf: target performance level [0, %SCX_CPUPERF_ONE] -+ * @flags: %SCX_CPUPERF_* flags -+ * -+ * Set the target performance level of @cpu to @perf. @perf is in linear -+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the -+ * schedutil cpufreq governor chooses the target frequency. -+ * -+ * The actual performance level chosen, CPU grouping, and the overhead and -+ * latency of the operations are dependent on the hardware and cpufreq driver in -+ * use. Consult hardware and cpufreq documentation for more information. The -+ * current performance level can be monitored using scx_bpf_cpuperf_cur(). -+ */ -+__bpf_kfunc void scx_bpf_cpuperf_set(u32 cpu, u32 perf) -+{ -+ if (unlikely(perf > SCX_CPUPERF_ONE)) { -+ scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); -+ return; -+ } -+ -+ if (ops_cpu_valid(cpu, NULL)) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->scx.cpuperf_target = perf; -+ -+ rcu_read_lock_sched_notrace(); -+ cpufreq_update_util(cpu_rq(cpu), 0); -+ rcu_read_unlock_sched_notrace(); -+ } -+} -+ -+/** -+ * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs -+ * -+ * All valid CPU IDs in the system are smaller than the returned value. -+ */ -+__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) -+{ -+ return nr_cpu_ids; -+} -+ -+/** -+ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask -+ */ -+__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) -+{ -+ return cpu_possible_mask; -+} -+ -+/** -+ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask -+ */ -+__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) -+{ -+ return cpu_online_mask; -+} -+ -+/** -+ * scx_bpf_put_cpumask - Release a possible/online cpumask -+ * @cpumask: cpumask to release -+ */ -+__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) -+{ -+ /* -+ * Empty function body because we aren't actually acquiring or releasing -+ * a reference to a global cpumask, which is read-only in the caller and -+ * is never released. The acquire / release semantics here are just used -+ * to make the cpumask is a trusted pointer in the caller. -+ */ -+} -+ -+/** -+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking -+ * per-CPU cpumask. -+ * -+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel. -+ */ -+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return cpu_none_mask; -+ } -+ -+#ifdef CONFIG_SMP -+ return idle_masks.cpu; -+#else -+ return cpu_none_mask; -+#endif -+} -+ -+/** -+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, -+ * per-physical-core cpumask. Can be used to determine if an entire physical -+ * core is free. -+ * -+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel. 
-+ */ -+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return cpu_none_mask; -+ } -+ -+#ifdef CONFIG_SMP -+ if (sched_smt_active()) -+ return idle_masks.smt; -+ else -+ return idle_masks.cpu; -+#else -+ return cpu_none_mask; -+#endif -+} -+ -+/** -+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to -+ * either the percpu, or SMT idle-tracking cpumask. -+ */ -+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -+{ -+ /* -+ * Empty function body because we aren't actually acquiring or releasing -+ * a reference to a global idle cpumask, which is read-only in the -+ * caller and is never released. The acquire / release semantics here -+ * are just used to make the cpumask a trusted pointer in the caller. -+ */ -+} -+ -+/** -+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state -+ * @cpu: cpu to test and clear idle for -+ * -+ * Returns %true if @cpu was idle and its idle state was successfully cleared. -+ * %false otherwise. -+ * -+ * Unavailable if ops.update_idle() is implemented and -+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. -+ */ -+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return false; -+ } -+ -+ if (ops_cpu_valid(cpu, NULL)) -+ return test_and_clear_cpu_idle(cpu); -+ else -+ return false; -+} -+ -+/** -+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu -+ * @cpus_allowed: Allowed cpumask -+ * @flags: %SCX_PICK_IDLE_CPU_* flags -+ * -+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu -+ * number on success. -%EBUSY if no matching cpu was found. -+ * -+ * Idle CPU tracking may race against CPU scheduling state transitions. For -+ * example, this function may return -%EBUSY as CPUs are transitioning into the -+ * idle state. If the caller then assumes that there will be dispatch events on -+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs -+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and -+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch -+ * event in the near future. -+ * -+ * Unavailable if ops.update_idle() is implemented and -+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. -+ */ -+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, -+ u64 flags) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return -EBUSY; -+ } -+ -+ return scx_pick_idle_cpu(cpus_allowed, flags); -+} -+ -+/** -+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU -+ * @cpus_allowed: Allowed cpumask -+ * @flags: %SCX_PICK_IDLE_CPU_* flags -+ * -+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any -+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu -+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is -+ * empty. -+ * -+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not -+ * set, this function can't tell which CPUs are idle and will always pick any -+ * CPU. 
-+ */ -+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, -+ u64 flags) -+{ -+ s32 cpu; -+ -+ if (static_branch_likely(&scx_builtin_idle_enabled)) { -+ cpu = scx_pick_idle_cpu(cpus_allowed, flags); -+ if (cpu >= 0) -+ return cpu; -+ } -+ -+ cpu = cpumask_any_distribute(cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ return cpu; -+ else -+ return -EBUSY; -+} -+ -+/** -+ * scx_bpf_task_running - Is task currently running? -+ * @p: task of interest -+ */ -+__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) -+{ -+ return task_rq(p)->curr == p; -+} -+ -+/** -+ * scx_bpf_task_cpu - CPU a task is currently associated with -+ * @p: task of interest -+ */ -+__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) -+{ -+ return task_cpu(p); -+} -+ -+/** -+ * scx_bpf_task_cgroup - Return the sched cgroup of a task -+ * @p: task of interest -+ * -+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with -+ * from the scheduler's POV. SCX operations should use this function to -+ * determine @p's current cgroup as, unlike following @p->cgroups, -+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all -+ * rq-locked operations. Can be called on the parameter tasks of rq-locked -+ * operations. The restriction guarantees that @p's rq is locked by the caller. -+ */ -+#ifdef CONFIG_CGROUP_SCHED -+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) -+{ -+ struct task_group *tg = p->sched_task_group; -+ struct cgroup *cgrp = &cgrp_dfl_root.cgrp; -+ -+ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) -+ goto out; -+ -+ /* -+ * A task_group may either be a cgroup or an autogroup. In the latter -+ * case, @tg->css.cgroup is %NULL. A task_group can't become the other -+ * kind once created. 
-+ */ -+ if (tg && tg->css.cgroup) -+ cgrp = tg->css.cgroup; -+ else -+ cgrp = &cgrp_dfl_root.cgrp; -+out: -+ cgroup_get(cgrp); -+ return cgrp; -+} -+#endif -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_any) -+BTF_ID_FLAGS(func, scx_bpf_kick_cpu) -+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) -+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) -+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) -+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -+BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) -+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) -+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) -+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) -+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) -+BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) -+BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) -+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) -+#ifdef CONFIG_CGROUP_SCHED -+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) -+#endif -+BTF_KFUNCS_END(scx_kfunc_ids_any) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_any = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_any, -+}; -+ -+static int __init scx_init(void) -+{ -+ int ret; -+ -+ /* -+ * kfunc registration can't be done from init_sched_ext_class() as -+ * register_btf_kfunc_id_set() needs most of the system to be up. -+ * -+ * Some kfuncs are context-sensitive and can only be called from -+ * specific SCX ops. They are grouped into BTF sets accordingly. -+ * Unfortunately, BPF currently doesn't have a way of enforcing such -+ * restrictions. Eventually, the verifier should be able to enforce -+ * them. For now, register them the same and make each kfunc explicitly -+ * check using scx_kf_allowed(). 
-+ */ -+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_sleepable)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_select_cpu)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_enqueue_dispatch)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_dispatch)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_cpu_release)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_any)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, -+ &scx_kfunc_set_any)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, -+ &scx_kfunc_set_any))) { -+ pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); -+ return ret; -+ } -+ -+ ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); -+ if (ret) { -+ pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); -+ return ret; -+ } -+ -+ ret = register_pm_notifier(&scx_pm_notifier); -+ if (ret) { -+ pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); -+ return ret; -+ } -+ -+ scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); -+ if (!scx_kset) { -+ pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); -+ return -ENOMEM; -+ } -+ -+ ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); -+ if (ret < 0) { -+ pr_err("sched_ext: Failed to add global attributes\n"); -+ return ret; -+ } -+ -+ return 0; -+} -+__initcall(scx_init); -diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h -new file mode 100644 -index 000000000000..52d9b7df2a25 ---- /dev/null -+++ b/kernel/sched/ext.h -@@ -0,0 +1,143 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+struct sched_enq_and_set_ctx { -+ struct task_struct *p; -+ int queue_flags; -+ bool queued; -+ bool running; -+}; -+ -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx); -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); -+ -+extern const struct sched_class ext_sched_class; -+ -+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); -+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) -+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) -+ -+DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+ -+static inline bool task_on_scx(const struct task_struct *p) -+{ -+ return scx_enabled() && p->sched_class == &ext_sched_class; -+} -+ -+void scx_next_task_picked(struct rq *rq, struct task_struct *p, -+ const struct sched_class *active); -+void scx_tick(struct rq *rq); -+void init_scx_entity(struct sched_ext_entity *scx); -+void scx_pre_fork(struct task_struct *p); -+int scx_fork(struct task_struct *p); -+void scx_post_fork(struct task_struct *p); -+void scx_cancel_fork(struct task_struct *p); -+int scx_check_setscheduler(struct task_struct *p, int policy); -+bool scx_can_stop_tick(struct rq *rq); -+bool task_should_scx(struct task_struct *p); -+void init_sched_ext_class(void); -+void scx_rq_activate(struct rq *rq); -+void scx_rq_deactivate(struct rq *rq); -+ -+static inline u32 scx_cpuperf_target(s32 cpu) -+{ -+ if (scx_enabled()) -+ return cpu_rq(cpu)->scx.cpuperf_target; -+ else -+ return 0; -+} -+ -+static inline const struct sched_class *next_active_class(const struct sched_class *class) -+{ -+ class++; -+ if (scx_switched_all() && class == &fair_sched_class) -+ class++; -+ if (!scx_enabled() && class == &ext_sched_class) -+ class++; -+ return class; -+} -+ -+#define for_active_class_range(class, _from, _to) \ -+ for (class = (_from); class != (_to); class = next_active_class(class)) -+ -+#define for_each_active_class(class) \ -+ for_active_class_range(class, __sched_class_highest, __sched_class_lowest) -+ -+/* -+ * SCX requires a balance() call before every pick_next_task() call including -+ * when waking up from idle. -+ */ -+#define for_balance_class_range(class, prev_class, end_class) \ -+ for_active_class_range(class, (prev_class) > &ext_sched_class ? 
\ -+ &ext_sched_class : (prev_class), (end_class)) -+ -+#ifdef CONFIG_SCHED_CORE -+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, -+ bool in_fi); -+#endif -+ -+#else /* CONFIG_SCHED_CLASS_EXT */ -+ -+#define scx_enabled() false -+#define scx_switched_all() false -+ -+static inline void scx_next_task_picked(struct rq *rq, struct task_struct *p, -+ const struct sched_class *active) {} -+static inline void scx_tick(struct rq *rq) {} -+static inline void scx_pre_fork(struct task_struct *p) {} -+static inline int scx_fork(struct task_struct *p) { return 0; } -+static inline void scx_post_fork(struct task_struct *p) {} -+static inline void scx_cancel_fork(struct task_struct *p) {} -+static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } -+static inline bool scx_can_stop_tick(struct rq *rq) { return true; } -+static inline bool task_on_scx(const struct task_struct *p) { return false; } -+static inline void init_sched_ext_class(void) {} -+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } -+static inline void scx_rq_activate(struct rq *rq) {} -+static inline void scx_rq_deactivate(struct rq *rq) {} -+ -+#define for_each_active_class for_each_class -+#define for_balance_class_range for_class_range -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ -+#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) -+void __scx_update_idle(struct rq *rq, bool idle); -+ -+static inline void scx_update_idle(struct rq *rq, bool idle) -+{ -+ if (scx_enabled()) -+ __scx_update_idle(rq, idle); -+} -+#else -+static inline void scx_update_idle(struct rq *rq, bool idle) {} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+#ifdef CONFIG_EXT_GROUP_SCHED -+int scx_tg_online(struct task_group *tg); -+void scx_tg_offline(struct task_group *tg); -+int scx_cgroup_can_attach(struct cgroup_taskset *tset); -+void scx_move_task(struct task_struct *p); -+void scx_cgroup_finish_attach(void); -+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); -+void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); -+#else /* CONFIG_EXT_GROUP_SCHED */ -+static inline int scx_tg_online(struct task_group *tg) { return 0; } -+static inline void scx_tg_offline(struct task_group *tg) {} -+static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -+static inline void scx_move_task(struct task_struct *p) {} -+static inline void scx_cgroup_finish_attach(void) {} -+static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} -+static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+#endif /* CONFIG_CGROUP_SCHED */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index a241e0d45922..00fbaec603bf 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -3848,7 +3848,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - } - } - --void reweight_task(struct task_struct *p, int prio) -+static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) - { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); -@@ -8404,7 +8404,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ -- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) -+ if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) - return; 
- - find_matching_se(&se, &pse); -@@ -9365,28 +9365,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { - - static bool __update_blocked_others(struct rq *rq, bool *done) - { -- const struct sched_class *curr_class; -- u64 now = rq_clock_pelt(rq); -- unsigned long hw_pressure; -- bool decayed; -+ bool updated; - - /* - * update_load_avg() can call cpufreq_update_util(). Make sure that RT, - * DL and IRQ signals have been updated before updating CFS. - */ -- curr_class = rq->curr->sched_class; -- -- hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); -- -- decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | -- update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | -- update_hw_load_avg(now, rq, hw_pressure) | -- update_irq_load_avg(rq, 0); -+ updated = update_other_load_avgs(rq); - - if (others_have_blocked(rq)) - *done = false; - -- return decayed; -+ return updated; - } - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -13227,6 +13217,7 @@ DEFINE_SCHED_CLASS(fair) = { - .task_tick = task_tick_fair, - .task_fork = task_fork_fair, - -+ .reweight_task = reweight_task_fair, - .prio_changed = prio_changed_fair, - .switched_from = switched_from_fair, - .switched_to = switched_to_fair, -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 6135fbe83d68..3b6540cc436a 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -458,11 +458,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) - - static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) - { -+ scx_update_idle(rq, false); - } - - static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) - { - update_idle_core(rq); -+ scx_update_idle(rq, true); - schedstat_inc(rq->sched_goidle); - } - -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 6e6a45087015..920540d876a6 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -187,9 +187,19 @@ static inline int idle_policy(int policy) - { - return policy == SCHED_IDLE; - } -+ -+static inline int normal_policy(int policy) -+{ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (policy == SCHED_EXT) -+ return true; -+#endif -+ return policy == SCHED_NORMAL; -+} -+ - static inline int fair_policy(int policy) - { -- return policy == SCHED_NORMAL || policy == SCHED_BATCH; -+ return normal_policy(policy) || policy == SCHED_BATCH; - } - - static inline int rt_policy(int policy) -@@ -237,6 +247,24 @@ static inline void update_avg(u64 *avg, u64 sample) - #define shr_bound(val, shift) \ - (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) - -+/* -+ * cgroup weight knobs should use the common MIN, DFL and MAX values which are -+ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it -+ * maps pretty well onto the shares value used by scheduler and the round-trip -+ * conversions preserve the original value over the entire range. -+ */ -+static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) -+{ -+ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); -+} -+ -+static inline unsigned long sched_weight_to_cgroup(unsigned long weight) -+{ -+ return clamp_t(unsigned long, -+ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), -+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); -+} -+ - /* - * !! For sched_setattr_nocheck() (kernel) only !! 
- * -@@ -420,6 +448,11 @@ struct task_group { - struct rt_bandwidth rt_bandwidth; - #endif - -+#ifdef CONFIG_EXT_GROUP_SCHED -+ u32 scx_flags; /* SCX_TG_* */ -+ u32 scx_weight; -+#endif -+ - struct rcu_head rcu; - struct list_head list; - -@@ -475,6 +508,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) - return walk_tg_tree_from(&root_task_group, down, up, data); - } - -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? container_of(css, struct task_group, css) : NULL; -+} -+ - extern int tg_nop(struct task_group *tg, void *data); - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -531,6 +569,11 @@ extern void set_task_rq_fair(struct sched_entity *se, - static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } - #endif /* CONFIG_SMP */ -+#else /* CONFIG_FAIR_GROUP_SCHED */ -+static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) -+{ -+ return 0; -+} - #endif /* CONFIG_FAIR_GROUP_SCHED */ - - #else /* CONFIG_CGROUP_SCHED */ -@@ -691,6 +734,37 @@ struct cfs_rq { - #endif /* CONFIG_FAIR_GROUP_SCHED */ - }; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+/* scx_rq->flags, protected by the rq lock */ -+enum scx_rq_flags { -+ /* -+ * A hotplugged CPU starts scheduling before rq_online_scx(). Track -+ * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called -+ * only while the BPF scheduler considers the CPU to be online. -+ */ -+ SCX_RQ_ONLINE = 1 << 0, -+ SCX_RQ_BALANCING = 1 << 1, -+ SCX_RQ_CAN_STOP_TICK = 1 << 2, -+}; -+ -+struct scx_rq { -+ struct scx_dispatch_q local_dsq; -+ struct list_head runnable_list; /* runnable tasks on this rq */ -+ unsigned long ops_qseq; -+ u64 extra_enq_flags; /* see move_task_to_local_dsq() */ -+ u32 nr_running; -+ u32 flags; -+ u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ -+ bool cpu_released; -+ cpumask_var_t cpus_to_kick; -+ cpumask_var_t cpus_to_kick_if_idle; -+ cpumask_var_t cpus_to_preempt; -+ cpumask_var_t cpus_to_wait; -+ unsigned long pnt_seq; -+ struct irq_work kick_cpus_irq_work; -+}; -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ - static inline int rt_bandwidth_enabled(void) - { - return sysctl_sched_rt_runtime >= 0; -@@ -1036,6 +1110,9 @@ struct rq { - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct scx_rq scx; -+#endif - - #ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this CPU: */ -@@ -2304,8 +2381,11 @@ struct sched_class { - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. 
- */ -+ void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); -+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task, -+ int newprio); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); - -@@ -2463,7 +2543,7 @@ extern void init_sched_dl_class(void); - extern void init_sched_rt_class(void); - extern void init_sched_fair_class(void); - --extern void reweight_task(struct task_struct *p, int prio); -+extern void __setscheduler_prio(struct task_struct *p, int prio); - - extern void resched_curr(struct rq *rq); - extern void resched_cpu(int cpu); -@@ -2541,6 +2621,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); - extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); - -+extern void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class); -+extern void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio); -+ - extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); - - #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY) -@@ -3006,6 +3092,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} - #endif - - #ifdef CONFIG_SMP -+bool update_other_load_avgs(struct rq *rq); - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max); -@@ -3048,6 +3135,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) - { - return READ_ONCE(rq->avg_rt.util_avg); - } -+#else -+static inline bool update_other_load_avgs(struct rq *rq) { return false; } - #endif - - #ifdef CONFIG_UCLAMP_TASK -@@ -3480,4 +3569,27 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } - extern u64 avg_vruntime(struct cfs_rq *cfs_rq); - extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); - -+#ifdef CONFIG_CGROUP_SCHED -+enum cpu_cftype_id { -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ CPU_CFTYPE_WEIGHT, -+ CPU_CFTYPE_WEIGHT_NICE, -+ CPU_CFTYPE_IDLE, -+#endif -+#ifdef CONFIG_CFS_BANDWIDTH -+ CPU_CFTYPE_MAX, -+ CPU_CFTYPE_MAX_BURST, -+#endif -+#ifdef CONFIG_UCLAMP_TASK_GROUP -+ CPU_CFTYPE_UCLAMP_MIN, -+ CPU_CFTYPE_UCLAMP_MAX, -+#endif -+ CPU_CFTYPE_CNT, -+}; -+ -+extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#include "ext.h" -+ - #endif /* _KERNEL_SCHED_SCHED_H */ -diff --git a/lib/dump_stack.c b/lib/dump_stack.c -index 222c6d6c8281..9581ef4efec5 100644 ---- a/lib/dump_stack.c -+++ b/lib/dump_stack.c -@@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) - - print_worker_info(log_lvl, current); - print_stop_info(log_lvl, current); -+ print_scx_info(log_lvl, current); - } - - /** -diff --git a/lib/test_bpf.c b/lib/test_bpf.c -index 207ff87194db..ce5716c3999a 100644 ---- a/lib/test_bpf.c -+++ b/lib/test_bpf.c -@@ -15706,4 +15706,5 @@ static void __exit test_bpf_exit(void) - module_init(test_bpf_init); - module_exit(test_bpf_exit); - -+MODULE_DESCRIPTION("Testsuite for BPF interpreter and BPF JIT compiler"); - MODULE_LICENSE("GPL"); -diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c -index 891cdf61c65a..3ea52b05adfb 100644 ---- 
a/net/bpf/bpf_dummy_struct_ops.c -+++ b/net/bpf/bpf_dummy_struct_ops.c -@@ -272,12 +272,12 @@ static int bpf_dummy_init_member(const struct btf_type *t, - return -EOPNOTSUPP; - } - --static int bpf_dummy_reg(void *kdata) -+static int bpf_dummy_reg(void *kdata, struct bpf_link *link) - { - return -EOPNOTSUPP; - } - --static void bpf_dummy_unreg(void *kdata) -+static void bpf_dummy_unreg(void *kdata, struct bpf_link *link) - { - } - -diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c -index c3c51b9a6826..816bb0fde718 100644 ---- a/net/bridge/netfilter/nf_conntrack_bridge.c -+++ b/net/bridge/netfilter/nf_conntrack_bridge.c -@@ -32,7 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, - struct sk_buff *)) - { - int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; -- bool mono_delivery_time = skb->mono_delivery_time; -+ u8 tstamp_type = skb->tstamp_type; - unsigned int hlen, ll_rs, mtu; - ktime_t tstamp = skb->tstamp; - struct ip_frag_state state; -@@ -82,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, - if (iter.frag) - ip_fraglist_prepare(skb, &iter); - -- skb_set_delivery_time(skb, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb, tstamp, tstamp_type); - err = output(net, sk, data, skb); - if (err || !iter.frag) - break; -@@ -113,7 +113,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, - goto blackhole; - } - -- skb_set_delivery_time(skb2, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb2, tstamp, tstamp_type); - err = output(net, sk, data, skb2); - if (err) - goto blackhole; -diff --git a/net/core/dev.c b/net/core/dev.c -index e1bb6d7856d9..85fe8138f3e4 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -2160,7 +2160,7 @@ EXPORT_SYMBOL(net_disable_timestamp); - static inline void net_timestamp_set(struct sk_buff *skb) - { - skb->tstamp = 0; -- skb->mono_delivery_time = 0; -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - if (static_branch_unlikely(&netstamp_needed_key)) - skb->tstamp = ktime_get_real(); - } -diff --git a/net/core/filter.c b/net/core/filter.c -index 2510464692af..7c46ecba3b01 100644 ---- a/net/core/filter.c -+++ b/net/core/filter.c -@@ -2274,12 +2274,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, - - err = bpf_out_neigh_v6(net, skb, dev, nh); - if (unlikely(net_xmit_eval(err))) -- dev->stats.tx_errors++; -+ DEV_STATS_INC(dev, tx_errors); - else - ret = NET_XMIT_SUCCESS; - goto out_xmit; - out_drop: -- dev->stats.tx_errors++; -+ DEV_STATS_INC(dev, tx_errors); - kfree_skb(skb); - out_xmit: - return ret; -@@ -2380,12 +2380,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, - - err = bpf_out_neigh_v4(net, skb, dev, nh); - if (unlikely(net_xmit_eval(err))) -- dev->stats.tx_errors++; -+ DEV_STATS_INC(dev, tx_errors); - else - ret = NET_XMIT_SUCCESS; - goto out_xmit; - out_drop: -- dev->stats.tx_errors++; -+ DEV_STATS_INC(dev, tx_errors); - kfree_skb(skb); - out_xmit: - return ret; -@@ -7726,17 +7726,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, - return -EOPNOTSUPP; - - switch (tstamp_type) { -- case BPF_SKB_TSTAMP_DELIVERY_MONO: -+ case BPF_SKB_CLOCK_REALTIME: -+ skb->tstamp = tstamp; -+ skb->tstamp_type = SKB_CLOCK_REALTIME; -+ break; -+ case BPF_SKB_CLOCK_MONOTONIC: - if (!tstamp) - return -EINVAL; - skb->tstamp = tstamp; -- skb->mono_delivery_time = 1; -+ skb->tstamp_type = SKB_CLOCK_MONOTONIC; - break; -- case BPF_SKB_TSTAMP_UNSPEC: -- if (tstamp) -+ case 
BPF_SKB_CLOCK_TAI: -+ if (!tstamp) - return -EINVAL; -- skb->tstamp = 0; -- skb->mono_delivery_time = 0; -+ skb->tstamp = tstamp; -+ skb->tstamp_type = SKB_CLOCK_TAI; - break; - default: - return -EINVAL; -@@ -9387,16 +9391,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, - { - __u8 value_reg = si->dst_reg; - __u8 skb_reg = si->src_reg; -- /* AX is needed because src_reg and dst_reg could be the same */ -- __u8 tmp_reg = BPF_REG_AX; -- -- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, -- SKB_BF_MONO_TC_OFFSET); -- *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, -- SKB_MONO_DELIVERY_TIME_MASK, 2); -- *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); -- *insn++ = BPF_JMP_A(1); -- *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); -+ BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); -+ BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); -+ BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); -+ BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); -+ *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); -+ *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); -+#ifdef __BIG_ENDIAN_BITFIELD -+ *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); -+#else -+ BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); -+#endif - - return insn; - } -@@ -9439,11 +9444,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); -- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, -- TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); -- *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, -- TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); -- /* skb->tc_at_ingress && skb->mono_delivery_time, -+ /* check if ingress mask bits is set */ -+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); -+ *insn++ = BPF_JMP_A(4); -+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); -+ *insn++ = BPF_JMP_A(2); -+ /* skb->tc_at_ingress && skb->tstamp_type, - * read 0 as the (rcv) timestamp. - */ - *insn++ = BPF_MOV64_IMM(value_reg, 0); -@@ -9468,7 +9474,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, - * the bpf prog is aware the tstamp could have delivery time. - * Thus, write skb->tstamp as is if tstamp_type_access is true. - * Otherwise, writing at ingress will have to clear the -- * mono_delivery_time bit also. -+ * skb->tstamp_type bit also. 
- */ - if (!prog->tstamp_type_access) { - __u8 tmp_reg = BPF_REG_AX; -@@ -9478,8 +9484,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, - *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); - /* goto */ - *insn++ = BPF_JMP_A(2); -- /* : mono_delivery_time */ -- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); -+ /* : skb->tstamp_type */ -+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); - *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); - } - #endif -diff --git a/net/core/sock.c b/net/core/sock.c -index 8629f9aecf91..521e6373d4f7 100644 ---- a/net/core/sock.c -+++ b/net/core/sock.c -@@ -2262,7 +2262,12 @@ static void sk_init_common(struct sock *sk) - lockdep_set_class_and_name(&sk->sk_error_queue.lock, - af_elock_keys + sk->sk_family, - af_family_elock_key_strings[sk->sk_family]); -- lockdep_set_class_and_name(&sk->sk_callback_lock, -+ if (sk->sk_kern_sock) -+ lockdep_set_class_and_name(&sk->sk_callback_lock, -+ af_kern_callback_keys + sk->sk_family, -+ af_family_kern_clock_key_strings[sk->sk_family]); -+ else -+ lockdep_set_class_and_name(&sk->sk_callback_lock, - af_callback_keys + sk->sk_family, - af_family_clock_key_strings[sk->sk_family]); - } -@@ -3460,18 +3465,6 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) - } - sk->sk_uid = uid; - -- rwlock_init(&sk->sk_callback_lock); -- if (sk->sk_kern_sock) -- lockdep_set_class_and_name( -- &sk->sk_callback_lock, -- af_kern_callback_keys + sk->sk_family, -- af_family_kern_clock_key_strings[sk->sk_family]); -- else -- lockdep_set_class_and_name( -- &sk->sk_callback_lock, -- af_callback_keys + sk->sk_family, -- af_family_clock_key_strings[sk->sk_family]); -- - sk->sk_state_change = sock_def_wakeup; - sk->sk_data_ready = sock_def_readable; - sk->sk_write_space = sock_def_write_space; -diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c -index 56ef873828f4..867d637d86f0 100644 ---- a/net/ieee802154/6lowpan/reassembly.c -+++ b/net/ieee802154/6lowpan/reassembly.c -@@ -130,7 +130,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, - goto err; - - fq->q.stamp = skb->tstamp; -- fq->q.mono_delivery_time = skb->mono_delivery_time; -+ fq->q.tstamp_type = skb->tstamp_type; - if (frag_type == LOWPAN_DISPATCH_FRAG1) - fq->q.flags |= INET_FRAG_FIRST_IN; - -diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index f180befc28bd..4273cac333f6 100644 ---- a/net/ipv4/bpf_tcp_ca.c -+++ b/net/ipv4/bpf_tcp_ca.c -@@ -260,17 +260,17 @@ static int bpf_tcp_ca_check_member(const struct btf_type *t, - return 0; - } - --static int bpf_tcp_ca_reg(void *kdata) -+static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link) - { - return tcp_register_congestion_control(kdata); - } - --static void bpf_tcp_ca_unreg(void *kdata) -+static void bpf_tcp_ca_unreg(void *kdata, struct bpf_link *link) - { - tcp_unregister_congestion_control(kdata); - } - --static int bpf_tcp_ca_update(void *kdata, void *old_kdata) -+static int bpf_tcp_ca_update(void *kdata, void *old_kdata, struct bpf_link *link) - { - return tcp_update_congestion_control(kdata, old_kdata); - } -diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c -index faaec92a46ac..d179a2c84222 100644 ---- a/net/ipv4/inet_fragment.c -+++ b/net/ipv4/inet_fragment.c -@@ -619,7 +619,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, - skb_mark_not_on_list(head); - head->prev = NULL; - head->tstamp = 
q->stamp; -- head->mono_delivery_time = q->mono_delivery_time; -+ head->tstamp_type = q->tstamp_type; - - if (sk) - refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); -diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c -index 08e2c92e25ab..a92664a5ef2e 100644 ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -355,7 +355,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) - qp->iif = dev->ifindex; - - qp->q.stamp = skb->tstamp; -- qp->q.mono_delivery_time = skb->mono_delivery_time; -+ qp->q.tstamp_type = skb->tstamp_type; - qp->q.meat += skb->len; - qp->ecn |= ecn; - add_frag_mem_limit(qp->q.fqdir, skb->truesize); -diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c -index 9500031a1f55..b90d0f78ac80 100644 ---- a/net/ipv4/ip_output.c -+++ b/net/ipv4/ip_output.c -@@ -764,7 +764,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - { - struct iphdr *iph; - struct sk_buff *skb2; -- bool mono_delivery_time = skb->mono_delivery_time; -+ u8 tstamp_type = skb->tstamp_type; - struct rtable *rt = skb_rtable(skb); - unsigned int mtu, hlen, ll_rs; - struct ip_fraglist_iter iter; -@@ -856,7 +856,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - } - } - -- skb_set_delivery_time(skb, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb, tstamp, tstamp_type); - err = output(net, sk, skb); - - if (!err) -@@ -912,7 +912,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - /* - * Put this fragment into the sending queue. - */ -- skb_set_delivery_time(skb2, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb2, tstamp, tstamp_type); - err = output(net, sk, skb2); - if (err) - goto fail; -@@ -1457,7 +1457,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, - - skb->priority = (cork->tos != -1) ? 
cork->priority: READ_ONCE(sk->sk_priority); - skb->mark = cork->mark; -- skb->tstamp = cork->transmit_time; -+ if (sk_is_tcp(sk)) -+ skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); -+ else -+ skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); - /* - * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec - * on dst refcount -@@ -1649,7 +1652,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, - arg->csumoffset) = csum_fold(csum_add(nskb->csum, - arg->csum)); - nskb->ip_summed = CHECKSUM_NONE; -- nskb->mono_delivery_time = !!transmit_time; -+ if (transmit_time) -+ nskb->tstamp_type = SKB_CLOCK_MONOTONIC; - if (txhash) - skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); - ip_push_pending_frames(sk, &fl4); -diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c -index 4cb43401e0e0..1a0953650356 100644 ---- a/net/ipv4/raw.c -+++ b/net/ipv4/raw.c -@@ -360,7 +360,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, - skb->protocol = htons(ETH_P_IP); - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = sockc->mark; -- skb->tstamp = sockc->transmit_time; -+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); - skb_dst_set(skb, &rt->dst); - *rtp = NULL; - -diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c -index b710958393e6..8e891e56c5e0 100644 ---- a/net/ipv4/tcp_ipv4.c -+++ b/net/ipv4/tcp_ipv4.c -@@ -3620,6 +3620,8 @@ void __init tcp_v4_init(void) - */ - inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; - -+ sk->sk_clockid = CLOCK_MONOTONIC; -+ - per_cpu(ipv4_tcp_sk, cpu) = sk; - } - if (register_pernet_subsys(&tcp_sk_ops)) -diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 3f4bdd2b6476..f68fd3fd1f9f 100644 ---- a/net/ipv4/tcp_output.c -+++ b/net/ipv4/tcp_output.c -@@ -1304,7 +1304,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, - tp = tcp_sk(sk); - prior_wstamp = tp->tcp_wstamp_ns; - tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); -- skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); -+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); - if (clone_it) { - oskb = skb; - -@@ -1658,7 +1658,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - - skb_split(skb, buff, len); - -- skb_set_delivery_time(buff, skb->tstamp, true); -+ skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); - tcp_fragment_tstamp(skb, buff); - - old_factor = tcp_skb_pcount(skb); -@@ -2790,7 +2790,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { - /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ - tp->tcp_wstamp_ns = tp->tcp_clock_cache; -- skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); -+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); - list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); - tcp_init_tso_segs(skb, mss_now); - tcp_set_tx_in_flight(sk, skb); -@@ -3780,11 +3780,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, - #ifdef CONFIG_SYN_COOKIES - if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) - skb_set_delivery_time(skb, cookie_init_timestamp(req, now), -- true); -+ SKB_CLOCK_MONOTONIC); - else - #endif - { -- skb_set_delivery_time(skb, now, true); -+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); - if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ - tcp_rsk(req)->snt_synack = 
tcp_skb_timestamp_us(skb); - } -@@ -3871,7 +3871,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, - bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, - synack_type, &opts); - -- skb_set_delivery_time(skb, now, true); -+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); - tcp_add_tx_delay(skb, tp); - - return skb; -@@ -4055,7 +4055,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) - - err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - -- skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); -+ skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); - - /* Now full SYN+DATA was cloned and sent (or not), - * remove the SYN from the original skb (syn_data) -diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c -index 27d8725445e3..e7a19df3125e 100644 ---- a/net/ipv6/ip6_output.c -+++ b/net/ipv6/ip6_output.c -@@ -859,7 +859,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); - struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? - inet6_sk(skb->sk) : NULL; -- bool mono_delivery_time = skb->mono_delivery_time; -+ u8 tstamp_type = skb->tstamp_type; - struct ip6_frag_state state; - unsigned int mtu, hlen, nexthdr_offset; - ktime_t tstamp = skb->tstamp; -@@ -955,7 +955,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - if (iter.frag) - ip6_fraglist_prepare(skb, &iter); - -- skb_set_delivery_time(skb, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb, tstamp, tstamp_type); - err = output(net, sk, skb); - if (!err) - IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), -@@ -1016,7 +1016,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - /* - * Put this fragment into the sending queue. 
- */ -- skb_set_delivery_time(frag, tstamp, mono_delivery_time); -+ skb_set_delivery_time(frag, tstamp, tstamp_type); - err = output(net, sk, frag); - if (err) - goto fail; -@@ -1924,7 +1924,10 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, - - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = cork->base.mark; -- skb->tstamp = cork->base.transmit_time; -+ if (sk_is_tcp(sk)) -+ skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); -+ else -+ skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); - - ip6_cork_steal_dst(skb, cork); - IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); -diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c -index 53d255838e6a..e0c2347b4dc6 100644 ---- a/net/ipv6/netfilter.c -+++ b/net/ipv6/netfilter.c -@@ -126,7 +126,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct sk_buff *)) - { - int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; -- bool mono_delivery_time = skb->mono_delivery_time; -+ u8 tstamp_type = skb->tstamp_type; - ktime_t tstamp = skb->tstamp; - struct ip6_frag_state state; - u8 *prevhdr, nexthdr = 0; -@@ -192,7 +192,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - if (iter.frag) - ip6_fraglist_prepare(skb, &iter); - -- skb_set_delivery_time(skb, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb, tstamp, tstamp_type); - err = output(net, sk, data, skb); - if (err || !iter.frag) - break; -@@ -225,7 +225,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - goto blackhole; - } - -- skb_set_delivery_time(skb2, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb2, tstamp, tstamp_type); - err = output(net, sk, data, skb2); - if (err) - goto blackhole; -diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c -index 5e1b50c6a44d..6f0844c9315d 100644 ---- a/net/ipv6/netfilter/nf_conntrack_reasm.c -+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c -@@ -263,7 +263,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, - fq->iif = dev->ifindex; - - fq->q.stamp = skb->tstamp; -- fq->q.mono_delivery_time = skb->mono_delivery_time; -+ fq->q.tstamp_type = skb->tstamp_type; - fq->q.meat += skb->len; - fq->ecn |= ecn; - if (payload_len > fq->q.max_size) -diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c -index 2eedf255600b..f838366e8256 100644 ---- a/net/ipv6/raw.c -+++ b/net/ipv6/raw.c -@@ -621,7 +621,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, - skb->protocol = htons(ETH_P_IPV6); - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = sockc->mark; -- skb->tstamp = sockc->transmit_time; -+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); - - skb_put(skb, length); - skb_reset_network_header(skb); -diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c -index 327caca64257..a48be617a8ab 100644 ---- a/net/ipv6/reassembly.c -+++ b/net/ipv6/reassembly.c -@@ -198,7 +198,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, - fq->iif = dev->ifindex; - - fq->q.stamp = skb->tstamp; -- fq->q.mono_delivery_time = skb->mono_delivery_time; -+ fq->q.tstamp_type = skb->tstamp_type; - fq->q.meat += skb->len; - fq->ecn |= ecn; - add_frag_mem_limit(fq->q.fqdir, skb->truesize); -diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c -index 8c577b651bfc..a8fd473a61ee 100644 ---- a/net/ipv6/tcp_ipv6.c -+++ b/net/ipv6/tcp_ipv6.c -@@ -975,7 +975,7 @@ 
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 - mark = inet_twsk(sk)->tw_mark; - else - mark = READ_ONCE(sk->sk_mark); -- skb_set_delivery_time(buff, tcp_transmit_time(sk), true); -+ skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); - } - if (txhash) { - /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ -@@ -2382,8 +2382,14 @@ static struct inet_protosw tcpv6_protosw = { - - static int __net_init tcpv6_net_init(struct net *net) - { -- return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, -- SOCK_RAW, IPPROTO_TCP, net); -+ int res; -+ -+ res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, -+ SOCK_RAW, IPPROTO_TCP, net); -+ if (!res) -+ net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; -+ -+ return res; - } - - static void __net_exit tcpv6_net_exit(struct net *net) -diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c -index d2492d050fe6..4a136fc3a9c0 100644 ---- a/net/netfilter/nf_conntrack_bpf.c -+++ b/net/netfilter/nf_conntrack_bpf.c -@@ -32,7 +32,9 @@ - * -EINVAL - Passed NULL for bpf_tuple pointer - * -EINVAL - opts->reserved is not 0 - * -EINVAL - netns_id is less than -1 -- * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12) -+ * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12 -+ * -EINVAL - opts->ct_zone_id set when -+ opts__sz isn't NF_BPF_CT_OPTS_SZ (16) - * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP - * -ENONET - No network namespace found for netns_id - * -ENOENT - Conntrack lookup could not find entry for tuple -@@ -42,6 +44,8 @@ - * Values: - * IPPROTO_TCP, IPPROTO_UDP - * @dir: - connection tracking tuple direction. -+ * @ct_zone_id - connection tracking zone id. -+ * @ct_zone_dir - connection tracking zone direction. - * @reserved - Reserved member, will be reused for more options in future - * Values: - * 0 -@@ -51,11 +55,13 @@ struct bpf_ct_opts { - s32 error; - u8 l4proto; - u8 dir; -- u8 reserved[2]; -+ u16 ct_zone_id; -+ u8 ct_zone_dir; -+ u8 reserved[3]; - }; - - enum { -- NF_BPF_CT_OPTS_SZ = 12, -+ NF_BPF_CT_OPTS_SZ = 16, - }; - - static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple, -@@ -104,12 +110,21 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, - u32 timeout) - { - struct nf_conntrack_tuple otuple, rtuple; -+ struct nf_conntrack_zone ct_zone; - struct nf_conn *ct; - int err; - -- if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || -- opts_len != NF_BPF_CT_OPTS_SZ) -+ if (!opts || !bpf_tuple) - return ERR_PTR(-EINVAL); -+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) -+ return ERR_PTR(-EINVAL); -+ if (opts_len == NF_BPF_CT_OPTS_SZ) { -+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) -+ return ERR_PTR(-EINVAL); -+ } else { -+ if (opts->ct_zone_id) -+ return ERR_PTR(-EINVAL); -+ } - - if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) - return ERR_PTR(-EINVAL); -@@ -130,7 +145,16 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, - return ERR_PTR(-ENONET); - } - -- ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple, -+ if (opts_len == NF_BPF_CT_OPTS_SZ) { -+ if (opts->ct_zone_dir == 0) -+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; -+ nf_ct_zone_init(&ct_zone, -+ opts->ct_zone_id, opts->ct_zone_dir, 0); -+ } else { -+ ct_zone = nf_ct_zone_dflt; -+ } -+ -+ ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple, - GFP_ATOMIC); - if (IS_ERR(ct)) - goto out; -@@ -152,12 +176,21 @@ static struct nf_conn 
*__bpf_nf_ct_lookup(struct net *net, - { - struct nf_conntrack_tuple_hash *hash; - struct nf_conntrack_tuple tuple; -+ struct nf_conntrack_zone ct_zone; - struct nf_conn *ct; - int err; - -- if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || -- opts_len != NF_BPF_CT_OPTS_SZ) -+ if (!opts || !bpf_tuple) - return ERR_PTR(-EINVAL); -+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) -+ return ERR_PTR(-EINVAL); -+ if (opts_len == NF_BPF_CT_OPTS_SZ) { -+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) -+ return ERR_PTR(-EINVAL); -+ } else { -+ if (opts->ct_zone_id) -+ return ERR_PTR(-EINVAL); -+ } - if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP)) - return ERR_PTR(-EPROTO); - if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) -@@ -174,7 +207,16 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, - return ERR_PTR(-ENONET); - } - -- hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple); -+ if (opts_len == NF_BPF_CT_OPTS_SZ) { -+ if (opts->ct_zone_dir == 0) -+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; -+ nf_ct_zone_init(&ct_zone, -+ opts->ct_zone_id, opts->ct_zone_dir, 0); -+ } else { -+ ct_zone = nf_ct_zone_dflt; -+ } -+ -+ hash = nf_conntrack_find_get(net, &ct_zone, &tuple); - if (opts->netns_id >= 0) - put_net(net); - if (!hash) -@@ -245,7 +287,7 @@ __bpf_kfunc_start_defs(); - * @opts - Additional options for allocation (documented above) - * Cannot be NULL - * @opts__sz - Length of the bpf_ct_opts structure -- * Must be NF_BPF_CT_OPTS_SZ (12) -+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12 - */ - __bpf_kfunc struct nf_conn___init * - bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, -@@ -279,7 +321,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, - * @opts - Additional options for lookup (documented above) - * Cannot be NULL - * @opts__sz - Length of the bpf_ct_opts structure -- * Must be NF_BPF_CT_OPTS_SZ (12) -+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12 - */ - __bpf_kfunc struct nf_conn * - bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, -@@ -312,7 +354,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, - * @opts - Additional options for allocation (documented above) - * Cannot be NULL - * @opts__sz - Length of the bpf_ct_opts structure -- * Must be NF_BPF_CT_OPTS_SZ (12) -+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12 - */ - __bpf_kfunc struct nf_conn___init * - bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, -@@ -347,7 +389,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, - * @opts - Additional options for lookup (documented above) - * Cannot be NULL - * @opts__sz - Length of the bpf_ct_opts structure -- * Must be NF_BPF_CT_OPTS_SZ (12) -+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12 - */ - __bpf_kfunc struct nf_conn * - bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, -diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c -index ea3ebc160e25..fce390887591 100644 ---- a/net/packet/af_packet.c -+++ b/net/packet/af_packet.c -@@ -2056,8 +2056,7 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, - skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); -- skb->tstamp = sockc.transmit_time; -- -+ skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); - skb_setup_tx_timestamp(skb, sockc.tsflags); - - if (unlikely(extra_len == 4)) 
-@@ -2584,7 +2583,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, - skb->dev = dev; - skb->priority = READ_ONCE(po->sk.sk_priority); - skb->mark = READ_ONCE(po->sk.sk_mark); -- skb->tstamp = sockc->transmit_time; -+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); - skb_setup_tx_timestamp(skb, sockc->tsflags); - skb_zcopy_set_nouarg(skb, ph.raw); - -@@ -3062,7 +3061,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) - skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = sockc.mark; -- skb->tstamp = sockc.transmit_time; -+ skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); - - if (unlikely(extra_len == 4)) - skb->no_fcs = 1; -diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c -index 0e3cf11ae5fc..396b576390d0 100644 ---- a/net/sched/act_bpf.c -+++ b/net/sched/act_bpf.c -@@ -54,8 +54,8 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); - } -- if (unlikely(!skb->tstamp && skb->mono_delivery_time)) -- skb->mono_delivery_time = 0; -+ if (unlikely(!skb->tstamp && skb->tstamp_type)) -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) - skb_orphan(skb); - -diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c -index 5e83e890f6a4..1941ebec23ff 100644 ---- a/net/sched/cls_bpf.c -+++ b/net/sched/cls_bpf.c -@@ -104,8 +104,8 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); - } -- if (unlikely(!skb->tstamp && skb->mono_delivery_time)) -- skb->mono_delivery_time = 0; -+ if (unlikely(!skb->tstamp && skb->tstamp_type)) -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - - if (prog->exts_integrated) { - res->class = 0; -diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c -index 944f13fe164a..7ec7143e2757 100644 ---- a/samples/bpf/cpustat_kern.c -+++ b/samples/bpf/cpustat_kern.c -@@ -211,7 +211,7 @@ int bpf_prog1(struct cpu_args *ctx) - SEC("tracepoint/power/cpu_frequency") - int bpf_prog2(struct cpu_args *ctx) - { -- u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; -+ u64 *pts, *cstate, *pstate, cur_ts, delta; - u32 key, cpu, pstate_idx; - u64 *val; - -@@ -232,7 +232,6 @@ int bpf_prog2(struct cpu_args *ctx) - if (!cstate) - return 0; - -- prev_state = *pstate; - *pstate = ctx->state; - - if (!*pts) { -diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf -index 2d6e5ed9081e..bca8a8f26ea4 100644 ---- a/scripts/Makefile.btf -+++ b/scripts/Makefile.btf -@@ -14,9 +14,7 @@ pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats - - pahole-flags-$(call test-ge, $(pahole-ver), 122) += -j - --ifeq ($(pahole-ver), 125) --pahole-flags-y += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized --endif -+pahole-flags-$(call test-ge, $(pahole-ver), 125) += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized - - else - -diff --git a/tools/Makefile b/tools/Makefile -index 276f5d0d53a4..278d24723b74 100644 ---- a/tools/Makefile -+++ b/tools/Makefile -@@ -28,6 +28,7 @@ help: - @echo ' pci - PCI tools' - @echo ' perf - Linux performance measurement and analysis tool' - @echo ' selftests - various kernel selftests' -+ @echo ' sched_ext - sched_ext example schedulers' - @echo ' bootconfig - boot config tool' - @echo ' spi - spi tools' - @echo ' tmon - thermal monitoring and tuning tool' -@@ -91,6 +92,9 @@ perf: FORCE - 
$(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= - -+sched_ext: FORCE -+ $(call descend,sched_ext) -+ - selftests: FORCE - $(call descend,testing/$@) - -@@ -184,6 +188,9 @@ perf_clean: - $(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean - -+sched_ext_clean: -+ $(call descend,sched_ext,clean) -+ - selftests_clean: - $(call descend,testing/$(@:_clean=),clean) - -@@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ - mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ - freefall_clean build_clean libbpf_clean libsubcmd_clean \ - gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ -- intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean -+ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ -+ sched_ext_clean - - .PHONY: FORCE -diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst -index eaba24320fb2..3f6bca03ad2e 100644 ---- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst -+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst -@@ -28,7 +28,7 @@ BTF COMMANDS - | **bpftool** **btf help** - | - | *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } --| *FORMAT* := { **raw** | **c** } -+| *FORMAT* := { **raw** | **c** [**unsorted**] } - | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } - | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } - -@@ -63,7 +63,9 @@ bpftool btf dump *BTF_SRC* - pahole. - - **format** option can be used to override default (raw) output format. Raw -- (**raw**) or C-syntax (**c**) output formats are supported. -+ (**raw**) or C-syntax (**c**) output formats are supported. With C-style -+ formatting, the output is sorted by default. Use the **unsorted** option -+ to avoid sorting the output. - - bpftool btf help - Print short help message. 
-diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile -index dfa4f1bebbb3..ba927379eb20 100644 ---- a/tools/bpf/bpftool/Makefile -+++ b/tools/bpf/bpftool/Makefile -@@ -204,10 +204,11 @@ ifeq ($(feature-clang-bpf-co-re),1) - - BUILD_BPF_SKELS := 1 - --$(OUTPUT)vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL_BOOTSTRAP) - ifeq ($(VMLINUX_H),) -+$(OUTPUT)vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL_BOOTSTRAP) - $(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) btf dump file $< format c > $@ - else -+$(OUTPUT)vmlinux.h: $(VMLINUX_H) - $(Q)cp "$(VMLINUX_H)" $@ - endif - -diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool -index 04afe2ac2228..be99d49b8714 100644 ---- a/tools/bpf/bpftool/bash-completion/bpftool -+++ b/tools/bpf/bpftool/bash-completion/bpftool -@@ -930,6 +930,9 @@ _bpftool() - format) - COMPREPLY=( $( compgen -W "c raw" -- "$cur" ) ) - ;; -+ c) -+ COMPREPLY=( $( compgen -W "unsorted" -- "$cur" ) ) -+ ;; - *) - # emit extra options - case ${words[3]} in -diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c -index 91fcb75babe3..af047dedde38 100644 ---- a/tools/bpf/bpftool/btf.c -+++ b/tools/bpf/bpftool/btf.c -@@ -43,6 +43,13 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { - [BTF_KIND_ENUM64] = "ENUM64", - }; - -+struct sort_datum { -+ int index; -+ int type_rank; -+ const char *sort_name; -+ const char *own_name; -+}; -+ - static const char *btf_int_enc_str(__u8 encoding) - { - switch (encoding) { -@@ -460,9 +467,122 @@ static void __printf(2, 0) btf_dump_printf(void *ctx, - vfprintf(stdout, fmt, args); - } - -+static int btf_type_rank(const struct btf *btf, __u32 index, bool has_name) -+{ -+ const struct btf_type *t = btf__type_by_id(btf, index); -+ const int kind = btf_kind(t); -+ const int max_rank = 10; -+ -+ if (t->name_off) -+ has_name = true; -+ -+ switch (kind) { -+ case BTF_KIND_ENUM: -+ case BTF_KIND_ENUM64: -+ return has_name ? 1 : 0; -+ case BTF_KIND_INT: -+ case BTF_KIND_FLOAT: -+ return 2; -+ case BTF_KIND_STRUCT: -+ case BTF_KIND_UNION: -+ return has_name ? 3 : max_rank; -+ case BTF_KIND_FUNC_PROTO: -+ return has_name ? 
4 : max_rank; -+ case BTF_KIND_ARRAY: -+ if (has_name) -+ return btf_type_rank(btf, btf_array(t)->type, has_name); -+ return max_rank; -+ case BTF_KIND_TYPE_TAG: -+ case BTF_KIND_CONST: -+ case BTF_KIND_PTR: -+ case BTF_KIND_VOLATILE: -+ case BTF_KIND_RESTRICT: -+ case BTF_KIND_TYPEDEF: -+ case BTF_KIND_DECL_TAG: -+ if (has_name) -+ return btf_type_rank(btf, t->type, has_name); -+ return max_rank; -+ default: -+ return max_rank; -+ } -+} -+ -+static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool from_ref) -+{ -+ const struct btf_type *t = btf__type_by_id(btf, index); -+ -+ switch (btf_kind(t)) { -+ case BTF_KIND_ENUM: -+ case BTF_KIND_ENUM64: { -+ int name_off = t->name_off; -+ -+ /* Use name of the first element for anonymous enums if allowed */ -+ if (!from_ref && !t->name_off && btf_vlen(t)) -+ name_off = btf_enum(t)->name_off; -+ -+ return btf__name_by_offset(btf, name_off); -+ } -+ case BTF_KIND_ARRAY: -+ return btf_type_sort_name(btf, btf_array(t)->type, true); -+ case BTF_KIND_TYPE_TAG: -+ case BTF_KIND_CONST: -+ case BTF_KIND_PTR: -+ case BTF_KIND_VOLATILE: -+ case BTF_KIND_RESTRICT: -+ case BTF_KIND_TYPEDEF: -+ case BTF_KIND_DECL_TAG: -+ return btf_type_sort_name(btf, t->type, true); -+ default: -+ return btf__name_by_offset(btf, t->name_off); -+ } -+ return NULL; -+} -+ -+static int btf_type_compare(const void *left, const void *right) -+{ -+ const struct sort_datum *d1 = (const struct sort_datum *)left; -+ const struct sort_datum *d2 = (const struct sort_datum *)right; -+ int r; -+ -+ if (d1->type_rank != d2->type_rank) -+ return d1->type_rank < d2->type_rank ? -1 : 1; -+ -+ r = strcmp(d1->sort_name, d2->sort_name); -+ if (r) -+ return r; -+ -+ return strcmp(d1->own_name, d2->own_name); -+} -+ -+static struct sort_datum *sort_btf_c(const struct btf *btf) -+{ -+ struct sort_datum *datums; -+ int n; -+ -+ n = btf__type_cnt(btf); -+ datums = malloc(sizeof(struct sort_datum) * n); -+ if (!datums) -+ return NULL; -+ -+ for (int i = 0; i < n; ++i) { -+ struct sort_datum *d = datums + i; -+ const struct btf_type *t = btf__type_by_id(btf, i); -+ -+ d->index = i; -+ d->type_rank = btf_type_rank(btf, i, false); -+ d->sort_name = btf_type_sort_name(btf, i, false); -+ d->own_name = btf__name_by_offset(btf, t->name_off); -+ } -+ -+ qsort(datums, n, sizeof(struct sort_datum), btf_type_compare); -+ -+ return datums; -+} -+ - static int dump_btf_c(const struct btf *btf, -- __u32 *root_type_ids, int root_type_cnt) -+ __u32 *root_type_ids, int root_type_cnt, bool sort_dump) - { -+ struct sort_datum *datums = NULL; - struct btf_dump *d; - int err = 0, i; - -@@ -486,8 +606,12 @@ static int dump_btf_c(const struct btf *btf, - } else { - int cnt = btf__type_cnt(btf); - -+ if (sort_dump) -+ datums = sort_btf_c(btf); - for (i = 1; i < cnt; i++) { -- err = btf_dump__dump_type(d, i); -+ int idx = datums ? 
datums[i].index : i; -+ -+ err = btf_dump__dump_type(d, idx); - if (err) - goto done; - } -@@ -500,6 +624,7 @@ static int dump_btf_c(const struct btf *btf, - printf("#endif /* __VMLINUX_H__ */\n"); - - done: -+ free(datums); - btf_dump__free(d); - return err; - } -@@ -549,10 +674,10 @@ static bool btf_is_kernel_module(__u32 btf_id) - - static int do_dump(int argc, char **argv) - { -+ bool dump_c = false, sort_dump_c = true; - struct btf *btf = NULL, *base = NULL; - __u32 root_type_ids[2]; - int root_type_cnt = 0; -- bool dump_c = false; - __u32 btf_id = -1; - const char *src; - int fd = -1; -@@ -663,6 +788,9 @@ static int do_dump(int argc, char **argv) - goto done; - } - NEXT_ARG(); -+ } else if (is_prefix(*argv, "unsorted")) { -+ sort_dump_c = false; -+ NEXT_ARG(); - } else { - p_err("unrecognized option: '%s'", *argv); - err = -EINVAL; -@@ -691,7 +819,7 @@ static int do_dump(int argc, char **argv) - err = -ENOTSUP; - goto done; - } -- err = dump_btf_c(btf, root_type_ids, root_type_cnt); -+ err = dump_btf_c(btf, root_type_ids, root_type_cnt, sort_dump_c); - } else { - err = dump_btf_raw(btf, root_type_ids, root_type_cnt); - } -@@ -1063,7 +1191,7 @@ static int do_help(int argc, char **argv) - " %1$s %2$s help\n" - "\n" - " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" -- " FORMAT := { raw | c }\n" -+ " FORMAT := { raw | c [unsorted] }\n" - " " HELP_SPEC_MAP "\n" - " " HELP_SPEC_PROGRAM "\n" - " " HELP_SPEC_OPTIONS " |\n" -diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c -index 958e92acca8e..9b75639434b8 100644 ---- a/tools/bpf/bpftool/common.c -+++ b/tools/bpf/bpftool/common.c -@@ -410,7 +410,7 @@ void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd, - { - const char *prog_name = prog_info->name; - const struct btf_type *func_type; -- const struct bpf_func_info finfo = {}; -+ struct bpf_func_info finfo = {}; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - struct btf *prog_btf = NULL; -diff --git a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c -index 7bdbcac3cf62..948dde25034e 100644 ---- a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c -+++ b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c -@@ -29,6 +29,7 @@ enum bpf_link_type___local { - }; - - extern const void bpf_link_fops __ksym; -+extern const void bpf_link_fops_poll __ksym __weak; - extern const void bpf_map_fops __ksym; - extern const void bpf_prog_fops __ksym; - extern const void btf_fops __ksym; -@@ -84,7 +85,11 @@ int iter(struct bpf_iter__task_file *ctx) - fops = &btf_fops; - break; - case BPF_OBJ_LINK: -- fops = &bpf_link_fops; -+ if (&bpf_link_fops_poll && -+ file->f_op == &bpf_link_fops_poll) -+ fops = &bpf_link_fops_poll; -+ else -+ fops = &bpf_link_fops; - break; - default: - return 0; -diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c -index 2f80edc682f1..f48c783cb9f7 100644 ---- a/tools/bpf/bpftool/skeleton/profiler.bpf.c -+++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c -@@ -40,17 +40,17 @@ struct { - - const volatile __u32 num_cpu = 1; - const volatile __u32 num_metric = 1; --#define MAX_NUM_MATRICS 4 -+#define MAX_NUM_METRICS 4 - - SEC("fentry/XXX") - int BPF_PROG(fentry_XXX) - { -- struct bpf_perf_event_value___local *ptrs[MAX_NUM_MATRICS]; -+ struct bpf_perf_event_value___local *ptrs[MAX_NUM_METRICS]; - u32 key = bpf_get_smp_processor_id(); - u32 i; - - /* look up before reading, to reduce error */ -- for (i = 0; i < num_metric && i < 
MAX_NUM_MATRICS; i++) { -+ for (i = 0; i < num_metric && i < MAX_NUM_METRICS; i++) { - u32 flag = i; - - ptrs[i] = bpf_map_lookup_elem(&fentry_readings, &flag); -@@ -58,7 +58,7 @@ int BPF_PROG(fentry_XXX) - return 0; - } - -- for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { -+ for (i = 0; i < num_metric && i < MAX_NUM_METRICS; i++) { - struct bpf_perf_event_value___local reading; - int err; - -@@ -99,14 +99,14 @@ fexit_update_maps(u32 id, struct bpf_perf_event_value___local *after) - SEC("fexit/XXX") - int BPF_PROG(fexit_XXX) - { -- struct bpf_perf_event_value___local readings[MAX_NUM_MATRICS]; -+ struct bpf_perf_event_value___local readings[MAX_NUM_METRICS]; - u32 cpu = bpf_get_smp_processor_id(); - u32 i, zero = 0; - int err; - u64 *count; - - /* read all events before updating the maps, to reduce error */ -- for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { -+ for (i = 0; i < num_metric && i < MAX_NUM_METRICS; i++) { - err = bpf_perf_event_read_value(&events, cpu + i * num_cpu, - (void *)(readings + i), - sizeof(*readings)); -@@ -116,7 +116,7 @@ int BPF_PROG(fexit_XXX) - count = bpf_map_lookup_elem(&counts, &zero); - if (count) { - *count += 1; -- for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) -+ for (i = 0; i < num_metric && i < MAX_NUM_METRICS; i++) - fexit_update_maps(i, &readings[i]); - } - return 0; -diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h -index 90706a47f6ff..25ea393cf084 100644 ---- a/tools/include/uapi/linux/bpf.h -+++ b/tools/include/uapi/linux/bpf.h -@@ -6207,12 +6207,17 @@ union { \ - __u64 :64; \ - } __attribute__((aligned(8))) - -+/* The enum used in skb->tstamp_type. It specifies the clock type -+ * of the time stored in the skb->tstamp. -+ */ - enum { -- BPF_SKB_TSTAMP_UNSPEC, -- BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ -- /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, -- * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC -- * and try to deduce it by ingress, egress or skb->sk->sk_clockid. -+ BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */ -+ BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */ -+ BPF_SKB_CLOCK_REALTIME = 0, -+ BPF_SKB_CLOCK_MONOTONIC = 1, -+ BPF_SKB_CLOCK_TAI = 2, -+ /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle, -+ * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid. 
- */ - }; - -diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c -index 5401f2df463d..d1627a2ca30b 100644 ---- a/tools/lib/bpf/libbpf.c -+++ b/tools/lib/bpf/libbpf.c -@@ -229,7 +229,30 @@ static const char * const prog_type_name[] = { - static int __base_pr(enum libbpf_print_level level, const char *format, - va_list args) - { -- if (level == LIBBPF_DEBUG) -+ const char *env_var = "LIBBPF_LOG_LEVEL"; -+ static enum libbpf_print_level min_level = LIBBPF_INFO; -+ static bool initialized; -+ -+ if (!initialized) { -+ char *verbosity; -+ -+ initialized = true; -+ verbosity = getenv(env_var); -+ if (verbosity) { -+ if (strcasecmp(verbosity, "warn") == 0) -+ min_level = LIBBPF_WARN; -+ else if (strcasecmp(verbosity, "debug") == 0) -+ min_level = LIBBPF_DEBUG; -+ else if (strcasecmp(verbosity, "info") == 0) -+ min_level = LIBBPF_INFO; -+ else -+ fprintf(stderr, "libbpf: unrecognized '%s' envvar value: '%s', should be one of 'warn', 'debug', or 'info'.\n", -+ env_var, verbosity); -+ } -+ } -+ -+ /* if too verbose, skip logging */ -+ if (level > min_level) - return 0; - - return vfprintf(stderr, format, args); -diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h -index c3f77d9260fe..26e4e35528c5 100644 ---- a/tools/lib/bpf/libbpf.h -+++ b/tools/lib/bpf/libbpf.h -@@ -98,7 +98,10 @@ typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level, - - /** - * @brief **libbpf_set_print()** sets user-provided log callback function to -- * be used for libbpf warnings and informational messages. -+ * be used for libbpf warnings and informational messages. If the user callback -+ * is not set, messages are logged to stderr by default. The verbosity of these -+ * messages can be controlled by setting the environment variable -+ * LIBBPF_LOG_LEVEL to either warn, info, or debug. - * @param fn The log print function. If NULL, libbpf won't print anything. - * @return Pointer to old print function. - * -diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h -index a0dcfb82e455..7e7e686008c6 100644 ---- a/tools/lib/bpf/libbpf_internal.h -+++ b/tools/lib/bpf/libbpf_internal.h -@@ -597,13 +597,9 @@ static inline int ensure_good_fd(int fd) - return fd; - } - --static inline int sys_dup2(int oldfd, int newfd) -+static inline int sys_dup3(int oldfd, int newfd, int flags) - { --#ifdef __NR_dup2 -- return syscall(__NR_dup2, oldfd, newfd); --#else -- return syscall(__NR_dup3, oldfd, newfd, 0); --#endif -+ return syscall(__NR_dup3, oldfd, newfd, flags); - } - - /* Point *fixed_fd* to the same file that *tmp_fd* points to. -@@ -614,7 +610,7 @@ static inline int reuse_fd(int fixed_fd, int tmp_fd) - { - int err; - -- err = sys_dup2(tmp_fd, fixed_fd); -+ err = sys_dup3(tmp_fd, fixed_fd, O_CLOEXEC); - err = err < 0 ? -errno : 0; - close(tmp_fd); /* clean up temporary FD */ - return err; -diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore -new file mode 100644 -index 000000000000..d6264fe1c8cd ---- /dev/null -+++ b/tools/sched_ext/.gitignore -@@ -0,0 +1,2 @@ -+tools/ -+build/ -diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile -new file mode 100644 -index 000000000000..ca3815e572d8 ---- /dev/null -+++ b/tools/sched_ext/Makefile -@@ -0,0 +1,246 @@ -+# SPDX-License-Identifier: GPL-2.0 -+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+include ../build/Build.include -+include ../scripts/Makefile.arch -+include ../scripts/Makefile.include -+ -+all: all_targets -+ -+ifneq ($(LLVM),) -+ifneq ($(filter %/,$(LLVM)),) -+LLVM_PREFIX := $(LLVM) -+else ifneq ($(filter -%,$(LLVM)),) -+LLVM_SUFFIX := $(LLVM) -+endif -+ -+CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi -+CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu -+CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl -+CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu -+CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu -+CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu -+CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu -+CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu -+CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu -+CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) -+ -+ifeq ($(CROSS_COMPILE),) -+ifeq ($(CLANG_TARGET_FLAGS),) -+$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) -+else -+CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) -+endif # CLANG_TARGET_FLAGS -+else -+CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) -+endif # CROSS_COMPILE -+ -+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -+else -+CC := $(CROSS_COMPILE)gcc -+endif # LLVM -+ -+CURDIR := $(abspath .) -+TOOLSDIR := $(abspath ..) -+LIBDIR := $(TOOLSDIR)/lib -+BPFDIR := $(LIBDIR)/bpf -+TOOLSINCDIR := $(TOOLSDIR)/include -+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool -+APIDIR := $(TOOLSINCDIR)/uapi -+GENDIR := $(abspath ../../include/generated) -+GENHDR := $(GENDIR)/autoconf.h -+ -+ifeq ($(O),) -+OUTPUT_DIR := $(CURDIR)/build -+else -+OUTPUT_DIR := $(O)/build -+endif # O -+OBJ_DIR := $(OUTPUT_DIR)/obj -+INCLUDE_DIR := $(OUTPUT_DIR)/include -+BPFOBJ_DIR := $(OBJ_DIR)/libbpf -+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext -+BINDIR := $(OUTPUT_DIR)/bin -+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a -+ifneq ($(CROSS_COMPILE),) -+HOST_BUILD_DIR := $(OBJ_DIR)/host -+HOST_OUTPUT_DIR := host-tools -+HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include -+else -+HOST_BUILD_DIR := $(OBJ_DIR) -+HOST_OUTPUT_DIR := $(OUTPUT_DIR) -+HOST_INCLUDE_DIR := $(INCLUDE_DIR) -+endif -+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a -+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids -+DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool -+ -+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ -+ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ -+ ../../vmlinux \ -+ /sys/kernel/btf/vmlinux \ -+ /boot/vmlinux-$(shell uname -r) -+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -+ifeq ($(VMLINUX_BTF),) -+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -+endif -+ -+BPFTOOL ?= $(DEFAULT_BPFTOOL) -+ -+ifneq ($(wildcard $(GENHDR)),) -+ GENFLAGS := -DHAVE_GENHDR -+endif -+ -+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -+ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -+ -+# Silence some warnings when compiled with clang -+ifneq ($(LLVM),) -+CFLAGS += -Wno-unused-command-line-argument -+endif -+ -+LDFLAGS = -lelf -lz -lpthread -+ -+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ -+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -+$(shell $(1) -dM -E - $@ -+else -+ $(call msg,CP,,$@) -+ $(Q)cp "$(VMLINUX_H)" $@ -+endif -+ -+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ -+ | $(BPFOBJ) $(SCXOBJ_DIR) -+ $(call msg,CLNG-BPF,,$(notdir $@)) -+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -+ -+$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o 
$(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) -+ $(eval sched=$(notdir $@)) -+ $(call msg,GEN-SKEL,,$(sched)) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) -+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) -+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ -+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -+ -+SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -+ -+c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg -+ -+$(addprefix $(BINDIR)/,$(c-sched-targets)): \ -+ $(BINDIR)/%: \ -+ $(filter-out %.bpf.c,%.c) \ -+ $(INCLUDE_DIR)/%.bpf.skel.h \ -+ $(SCX_COMMON_DEPS) -+ $(eval sched=$(notdir $@)) -+ $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o -+ $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) -+ -+$(c-sched-targets): %: $(BINDIR)/% -+ -+install: all -+ $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ -+ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ -+ -+clean: -+ rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) -+ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h -+ rm -f $(c-sched-targets) -+ -+help: -+ @echo 'Building targets' -+ @echo '================' -+ @echo '' -+ @echo ' all - Compile all schedulers' -+ @echo '' -+ @echo 'Alternatively, you may compile individual schedulers:' -+ @echo '' -+ @printf ' %s\n' $(c-sched-targets) -+ @echo '' -+ @echo 'For any scheduler build target, you may specify an alternative' -+ @echo 'build output path with the O= environment variable. For example:' -+ @echo '' -+ @echo ' O=/tmp/sched_ext make all' -+ @echo '' -+ @echo 'will compile all schedulers, and emit the build artifacts to' -+ @echo '/tmp/sched_ext/build.' -+ @echo '' -+ @echo '' -+ @echo 'Installing targets' -+ @echo '==================' -+ @echo '' -+ @echo ' install - Compile and install all schedulers to /usr/bin.' -+ @echo ' You may specify the DESTDIR= environment variable' -+ @echo ' to indicate a prefix for /usr/bin. For example:' -+ @echo '' -+ @echo ' DESTDIR=/tmp/sched_ext make install' -+ @echo '' -+ @echo ' will build the schedulers in CWD/build, and' -+ @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' -+ @echo '' -+ @echo '' -+ @echo 'Cleaning targets' -+ @echo '================' -+ @echo '' -+ @echo ' clean - Remove all generated files' -+ -+all_targets: $(c-sched-targets) -+ -+.PHONY: all all_targets $(c-sched-targets) clean help -+ -+# delete failed targets -+.DELETE_ON_ERROR: -+ -+# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets -+.SECONDARY: -diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md -new file mode 100644 -index 000000000000..16a42e4060f6 ---- /dev/null -+++ b/tools/sched_ext/README.md -@@ -0,0 +1,270 @@ -+SCHED_EXT EXAMPLE SCHEDULERS -+============================ -+ -+# Introduction -+ -+This directory contains a number of example sched_ext schedulers. These -+schedulers are meant to provide examples of different types of schedulers -+that can be built using sched_ext, and illustrate how various features of -+sched_ext can be used. -+ -+Some of the examples are performant, production-ready schedulers. That is, for -+the correct workload and with the correct tuning, they may be deployed in a -+production environment with acceptable or possibly even improved performance. 
-+Others are just examples that, in practice, would not provide acceptable -+performance (though they could be improved to get there). -+ -+This README will describe these example schedulers, including describing the -+types of workloads or scenarios they're designed to accommodate, and whether or -+not they're production ready. For more details on any of these schedulers, -+please see the header comment in their .bpf.c file. -+ -+ -+# Compiling the examples -+ -+There are a few toolchain dependencies for compiling the example schedulers. -+ -+## Toolchain dependencies -+ -+1. clang >= 16.0.0 -+ -+The schedulers are BPF programs, and therefore must be compiled with clang. gcc -+is actively working on adding a BPF backend compiler as well, but it is still -+missing some features such as BTF type tags which are necessary for using -+kptrs. -+ -+2. pahole >= 1.25 -+ -+You may need pahole in order to generate BTF from DWARF. -+ -+3. rust >= 1.70.0 -+ -+Rust schedulers use features present in the rust toolchain >= 1.70.0. You -+should be able to use the stable build from rustup, but if that doesn't -+work, try using the rustup nightly build. -+ -+There are other requirements as well, such as make, but these are the main / -+non-trivial ones. -+ -+## Compiling the kernel -+ -+In order to run a sched_ext scheduler, you'll have to run a kernel compiled -+with the patches in this repository, and with a minimum set of necessary -+Kconfig options: -+ -+``` -+CONFIG_BPF=y -+CONFIG_SCHED_CLASS_EXT=y -+CONFIG_BPF_SYSCALL=y -+CONFIG_BPF_JIT=y -+CONFIG_DEBUG_INFO_BTF=y -+``` -+ -+It's also recommended that you include the following Kconfig options: -+ -+``` -+CONFIG_BPF_JIT_ALWAYS_ON=y -+CONFIG_BPF_JIT_DEFAULT_ON=y -+CONFIG_PAHOLE_HAS_SPLIT_BTF=y -+CONFIG_PAHOLE_HAS_BTF_TAG=y -+``` -+ -+There is a `Kconfig` file in this directory whose contents you can append to -+your local `.config` file, as long as there are no conflicts with any existing -+options in the file. -+ -+## Getting a vmlinux.h file -+ -+You may notice that most of the example schedulers include a "vmlinux.h" file. -+This is a large, auto-generated header file that contains all of the types -+defined in some vmlinux binary that was compiled with -+[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig -+options specified above). -+ -+The header file is created using `bpftool`, by passing it a vmlinux binary -+compiled with BTF as follows: -+ -+```bash -+$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h -+``` -+ -+`bpftool` analyzes all of the BTF encodings in the binary, and produces a -+header file that can be included by BPF programs to access those types. For -+example, using vmlinux.h allows a scheduler to access fields defined directly -+in vmlinux as follows: -+ -+```c -+#include "vmlinux.h" -+// vmlinux.h is also implicitly included by scx_common.bpf.h. -+#include "scx_common.bpf.h" -+ -+/* -+ * vmlinux.h provides definitions for struct task_struct and -+ * struct scx_enable_args. -+ */ -+void BPF_STRUCT_OPS(example_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ bpf_printk("Task %s enabled in example scheduler", p->comm); -+} -+ -+// vmlinux.h provides the definition for struct sched_ext_ops. -+SEC(".struct_ops.link") -+struct sched_ext_ops example_ops = { -+ .enable = (void *)example_enable, -+ .name = "example", -+}; -+``` -+ -+The scheduler build system will generate this vmlinux.h file as part of the -+scheduler build pipeline.
It looks for a vmlinux file in the following -+dependency order: -+ -+1. If the O= environment variable is defined, at `$O/vmlinux` -+2. If the KBUILD_OUTPUT= environment variable is defined, at -+ `$KBUILD_OUTPUT/vmlinux` -+3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're -+ compiling the schedulers) -+4. `/sys/kernel/btf/vmlinux` -+5. `/boot/vmlinux-$(uname -r)` -+ -+In other words, if you have compiled a kernel in your local repo, its vmlinux -+file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of -+the kernel you're currently running on. This means that if you're running on a -+kernel with sched_ext support, you may not need to compile a local kernel at -+all. -+ -+### Aside on CO-RE -+ -+One of the cooler features of BPF is that it supports -+[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run -+Everywhere). This feature allows you to reference fields inside of structs with -+types defined internal to the kernel, and not have to recompile if you load the -+BPF program on a different kernel with the field at a different offset. In our -+example above, we print out a task name with `p->comm`. CO-RE would perform -+relocations for that access when the program is loaded to ensure that it's -+referencing the correct offset for the currently running kernel. -+ -+## Compiling the schedulers -+ -+Once you have your toolchain set up, and a vmlinux that can be used to generate -+a full vmlinux.h file, you can compile the schedulers using `make`: -+ -+```bash -+$ make -j$(nproc) -+``` -+ -+# Example schedulers -+ -+This directory contains the following example schedulers. These schedulers are -+for testing and demonstrating different aspects of sched_ext. While some may be -+useful in limited scenarios, they are not intended to be practical. -+ -+For more scheduler implementations, tools and documentation, visit -+https://github.com/sched-ext/scx. -+ -+## scx_simple -+ -+A simple scheduler that provides an example of a minimal sched_ext scheduler. -+scx_simple can be run in either global weighted vtime mode, or FIFO mode. -+ -+Though very simple, in limited scenarios, this scheduler can perform reasonably -+well on single-socket systems with a unified L3 cache. -+ -+## scx_qmap -+ -+Another simple, yet slightly more complex scheduler that provides an example of -+a basic weighted FIFO queuing policy. It also provides examples of some common -+useful BPF features, such as sleepable per-task storage allocation in the -+`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to -+enqueue tasks. It also illustrates how core-sched support could be implemented. -+ -+## scx_central -+ -+A "central" scheduler where scheduling decisions are made from a single CPU. -+This scheduler illustrates how scheduling decisions can be dispatched from a -+single CPU, allowing other cores to run with infinite slices, without timer -+ticks, and without having to incur the overhead of making scheduling decisions. -+ -+The approach demonstrated by this scheduler may be useful for any workload that -+benefits from minimizing scheduling overhead and timer ticks. An example of -+where this could be particularly useful is running VMs, where running with -+infinite slices and no timer ticks allows the VM to avoid unnecessary expensive -+vmexits. -+ -+## scx_flatcg -+ -+A flattened cgroup hierarchy scheduler.
This scheduler implements hierarchical -+weight-based cgroup CPU control by flattening the cgroup hierarchy into a single -+layer, by compounding the active weight share at each level. The effect of this -+is a much more performant CPU controller, which does not need to descend down -+cgroup trees in order to properly compute a cgroup's share. -+ -+Similar to scx_simple, in limited scenarios, this scheduler can perform -+reasonably well on single socket-socket systems with a unified L3 cache and show -+significantly lowered hierarchical scheduling overhead. -+ -+ -+# Troubleshooting -+ -+There are a number of common issues that you may run into when building the -+schedulers. We'll go over some of the common ones here. -+ -+## Build Failures -+ -+### Old version of clang -+ -+``` -+error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ ^~~~~~~~~~~~~~~~~~~~ -+1 error generated. -+``` -+ -+This means you built the kernel or the schedulers with an older version of -+clang than what's supported (i.e. older than 16.0.0). To remediate this: -+ -+1. `which clang` to make sure you're using a sufficiently new version of clang. -+ -+2. `make fullclean` in the root path of the repository, and rebuild the kernel -+ and schedulers. -+ -+3. Rebuild the kernel, and then your example schedulers. -+ -+The schedulers are also cleaned if you invoke `make mrproper` in the root -+directory of the tree. -+ -+### Stale kernel build / incomplete vmlinux.h file -+ -+As described above, you'll need a `vmlinux.h` file that was generated from a -+vmlinux built with BTF, and with sched_ext support enabled. If you don't, -+you'll see errors such as the following which indicate that a type being -+referenced in a scheduler is unknown: -+ -+``` -+/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' -+ -+const struct scx_exit_info *ei) -+ -+^ -+``` -+ -+In order to resolve this, please follow the steps above in -+[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your -+schedulers are using a vmlinux.h file that includes the requisite types. -+ -+## Misc -+ -+### llvm: [OFF] -+ -+You may see the following output when building the schedulers: -+ -+``` -+Auto-detecting system features: -+... clang-bpf-co-re: [ on ] -+... llvm: [ OFF ] -+... libcap: [ on ] -+... libbfd: [ on ] -+``` -+ -+Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. -diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h -new file mode 100644 -index 000000000000..ad7d139ce907 ---- /dev/null -+++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h -@@ -0,0 +1,11 @@ -+/* -+ * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when -+ * compiling BPF files although its content doesn't play any role. The file in -+ * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is -+ * defined. When compiling a BPF source, __x86_64__ isn't set and thus -+ * stubs-32.h is selected. However, the file is not there if the system doesn't -+ * have 32bit glibc devel package installed leading to a build failure. -+ * -+ * The problem is worked around by making this file available in the include -+ * search paths before the system one when building BPF. 
-+ */ -diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h -new file mode 100644 -index 000000000000..d0b708e959c1 ---- /dev/null -+++ b/tools/sched_ext/include/scx/common.bpf.h -@@ -0,0 +1,349 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __SCX_COMMON_BPF_H -+#define __SCX_COMMON_BPF_H -+ -+#include "vmlinux.h" -+#include -+#include -+#include -+#include "user_exit_info.h" -+ -+#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ -+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ -+#define PF_EXITING 0x00000004 -+#define CLOCK_MONOTONIC 1 -+ -+/* -+ * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can -+ * lead to really confusing misbehaviors. Let's trigger a build failure. -+ */ -+static inline void ___vmlinux_h_sanity_check___(void) -+{ -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); -+} -+ -+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; -+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; -+u32 scx_bpf_dispatch_nr_slots(void) __ksym; -+void scx_bpf_dispatch_cancel(void) __ksym; -+bool scx_bpf_consume(u64 dsq_id) __ksym; -+bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p) __ksym __weak; -+u32 scx_bpf_reenqueue_local(void) __ksym; -+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; -+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; -+void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; -+int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, bool rev) __ksym __weak; -+struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; -+void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; -+void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; -+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; -+void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; -+u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; -+u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; -+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; -+u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; -+const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; -+const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; -+void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; -+const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; -+const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; -+void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; -+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; -+s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+bool scx_bpf_task_running(const struct task_struct *p) __ksym; -+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; -+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; -+ -+/* -+ * Use the following as @it when calling scx_bpf_consume_task() from 
whitin -+ * bpf_for_each() loops. -+ */ -+#define BPF_FOR_EACH_ITER (&___it) -+ -+/* hopefully temporary wrapper to work around BPF restriction */ -+static inline bool scx_bpf_consume_task(struct bpf_iter_scx_dsq *it, -+ struct task_struct *p) -+{ -+ unsigned long ptr; -+ bpf_probe_read_kernel(&ptr, sizeof(ptr), it); -+ return __scx_bpf_consume_task(ptr, p); -+} -+ -+static inline __attribute__((format(printf, 1, 2))) -+void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} -+ -+/* -+ * Helper macro for initializing the fmt and variadic argument inputs to both -+ * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to -+ * refer to the initialized list of inputs to the bstr kfunc. -+ */ -+#define scx_bpf_bstr_preamble(fmt, args...) \ -+ static char ___fmt[] = fmt; \ -+ /* \ -+ * Note that __param[] must have at least one \ -+ * element to keep the verifier happy. \ -+ */ \ -+ unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ -+ \ -+ _Pragma("GCC diagnostic push") \ -+ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ -+ ___bpf_fill(___param, args); \ -+ _Pragma("GCC diagnostic pop") \ -+ -+/* -+ * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments -+ * instead of an array of u64. Using this macro will cause the scheduler to -+ * exit cleanly with the specified exit code being passed to user space. -+ */ -+#define scx_bpf_exit(code, fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments -+ * instead of an array of u64. Invoking this macro will cause the scheduler to -+ * exit in an erroneous state, with diagnostic information being passed to the -+ * user. -+ */ -+#define scx_bpf_error(fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments -+ * instead of an array of u64. To be used from ops.dump() and friends. -+ */ -+#define scx_bpf_dump(fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+#define BPF_STRUCT_OPS(name, args...) \ -+SEC("struct_ops/"#name) \ -+BPF_PROG(name, ##args) -+ -+#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ -+SEC("struct_ops.s/"#name) \ -+BPF_PROG(name, ##args) -+ -+/** -+ * RESIZABLE_ARRAY - Generates annotations for an array that may be resized -+ * @elfsec: the data section of the BPF program in which to place the array -+ * @arr: the name of the array -+ * -+ * libbpf has an API for setting map value sizes. Since data sections (i.e. -+ * bss, data, rodata) themselves are maps, a data section can be resized. If -+ * a data section has an array as its last element, the BTF info for that -+ * array will be adjusted so that length of the array is extended to meet the -+ * new length of the data section. This macro annotates an array to have an -+ * element count of one with the assumption that this array can be resized -+ * within the userspace program. It also annotates the section specifier so -+ * this array exists in a custom sub data section which can be resized -+ * independently. 
-+ * -+ * See RESIZE_ARRAY() for the userspace convenience macro for resizing an -+ * array declared with RESIZABLE_ARRAY(). -+ */ -+#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) -+ -+/** -+ * MEMBER_VPTR - Obtain the verified pointer to a struct or array member -+ * @base: struct or array to index -+ * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) -+ * -+ * The verifier often gets confused by the instruction sequence the compiler -+ * generates for indexing struct fields or arrays. This macro forces the -+ * compiler to generate a code sequence which first calculates the byte offset, -+ * checks it against the struct or array size and add that byte offset to -+ * generate the pointer to the member to help the verifier. -+ * -+ * Ideally, we want to abort if the calculated offset is out-of-bounds. However, -+ * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller -+ * must check for %NULL and take appropriate action to appease the verifier. To -+ * avoid confusing the verifier, it's best to check for %NULL and dereference -+ * immediately. -+ * -+ * vptr = MEMBER_VPTR(my_array, [i][j]); -+ * if (!vptr) -+ * return error; -+ * *vptr = new_value; -+ * -+ * sizeof(@base) should encompass the memory area to be accessed and thus can't -+ * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of -+ * `MEMBER_VPTR(ptr, ->member)`. -+ */ -+#define MEMBER_VPTR(base, member) (typeof((base) member) *) \ -+({ \ -+ u64 __base = (u64)&(base); \ -+ u64 __addr = (u64)&((base) member) - __base; \ -+ _Static_assert(sizeof(base) >= sizeof((base) member), \ -+ "@base is smaller than @member, is @base a pointer?"); \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"i"(sizeof(base) - sizeof((base) member))); \ -+ __addr; \ -+}) -+ -+/** -+ * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element -+ * @arr: array to index into -+ * @i: array index -+ * @n: number of elements in array -+ * -+ * Similar to MEMBER_VPTR() but is intended for use with arrays where the -+ * element count needs to be explicit. -+ * It can be used in cases where a global array is defined with an initial -+ * size but is intended to be be resized before loading the BPF program. -+ * Without this version of the macro, MEMBER_VPTR() will use the compile time -+ * size of the array to compute the max, which will result in rejection by -+ * the verifier. -+ */ -+#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ -+({ \ -+ u64 __base = (u64)arr; \ -+ u64 __addr = (u64)&(arr[i]) - __base; \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ -+ __addr; \ -+}) -+ -+/* -+ * BPF core and other generic helpers -+ */ -+ -+/* list and rbtree */ -+#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) -+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) -+ -+/* -+ * bpf_log2 - Compute the base 2 logarithm of a 32-bit exponential value. -+ * @v: The value for which we're computing the base 2 logarithm. 
-+ */ -+static inline u32 bpf_log2(u32 v) -+{ -+ u32 r; -+ u32 shift; -+ -+ r = (v > 0xFFFF) << 4; v >>= r; -+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift; -+ shift = (v > 0xF) << 2; v >>= shift; r |= shift; -+ shift = (v > 0x3) << 1; v >>= shift; r |= shift; -+ r |= (v >> 1); -+ return r; -+} -+ -+/* -+ * bpf_log2l - Compute the base 2 logarithm of a 64-bit exponential value. -+ * @v: The value for which we're computing the base 2 logarithm. -+ */ -+static inline u32 bpf_log2l(u64 v) -+{ -+ u32 hi = v >> 32; -+ if (hi) -+ return bpf_log2(hi) + 32 + 1; -+ else -+ return bpf_log2(v) + 1; -+} -+ -+/* useful compiler attributes */ -+#define likely(x) __builtin_expect(!!(x), 1) -+#define unlikely(x) __builtin_expect(!!(x), 0) -+#define __maybe_unused __attribute__((__unused__)) -+ -+ -+void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; -+void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; -+ -+#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) -+#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) -+ -+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; -+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; -+struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, -+ struct bpf_rb_node *node) __ksym; -+int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, -+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), -+ void *meta, __u64 off) __ksym; -+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) -+ -+struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; -+ -+void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; -+#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) -+ -+/* task */ -+struct task_struct *bpf_task_from_pid(s32 pid) __ksym; -+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; -+void bpf_task_release(struct task_struct *p) __ksym; -+ -+/* cgroup */ -+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; -+void bpf_cgroup_release(struct cgroup *cgrp) __ksym; -+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; -+ -+/* cpumask */ -+struct bpf_cpumask *bpf_cpumask_create(void) __ksym; -+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_xor(struct bpf_cpumask *dst, const 
struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; -+u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+ -+/* rcu */ -+void bpf_rcu_read_lock(void) __ksym; -+void bpf_rcu_read_unlock(void) __ksym; -+ -+#include "compat.bpf.h" -+ -+#endif /* __SCX_COMMON_BPF_H */ -diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h -new file mode 100644 -index 000000000000..8d5a6775f64d ---- /dev/null -+++ b/tools/sched_ext/include/scx/common.h -@@ -0,0 +1,71 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#ifndef __SCHED_EXT_COMMON_H -+#define __SCHED_EXT_COMMON_H -+ -+#ifdef __KERNEL__ -+#error "Should not be included by BPF programs" -+#endif -+ -+#include -+#include -+#include -+#include -+#include -+ -+typedef uint8_t u8; -+typedef uint16_t u16; -+typedef uint32_t u32; -+typedef uint64_t u64; -+typedef int8_t s8; -+typedef int16_t s16; -+typedef int32_t s32; -+typedef int64_t s64; -+ -+#define SCX_BUG(__fmt, ...) \ -+ do { \ -+ fprintf(stderr, "%s:%d [scx panic]: %s\n", __FILE__, __LINE__, \ -+ strerror(errno)); \ -+ fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ -+ fprintf(stderr, "\n"); \ -+ \ -+ exit(EXIT_FAILURE); \ -+ } while (0) -+ -+#define SCX_BUG_ON(__cond, __fmt, ...) \ -+ do { \ -+ if (__cond) \ -+ SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ -+ } while (0) -+ -+/** -+ * RESIZE_ARRAY - Convenience macro for resizing a BPF array -+ * @elfsec: the data section of the BPF program in which to the array exists -+ * @arr: the name of the array -+ * @n: the desired array element count -+ * -+ * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two -+ * operations. It resizes the map which corresponds to the custom data -+ * section that contains the target array. As a side effect, the BTF info for -+ * the array is adjusted so that the array length is sized to cover the new -+ * data section size. The second operation is reassigning the skeleton pointer -+ * for that custom data section so that it points to the newly memory mapped -+ * region. -+ */ -+#define RESIZE_ARRAY(elfsec, arr, n) \ -+ do { \ -+ size_t __sz; \ -+ bpf_map__set_value_size(skel->maps.elfsec##_##arr, \ -+ sizeof(skel->elfsec##_##arr->arr[0]) * (n)); \ -+ skel->elfsec##_##arr = \ -+ bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ -+ } while (0) -+ -+#include "user_exit_info.h" -+#include "compat.h" -+ -+#endif /* __SCHED_EXT_COMMON_H */ -diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h -new file mode 100644 -index 000000000000..914baac2e965 ---- /dev/null -+++ b/tools/sched_ext/include/scx/compat.bpf.h -@@ -0,0 +1,120 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 Tejun Heo -+ * Copyright (c) 2024 David Vernet -+ */ -+#ifndef __SCX_COMPAT_BPF_H -+#define __SCX_COMPAT_BPF_H -+ -+#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ -+({ \ -+ __type __ret = 0; \ -+ if (bpf_core_enum_value_exists(__type, __ent)) \ -+ __ret = __ent; \ -+ __ret; \ -+}) -+ -+/* -+ * %SCX_KICK_IDLE is a later addition. To support both before and after, use -+ * %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it. -+ * Users can use %SCX_KICK_IDLE directly in the future. -+ */ -+#define __COMPAT_SCX_KICK_IDLE \ -+ __COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE) -+ -+/* -+ * scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See -+ * %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h. This can be dropped in the -+ * future. -+ */ -+void scx_bpf_switch_all(void) __ksym __weak; -+ -+static inline void __COMPAT_scx_bpf_switch_all(void) -+{ -+ if (!bpf_core_enum_value_exists(enum scx_ops_flags, SCX_OPS_SWITCH_PARTIAL)) -+ scx_bpf_switch_all(); -+} -+ -+/* -+ * scx_bpf_exit() is a new addition. Fall back to scx_bpf_error() if -+ * unavailable. Users can use scx_bpf_exit() directly in the future. -+ */ -+#define __COMPAT_scx_bpf_exit(code, fmt, args...) \ -+({ \ -+ if (bpf_ksym_exists(scx_bpf_exit_bstr)) \ -+ scx_bpf_exit((code), fmt, ##args); \ -+ else \ -+ scx_bpf_error(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_dump() is a new addition. Ignore if unavailable. Users can use -+ * scx_bpf_dump() directly in the future. -+ */ -+#define __COMPAT_scx_bpf_dump(fmt, args...) \ -+({ \ -+ if (bpf_ksym_exists(scx_bpf_dump_bstr)) \ -+ scx_bpf_dump(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. No good -+ * way to noop these kfuncs. Provide a test macro. Users can assume existence in -+ * the future. -+ */ -+#define __COMPAT_HAS_CPUMASKS \ -+ bpf_ksym_exists(scx_bpf_nr_cpu_ids) -+ -+/* -+ * cpuperf is new. The followings become noop on older kernels. Callers can be -+ * updated to call cpuperf kfuncs directly in the future. -+ */ -+static inline u32 __COMPAT_scx_bpf_cpuperf_cap(s32 cpu) -+{ -+ if (bpf_ksym_exists(scx_bpf_cpuperf_cap)) -+ return scx_bpf_cpuperf_cap(cpu); -+ else -+ return 1024; -+} -+ -+static inline u32 __COMPAT_scx_bpf_cpuperf_cur(s32 cpu) -+{ -+ if (bpf_ksym_exists(scx_bpf_cpuperf_cur)) -+ return scx_bpf_cpuperf_cur(cpu); -+ else -+ return 1024; -+} -+ -+static inline void __COMPAT_scx_bpf_cpuperf_set(s32 cpu, u32 perf) -+{ -+ if (bpf_ksym_exists(scx_bpf_cpuperf_set)) -+ return scx_bpf_cpuperf_set(cpu, perf); -+} -+ -+/* -+ * Iteration and scx_bpf_consume_task() are new. The following become noop on -+ * older kernels. The users can switch to bpf_for_each(scx_dsq) and directly -+ * call scx_bpf_consume_task() in the future. -+ */ -+#define __COMPAT_DSQ_FOR_EACH(p, dsq_id, flags) \ -+ if (bpf_ksym_exists(bpf_iter_scx_dsq_new)) \ -+ bpf_for_each(scx_dsq, (p), (dsq_id), (flags)) -+ -+static inline bool __COMPAT_scx_bpf_consume_task(struct bpf_iter_scx_dsq *it, -+ struct task_struct *p) -+{ -+ return false; -+} -+ -+/* -+ * Define sched_ext_ops. This may be expanded to define multiple variants for -+ * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). -+ */ -+#define SCX_OPS_DEFINE(__name, ...) 
\ -+ SEC(".struct_ops.link") \ -+ struct sched_ext_ops __name = { \ -+ __VA_ARGS__, \ -+ }; -+ -+#endif /* __SCX_COMPAT_BPF_H */ -diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h -new file mode 100644 -index 000000000000..47ec920f8776 ---- /dev/null -+++ b/tools/sched_ext/include/scx/compat.h -@@ -0,0 +1,208 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 Tejun Heo -+ * Copyright (c) 2024 David Vernet -+ */ -+#ifndef __SCX_COMPAT_H -+#define __SCX_COMPAT_H -+ -+#include -+#include -+#include -+#include -+ -+struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); -+ -+static inline void __COMPAT_load_vmlinux_btf(void) -+{ -+ if (!__COMPAT_vmlinux_btf) { -+ __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); -+ SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); -+ } -+} -+ -+static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) -+{ -+ const struct btf_type *t; -+ const char *n; -+ s32 tid; -+ int i; -+ -+ __COMPAT_load_vmlinux_btf(); -+ -+ tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); -+ if (tid < 0) -+ return false; -+ -+ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); -+ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); -+ -+ if (btf_is_enum(t)) { -+ struct btf_enum *e = btf_enum(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, name)) { -+ *v = e[i].val; -+ return true; -+ } -+ } -+ } else if (btf_is_enum64(t)) { -+ struct btf_enum64 *e = btf_enum64(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, name)) { -+ *v = btf_enum64_value(&e[i]); -+ return true; -+ } -+ } -+ } -+ -+ return false; -+} -+ -+#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ -+({ \ -+ u64 __val = 0; \ -+ __COMPAT_read_enum(__type, __ent, &__val); \ -+ __val; \ -+}) -+ -+static inline bool __COMPAT_has_ksym(const char *ksym) -+{ -+ __COMPAT_load_vmlinux_btf(); -+ return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0; -+} -+ -+static inline bool __COMPAT_struct_has_field(const char *type, const char *field) -+{ -+ const struct btf_type *t; -+ const struct btf_member *m; -+ const char *n; -+ s32 tid; -+ int i; -+ -+ __COMPAT_load_vmlinux_btf(); -+ tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); -+ if (tid < 0) -+ return false; -+ -+ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); -+ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); -+ -+ m = btf_members(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, field)) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * An ops flag, %SCX_OPS_SWITCH_PARTIAL, replaced scx_bpf_switch_all() which had -+ * to be called from ops.init(). To support both before and after, use both -+ * %__COMPAT_SCX_OPS_SWITCH_PARTIAL and %__COMPAT_scx_bpf_switch_all() defined -+ * in compat.bpf.h. Users can switch to directly using %SCX_OPS_SWITCH_PARTIAL -+ * in the future. -+ */ -+#define __COMPAT_SCX_OPS_SWITCH_PARTIAL \ -+ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") -+ -+/* -+ * scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. 
Users -+ * will be able to assume existence in the future. -+ */ -+#define __COMPAT_HAS_CPUMASKS \ -+ __COMPAT_has_ksym("scx_bpf_nr_cpu_ids") -+ -+/* -+ * DSQ iterator is new. Users will be able to assume existence in the future. -+ */ -+#define __COMPAT_HAS_DSQ_ITER \ -+ __COMPAT_has_ksym("bpf_iter_scx_dsq_new") -+ -+static inline long scx_hotplug_seq(void) -+{ -+ int fd; -+ char buf[32]; -+ ssize_t len; -+ long val; -+ -+ fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); -+ if (fd < 0) -+ return -ENOENT; -+ -+ len = read(fd, buf, sizeof(buf) - 1); -+ SCX_BUG_ON(len <= 0, "read failed (%ld)", len); -+ buf[len] = 0; -+ close(fd); -+ -+ val = strtoul(buf, NULL, 10); -+ SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); -+ -+ return val; -+} -+ -+/* -+ * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() -+ * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load -+ * and attach it, backward compatibility is automatically maintained where -+ * reasonable. -+ * -+ * - ops.tick(): Ignored on older kernels with a warning. -+ * - ops.dump*(): Ignored on older kernels with a warning. -+ * - ops.exit_dump_len: Cleared to zero on older kernels with a warning. -+ * - ops.hotplug_seq: Ignored on older kernels. -+ */ -+#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ -+ struct __scx_name *__skel; \ -+ \ -+ __skel = __scx_name##__open(); \ -+ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ -+ \ -+ if (__COMPAT_struct_has_field("sched_ext_ops", "hotplug_seq")) \ -+ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ -+ __skel; \ -+}) -+ -+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ -+ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ -+ if (!__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \ -+ (__skel)->struct_ops.__ops_name->exit_dump_len) { \ -+ fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \ -+ (__skel)->struct_ops.__ops_name->exit_dump_len = 0; \ -+ } \ -+ if (!__COMPAT_struct_has_field("sched_ext_ops", "tick") && \ -+ (__skel)->struct_ops.__ops_name->tick) { \ -+ fprintf(stderr, "WARNING: kernel doesn't support ops.tick()\n"); \ -+ (__skel)->struct_ops.__ops_name->tick = NULL; \ -+ } \ -+ if (!__COMPAT_struct_has_field("sched_ext_ops", "dump") && \ -+ ((__skel)->struct_ops.__ops_name->dump || \ -+ (__skel)->struct_ops.__ops_name->dump_cpu || \ -+ (__skel)->struct_ops.__ops_name->dump_task)) { \ -+ fprintf(stderr, "WARNING: kernel doesn't support ops.dump*()\n"); \ -+ (__skel)->struct_ops.__ops_name->dump = NULL; \ -+ (__skel)->struct_ops.__ops_name->dump_cpu = NULL; \ -+ (__skel)->struct_ops.__ops_name->dump_task = NULL; \ -+ } \ -+ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ -+}) -+ -+#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ -+ struct bpf_link *__link; \ -+ SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ -+ __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ -+ SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ -+ __link; \ -+}) -+ -+#endif /* __SCX_COMPAT_H */ -diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h -new file mode 100644 -index 000000000000..2d86d01a9575 ---- /dev/null -+++ b/tools/sched_ext/include/scx/user_exit_info.h -@@ -0,0 +1,111 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Define struct user_exit_info which is shared between BPF and userspace parts -+ * to communicate exit status and 
other information. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __USER_EXIT_INFO_H -+#define __USER_EXIT_INFO_H -+ -+enum uei_sizes { -+ UEI_REASON_LEN = 128, -+ UEI_MSG_LEN = 1024, -+ UEI_DUMP_DFL_LEN = 32768, -+}; -+ -+struct user_exit_info { -+ int kind; -+ s64 exit_code; -+ char reason[UEI_REASON_LEN]; -+ char msg[UEI_MSG_LEN]; -+}; -+ -+#ifdef __bpf__ -+ -+#include "vmlinux.h" -+#include -+ -+#define UEI_DEFINE(__name) \ -+ char RESIZABLE_ARRAY(data, __name##_dump); \ -+ const volatile u32 __name##_dump_len; \ -+ struct user_exit_info __name SEC(".data") -+ -+#define UEI_RECORD(__uei_name, __ei) ({ \ -+ bpf_probe_read_kernel_str(__uei_name.reason, \ -+ sizeof(__uei_name.reason), (__ei)->reason); \ -+ bpf_probe_read_kernel_str(__uei_name.msg, \ -+ sizeof(__uei_name.msg), (__ei)->msg); \ -+ bpf_probe_read_kernel_str(__uei_name##_dump, \ -+ __uei_name##_dump_len, (__ei)->dump); \ -+ if (bpf_core_field_exists((__ei)->exit_code)) \ -+ __uei_name.exit_code = (__ei)->exit_code; \ -+ /* use __sync to force memory barrier */ \ -+ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ -+ (__ei)->kind); \ -+}) -+ -+#else /* !__bpf__ */ -+ -+#include -+#include -+ -+/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ -+#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ -+ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ -+ (__skel)->rodata->__uei_name##_dump_len = __len; \ -+ RESIZE_ARRAY(data, __uei_name##_dump, __len); \ -+}) -+ -+#define UEI_EXITED(__skel, __uei_name) ({ \ -+ /* use __sync to force memory barrier */ \ -+ __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ -+}) -+ -+#define UEI_REPORT(__skel, __uei_name) ({ \ -+ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ -+ char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ -+ if (__uei_dump[0] != '\0') { \ -+ fputs("\nDEBUG DUMP\n", stderr); \ -+ fputs("================================================================================\n\n", stderr); \ -+ fputs(__uei_dump, stderr); \ -+ fputs("\n================================================================================\n\n", stderr); \ -+ } \ -+ fprintf(stderr, "EXIT: %s", __uei->reason); \ -+ if (__uei->msg[0] != '\0') \ -+ fprintf(stderr, " (%s)", __uei->msg); \ -+ fputs("\n", stderr); \ -+ __uei->exit_code; \ -+}) -+ -+/* -+ * We can't import vmlinux.h while compiling user C code. Let's duplicate -+ * scx_exit_code definition. -+ */ -+enum scx_exit_code { -+ /* Reasons */ -+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, -+ -+ /* Actions */ -+ SCX_ECODE_ACT_RESTART = 1LLU << 48, -+}; -+ -+enum uei_ecode_mask { -+ UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), -+ UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, -+ UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, -+}; -+ -+/* -+ * These macro interpret the ecode returned from UEI_REPORT(). 
-+ */ -+#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) -+#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) -+#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) -+ -+#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -+ -+#endif /* __bpf__ */ -+#endif /* __USER_EXIT_INFO_H */ -diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c -new file mode 100644 -index 000000000000..b297ccbd70b4 ---- /dev/null -+++ b/tools/sched_ext/scx_central.bpf.c -@@ -0,0 +1,362 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A central FIFO sched_ext scheduler which demonstrates the followings: -+ * -+ * a. Making all scheduling decisions from one CPU: -+ * -+ * The central CPU is the only one making scheduling decisions. All other -+ * CPUs kick the central CPU when they run out of tasks to run. -+ * -+ * There is one global BPF queue and the central CPU schedules all CPUs by -+ * dispatching from the global queue to each CPU's local dsq from dispatch(). -+ * This isn't the most straightforward. e.g. It'd be easier to bounce -+ * through per-CPU BPF queues. The current design is chosen to maximally -+ * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. -+ * -+ * b. Tickless operation -+ * -+ * All tasks are dispatched with the infinite slice which allows stopping the -+ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full -+ * parameter. The tickless operation can be observed through -+ * /proc/interrupts. -+ * -+ * Periodic switching is enforced by a periodic timer checking all CPUs and -+ * preempting them as necessary. Unfortunately, BPF timer currently doesn't -+ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to -+ * the central CPU. -+ * -+ * c. Preemption -+ * -+ * Kthreads are unconditionally queued to the head of a matching local dsq -+ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always -+ * prioritized over user threads, which is required for ensuring forward -+ * progress as e.g. the periodic timer may run on a ksoftirqd and if the -+ * ksoftirqd gets starved by a user thread, there may not be anything else to -+ * vacate that user thread. -+ * -+ * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the -+ * next tasks. -+ * -+ * This scheduler is designed to maximize usage of various SCX mechanisms. A -+ * more practical implementation would likely put the scheduling loop outside -+ * the central CPU's dispatch() path and add some form of priority mechanism. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+enum { -+ FALLBACK_DSQ_ID = 0, -+ MS_TO_NS = 1000LLU * 1000, -+ TIMER_INTERVAL_NS = 1 * MS_TO_NS, -+}; -+ -+const volatile s32 central_cpu; -+const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+ -+bool timer_pinned = true; -+u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; -+u64 nr_overflows; -+ -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, s32); -+} central_q SEC(".maps"); -+ -+/* can't use percpu map due to bad lookups */ -+bool RESIZABLE_ARRAY(data, cpu_gimme_task); -+u64 RESIZABLE_ARRAY(data, cpu_started_at); -+ -+struct central_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct central_timer); -+} central_timer SEC(".maps"); -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* -+ * Steer wakeups to the central CPU as much as possible to avoid -+ * disturbing other CPUs. It's safe to blindly return the central cpu as -+ * select_cpu() is a hint and if @p can't be on it, the kernel will -+ * automatically pick a fallback CPU. -+ */ -+ return central_cpu; -+} -+ -+void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ s32 pid = p->pid; -+ -+ __sync_fetch_and_add(&nr_total, 1); -+ -+ /* -+ * Push per-cpu kthreads at the head of local dsq's and preempt the -+ * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked -+ * behind other threads which is necessary for forward progress -+ * guarantee as we depend on the BPF timer which may run from ksoftirqd. -+ */ -+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { -+ __sync_fetch_and_add(&nr_locals, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, -+ enq_flags | SCX_ENQ_PREEMPT); -+ return; -+ } -+ -+ if (bpf_map_push_elem(¢ral_q, &pid, 0)) { -+ __sync_fetch_and_add(&nr_overflows, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_queued, 1); -+ -+ if (!scx_bpf_task_running(p)) -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+} -+ -+static bool dispatch_to_cpu(s32 cpu) -+{ -+ struct task_struct *p; -+ s32 pid; -+ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (bpf_map_pop_elem(¢ral_q, &pid)) -+ break; -+ -+ __sync_fetch_and_sub(&nr_queued, 1); -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) { -+ __sync_fetch_and_add(&nr_lost_pids, 1); -+ continue; -+ } -+ -+ /* -+ * If we can't run the task at the top, do the dumb thing and -+ * bounce it to the fallback dsq. -+ */ -+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ __sync_fetch_and_add(&nr_mismatches, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); -+ bpf_task_release(p); -+ /* -+ * We might run out of dispatch buffer slots if we continue dispatching -+ * to the fallback DSQ, without dispatching to the local DSQ of the -+ * target CPU. In such a case, break the loop now as will fail the -+ * next dispatch operation. 
-+ */ -+ if (!scx_bpf_dispatch_nr_slots()) -+ break; -+ continue; -+ } -+ -+ /* dispatch to local and mark that @cpu doesn't need more */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); -+ -+ if (cpu != central_cpu) -+ scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE); -+ -+ bpf_task_release(p); -+ return true; -+ } -+ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ if (cpu == central_cpu) { -+ /* dispatch for all other CPUs first */ -+ __sync_fetch_and_add(&nr_dispatches, 1); -+ -+ bpf_for(cpu, 0, nr_cpu_ids) { -+ bool *gimme; -+ -+ if (!scx_bpf_dispatch_nr_slots()) -+ break; -+ -+ /* central's gimme is never set */ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme && !*gimme) -+ continue; -+ -+ if (dispatch_to_cpu(cpu)) -+ *gimme = false; -+ } -+ -+ /* -+ * Retry if we ran out of dispatch buffer slots as we might have -+ * skipped some CPUs and also need to dispatch for self. The ext -+ * core automatically retries if the local dsq is empty but we -+ * can't rely on that as we're dispatching for other CPUs too. -+ * Kick self explicitly to retry. -+ */ -+ if (!scx_bpf_dispatch_nr_slots()) { -+ __sync_fetch_and_add(&nr_retries, 1); -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ return; -+ } -+ -+ /* look for a task to run on the central CPU */ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ dispatch_to_cpu(central_cpu); -+ } else { -+ bool *gimme; -+ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme) -+ *gimme = true; -+ -+ /* -+ * Force dispatch on the scheduling CPU so that it finds a task -+ * to run for us. -+ */ -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ } -+} -+ -+void BPF_STRUCT_OPS(central_running, struct task_struct *p) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ -+} -+ -+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = 0; -+} -+ -+static int central_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ u64 nr_to_kick = nr_queued; -+ s32 i, curr_cpu; -+ -+ curr_cpu = bpf_get_smp_processor_id(); -+ if (timer_pinned && (curr_cpu != central_cpu)) { -+ scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", -+ curr_cpu, central_cpu); -+ return 0; -+ } -+ -+ bpf_for(i, 0, nr_cpu_ids) { -+ s32 cpu = (nr_timers + i) % nr_cpu_ids; -+ u64 *started_at; -+ -+ if (cpu == central_cpu) -+ continue; -+ -+ /* kick iff the current one exhausted its slice */ -+ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at && *started_at && -+ vtime_before(now, *started_at + slice_ns)) -+ continue; -+ -+ /* and there's something pending */ -+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || -+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) -+ ; -+ else if (nr_to_kick) -+ nr_to_kick--; -+ else -+ continue; -+ -+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); -+ } -+ -+ bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); -+ __sync_fetch_and_add(&nr_timers, 1); -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(central_init) -+{ -+ u32 key = 0; -+ struct bpf_timer *timer; -+ int ret; -+ -+ __COMPAT_scx_bpf_switch_all(); -+ 
ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); -+ if (ret) -+ return ret; -+ -+ timer = bpf_map_lookup_elem(¢ral_timer, &key); -+ if (!timer) -+ return -ESRCH; -+ -+ if (bpf_get_smp_processor_id() != central_cpu) { -+ scx_bpf_error("init from non-central CPU"); -+ return -EINVAL; -+ } -+ -+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); -+ bpf_timer_set_callback(timer, central_timerfn); -+ -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); -+ /* -+ * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a -+ * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. -+ * Retry without the PIN. This would be the perfect use case for -+ * bpf_core_enum_value_exists() but the enum type doesn't have a name -+ * and can't be used with bpf_core_enum_value_exists(). Oh well... -+ */ -+ if (ret == -EINVAL) { -+ timer_pinned = false; -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); -+ } -+ if (ret) -+ scx_bpf_error("bpf_timer_start failed (%d)", ret); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(central_ops, -+ /* -+ * We are offloading all scheduling decisions to the central CPU -+ * and thus being the last task on a given CPU doesn't mean -+ * anything special. Enqueue the last tasks like any other tasks. -+ */ -+ .flags = SCX_OPS_ENQ_LAST, -+ -+ .select_cpu = (void *)central_select_cpu, -+ .enqueue = (void *)central_enqueue, -+ .dispatch = (void *)central_dispatch, -+ .running = (void *)central_running, -+ .stopping = (void *)central_stopping, -+ .init = (void *)central_init, -+ .exit = (void *)central_exit, -+ .name = "central"); -diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c -new file mode 100644 -index 000000000000..1e0568624ccc ---- /dev/null -+++ b/tools/sched_ext/scx_central.c -@@ -0,0 +1,135 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define _GNU_SOURCE -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_central.bpf.skel.h" -+ -+const char help_fmt[] = -+"A central FIFO sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-c CPU]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -c CPU Override the central CPU (default: 0)\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_central *skel; -+ struct bpf_link *link; -+ __u64 seq = 0, ecode; -+ __s32 opt; -+ cpu_set_t *cpuset; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(central_ops, scx_central); -+ -+ skel->rodata->central_cpu = 0; -+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'c': -+ skel->rodata->central_cpu = strtoul(optarg, NULL, 0); -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ /* Resize arrays so their element count is equal to cpu count. */ -+ RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids); -+ -+ SCX_OPS_LOAD(skel, central_ops, scx_central, uei); -+ -+ /* -+ * Affinitize the loading thread to the central CPU, as: -+ * - That's where the BPF timer is first invoked in the BPF program. -+ * - We probably don't want this user space component to take up a core -+ * from a task that would benefit from avoiding preemption on one of -+ * the tickless cores. -+ * -+ * Until BPF supports pinning the timer, it's not guaranteed that it -+ * will always be invoked on the central CPU. In practice, this -+ * suffices the majority of the time. 
-+ */ -+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); -+ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); -+ CPU_ZERO(cpuset); -+ CPU_SET(skel->rodata->central_cpu, cpuset); -+ SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), -+ "Failed to affinitize to central CPU %d (max %d)", -+ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); -+ CPU_FREE(cpuset); -+ -+ link = SCX_OPS_ATTACH(skel, central_ops, scx_central); -+ -+ if (!skel->data->timer_pinned) -+ printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ printf("[SEQ %llu]\n", seq++); -+ printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", -+ skel->bss->nr_total, -+ skel->bss->nr_locals, -+ skel->bss->nr_queued, -+ skel->bss->nr_lost_pids); -+ printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", -+ skel->bss->nr_timers, -+ skel->bss->nr_dispatches, -+ skel->bss->nr_mismatches, -+ skel->bss->nr_retries); -+ printf("overflow:%10" PRIu64 "\n", -+ skel->bss->nr_overflows); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = UEI_REPORT(skel, uei); -+ scx_central__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto restart; -+ return 0; -+} -diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c -new file mode 100644 -index 000000000000..389bea204150 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.bpf.c -@@ -0,0 +1,939 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements -+ * hierarchical weight-based cgroup CPU control by flattening the cgroup -+ * hierarchy into a single layer by compounding the active weight share at each -+ * level. Consider the following hierarchy with weights in parentheses: -+ * -+ * R + A (100) + B (100) -+ * | \ C (100) -+ * \ D (200) -+ * -+ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. -+ * Let's say all three have runnable tasks. The total share that each of these -+ * three cgroups is entitled to can be calculated by compounding its share at -+ * each level. -+ * -+ * For example, B is competing against C and in that competition its share is -+ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's -+ * share in that competition is 100/(200+100) == 1/3. B's eventual share in the -+ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's -+ * eventual shaer is the same at 1/6. D is only competing at the top level and -+ * its share is 200/(100+200) == 2/3. -+ * -+ * So, instead of hierarchically scheduling level-by-level, we can consider it -+ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3 -+ * and keep updating the eventual shares as the cgroups' runnable states change. -+ * -+ * This flattening of hierarchy can bring a substantial performance gain when -+ * the cgroup hierarchy is nested multiple levels. in a simple benchmark using -+ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it -+ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two -+ * apache instances competing with 2:1 weight ratio nested four level deep. -+ * -+ * However, the gain comes at the cost of not being able to properly handle -+ * thundering herd of cgroups. 
For example, if many cgroups which are nested -+ * behind a low priority parent cgroup wake up around the same time, they may be -+ * able to consume more CPU cycles than they are entitled to. In many use cases, -+ * this isn't a real concern especially given the performance gain. Also, there -+ * are ways to mitigate the problem further by e.g. introducing an extra -+ * scheduling layer on cgroup delegation boundaries. -+ * -+ * The scheduler first picks the cgroup to run and then schedule the tasks -+ * within by using nested weighted vtime scheduling by default. The -+ * cgroup-internal scheduling can be switched to FIFO with the -f option. -+ */ -+#include -+#include "scx_flatcg.h" -+ -+/* -+ * Maximum amount of retries to find a valid cgroup. -+ */ -+#define CGROUP_MAX_RETRIES 1024 -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ -+const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; -+const volatile bool fifo_sched; -+ -+u64 cvtime_now; -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, u64); -+ __uint(max_entries, FCG_NR_STATS); -+} stats SEC(".maps"); -+ -+static void stat_inc(enum fcg_stat_idx idx) -+{ -+ u32 idx_v = idx; -+ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+struct fcg_cpu_ctx { -+ u64 cur_cgid; -+ u64 cur_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, struct fcg_cpu_ctx); -+ __uint(max_entries, 1); -+} cpu_ctx SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_cgrp_ctx); -+} cgrp_ctx SEC(".maps"); -+ -+struct cgv_node { -+ struct bpf_rb_node rb_node; -+ __u64 cvtime; -+ __u64 cgid; -+ struct bpf_refcount refcount; -+}; -+ -+private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; -+private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); -+ -+struct cgv_node_stash { -+ struct cgv_node __kptr *node; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __uint(max_entries, 16384); -+ __type(key, __u64); -+ __type(value, struct cgv_node_stash); -+} cgv_node_stash SEC(".maps"); -+ -+struct fcg_task_ctx { -+ u64 bypassed_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_task_ctx); -+} task_ctx SEC(".maps"); -+ -+/* gets inc'd on weight tree changes to expire the cached hweights */ -+u64 hweight_gen = 1; -+ -+static u64 div_round_up(u64 dividend, u64 divisor) -+{ -+ return (dividend + divisor - 1) / divisor; -+} -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) -+{ -+ struct cgv_node *cgc_a, *cgc_b; -+ -+ cgc_a = container_of(a, struct cgv_node, rb_node); -+ cgc_b = container_of(b, struct cgv_node, rb_node); -+ -+ return cgc_a->cvtime < cgc_b->cvtime; -+} -+ -+static struct fcg_cpu_ctx *find_cpu_ctx(void) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ u32 idx = 0; -+ -+ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); -+ if (!cpuc) { -+ scx_bpf_error("cpu_ctx lookup failed"); -+ return NULL; -+ } -+ return cpuc; -+} -+ -+static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ scx_bpf_error("cgrp_ctx 
lookup failed for cgid %llu", cgrp->kn->id); -+ return NULL; -+ } -+ return cgc; -+} -+ -+static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgrp = bpf_cgroup_ancestor(cgrp, level); -+ if (!cgrp) { -+ scx_bpf_error("ancestor cgroup lookup failed"); -+ return NULL; -+ } -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ scx_bpf_error("ancestor cgrp_ctx lookup failed"); -+ bpf_cgroup_release(cgrp); -+ return cgc; -+} -+ -+static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ int level; -+ -+ if (!cgc->nr_active) { -+ stat_inc(FCG_STAT_HWT_SKIP); -+ return; -+ } -+ -+ if (cgc->hweight_gen == hweight_gen) { -+ stat_inc(FCG_STAT_HWT_CACHE); -+ return; -+ } -+ -+ stat_inc(FCG_STAT_HWT_UPDATES); -+ bpf_for(level, 0, cgrp->level + 1) { -+ struct fcg_cgrp_ctx *cgc; -+ bool is_active; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ -+ if (!level) { -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ cgc->hweight_gen = hweight_gen; -+ } else { -+ struct fcg_cgrp_ctx *pcgc; -+ -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ -+ /* -+ * We can be oppotunistic here and not grab the -+ * cgv_tree_lock and deal with the occasional races. -+ * However, hweight updates are already cached and -+ * relatively low-frequency. Let's just do the -+ * straightforward thing. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ is_active = cgc->nr_active; -+ if (is_active) { -+ cgc->hweight_gen = pcgc->hweight_gen; -+ cgc->hweight = -+ div_round_up(pcgc->hweight * cgc->weight, -+ pcgc->child_weight_sum); -+ } -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!is_active) { -+ stat_inc(FCG_STAT_HWT_RACE); -+ break; -+ } -+ } -+ } -+} -+ -+static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) -+{ -+ u64 delta, cvtime, max_budget; -+ -+ /* -+ * A node which is on the rbtree can't be pointed to from elsewhere yet -+ * and thus can't be updated and repositioned. Instead, we collect the -+ * vtime deltas separately and apply it asynchronously here. -+ */ -+ delta = cgc->cvtime_delta; -+ __sync_fetch_and_sub(&cgc->cvtime_delta, delta); -+ cvtime = cgv_node->cvtime + delta; -+ -+ /* -+ * Allow a cgroup to carry the maximum budget proportional to its -+ * hweight such that a full-hweight cgroup can immediately take up half -+ * of the CPUs at the most while staying at the front of the rbtree. 
-+ */ -+ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / -+ (2 * FCG_HWEIGHT_ONE); -+ if (vtime_before(cvtime, cvtime_now - max_budget)) -+ cvtime = cvtime_now - max_budget; -+ -+ cgv_node->cvtime = cvtime; -+} -+ -+static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ struct cgv_node_stash *stash; -+ struct cgv_node *cgv_node; -+ u64 cgid = cgrp->kn->id; -+ -+ /* paired with cmpxchg in try_pick_next_cgroup() */ -+ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { -+ stat_inc(FCG_STAT_ENQ_SKIP); -+ return; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash || !stash->node) { -+ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); -+ return; -+ } -+ -+ cgv_node = bpf_refcount_acquire(stash->node); -+ if (!cgv_node) { -+ /* -+ * Node never leaves cgv_node_stash, this should only happen if -+ * fcg_cgroup_exit deletes the stashed node -+ */ -+ stat_inc(FCG_STAT_ENQ_RACE); -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) -+{ -+ /* -+ * Tell fcg_stopping() that this bypassed the regular scheduling path -+ * and should be force charged to the cgroup. 0 is used to indicate that -+ * the task isn't bypassing, so if the current runtime is 0, go back by -+ * one nanosecond. -+ */ -+ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; -+} -+ -+s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ struct fcg_task_ctx *taskc; -+ bool is_idle = false; -+ s32 cpu; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return cpu; -+ } -+ -+ /* -+ * If select_cpu_dfl() is recommending local enqueue, the target CPU is -+ * idle. Follow it and charge the cgroup later in fcg_stopping() after -+ * the fact. -+ */ -+ if (is_idle) { -+ set_bypassed_at(p, taskc); -+ stat_inc(FCG_STAT_LOCAL); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ } -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * Use the direct dispatching and force charging to deal with tasks with -+ * custom affinities so that we don't have to worry about per-cgroup -+ * dq's containing tasks that can't be executed from some CPUs. -+ */ -+ if (p->nr_cpus_allowed != nr_cpus) { -+ set_bypassed_at(p, taskc); -+ -+ /* -+ * The global dq is deprioritized as we don't want to let tasks -+ * to boost themselves by constraining its cpumask. The -+ * deprioritization is rather severe, so let's not apply that to -+ * per-cpu kernel threads. This is ham-fisted. We probably wanna -+ * implement per-cgroup fallback dq's instead so that we have -+ * more control over when tasks with custom cpumask get issued. 
-+ */ -+ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { -+ stat_inc(FCG_STAT_LOCAL); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ stat_inc(FCG_STAT_GLOBAL); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ return; -+ } -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ goto out_release; -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 tvtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) -+ tvtime = cgc->tvtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, -+ tvtime, enq_flags); -+ } -+ -+ cgrp_enqueued(cgrp, cgc); -+out_release: -+ bpf_cgroup_release(cgrp); -+} -+ -+/* -+ * Walk the cgroup tree to update the active weight sums as tasks wake up and -+ * sleep. The weight sums are used as the base when calculating the proportion a -+ * given cgroup or task is entitled to at each level. -+ */ -+static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ bool updated = false; -+ int idx; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ /* -+ * In most cases, a hot cgroup would have multiple threads going to -+ * sleep and waking up while the whole cgroup stays active. In leaf -+ * cgroups, ->nr_runnable which is updated with __sync operations gates -+ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock -+ * repeatedly for a busy cgroup which is staying active. -+ */ -+ if (runnable) { -+ if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_ACT); -+ } else { -+ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_DEACT); -+ } -+ -+ /* -+ * If @cgrp is becoming runnable, its hweight should be refreshed after -+ * it's added to the weight tree so that enqueue has the up-to-date -+ * value. If @cgrp is becoming quiescent, the hweight should be -+ * refreshed before it's removed from the weight tree so that the usage -+ * charging which happens afterwards has access to the latest value. -+ */ -+ if (!runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ /* propagate upwards */ -+ bpf_for(idx, 0, cgrp->level) { -+ int level = cgrp->level - idx; -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ bool propagate = false; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ if (level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ } -+ -+ /* -+ * We need the propagation protected by a lock to synchronize -+ * against weight changes. There's no reason to drop the lock at -+ * each level but bpf_spin_lock() doesn't want any function -+ * calls while locked. 
-+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ if (runnable) { -+ if (!cgc->nr_active++) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum += cgc->weight; -+ } -+ } -+ } else { -+ if (!--cgc->nr_active) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum -= cgc->weight; -+ } -+ } -+ } -+ -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!propagate) -+ break; -+ } -+ -+ if (updated) -+ __sync_fetch_and_add(&hweight_gen, 1); -+ -+ if (runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+} -+ -+void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, true); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) -+{ -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ if (fifo_sched) -+ return; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ /* -+ * @cgc->tvtime_now always progresses forward as tasks start -+ * executing. The test and update can be performed concurrently -+ * from multiple CPUs and thus racy. Any error should be -+ * contained and temporary. Let's just live with it. -+ */ -+ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) -+ cgc->tvtime_now = p->scx.dsq_vtime; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. -+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. 
-+ */ -+ if (!fifo_sched) -+ p->scx.dsq_vtime += -+ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ if (!taskc->bypassed_at) -+ return; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ p->se.sum_exec_runtime - taskc->bypassed_at); -+ taskc->bypassed_at = 0; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, false); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) -+{ -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ if (cgrp->level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); -+ if (!pcgc) -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ if (pcgc && cgc->nr_active) -+ pcgc->child_weight_sum += (s64)weight - cgc->weight; -+ cgc->weight = weight; -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+static bool try_pick_next_cgroup(u64 *cgidp) -+{ -+ struct bpf_rb_node *rb_node; -+ struct cgv_node *cgv_node; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 cgid; -+ -+ /* pop the front cgroup and wind cvtime_now accordingly */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ rb_node = bpf_rbtree_first(&cgv_tree); -+ if (!rb_node) { -+ bpf_spin_unlock(&cgv_tree_lock); -+ stat_inc(FCG_STAT_PNC_NO_CGRP); -+ *cgidp = 0; -+ return true; -+ } -+ -+ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!rb_node) { -+ /* -+ * This should never happen. bpf_rbtree_first() was called -+ * above while the tree lock was held, so the node should -+ * always be present. -+ */ -+ scx_bpf_error("node could not be removed"); -+ return true; -+ } -+ -+ cgv_node = container_of(rb_node, struct cgv_node, rb_node); -+ cgid = cgv_node->cgid; -+ -+ if (vtime_before(cvtime_now, cgv_node->cvtime)) -+ cvtime_now = cgv_node->cvtime; -+ -+ /* -+ * If lookup fails, the cgroup's gone. Free and move on. See -+ * fcg_cgroup_exit(). -+ */ -+ cgrp = bpf_cgroup_from_id(cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ if (!scx_bpf_consume(cgid)) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_EMPTY); -+ goto out_stash; -+ } -+ -+ /* -+ * Successfully consumed from the cgroup. This will be our current -+ * cgroup for the new slice. Refresh its hweight. -+ */ -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ bpf_cgroup_release(cgrp); -+ -+ /* -+ * As the cgroup may have more tasks, add it back to the rbtree. Note -+ * that here we charge the full slice upfront and then exact later -+ * according to the actual consumption. This prevents lowpri thundering -+ * herd from saturating the machine. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ *cgidp = cgid; -+ stat_inc(FCG_STAT_PNC_NEXT); -+ return true; -+ -+out_stash: -+ /* -+ * Paired with cmpxchg in cgrp_enqueued(). 
If they see the following -+ * transition, they'll enqueue the cgroup. If they are earlier, we'll -+ * see their task in the dq below and requeue the cgroup. -+ */ -+ __sync_val_compare_and_swap(&cgc->queued, 1, 0); -+ -+ if (scx_bpf_dsq_nr_queued(cgid)) { -+ bpf_spin_lock(&cgv_tree_lock); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ stat_inc(FCG_STAT_PNC_RACE); -+ return false; -+ } -+ -+out_free: -+ bpf_obj_drop(cgv_node); -+ return false; -+} -+ -+void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 now = bpf_ktime_get_ns(); -+ bool picked_next = false; -+ -+ cpuc = find_cpu_ctx(); -+ if (!cpuc) -+ return; -+ -+ if (!cpuc->cur_cgid) -+ goto pick_next_cgroup; -+ -+ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { -+ if (scx_bpf_consume(cpuc->cur_cgid)) { -+ stat_inc(FCG_STAT_CNS_KEEP); -+ return; -+ } -+ stat_inc(FCG_STAT_CNS_EMPTY); -+ } else { -+ stat_inc(FCG_STAT_CNS_EXPIRE); -+ } -+ -+ /* -+ * The current cgroup is expiring. It was already charged a full slice. -+ * Calculate the actual usage and accumulate the delta. -+ */ -+ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_CNS_GONE); -+ goto pick_next_cgroup; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (cgc) { -+ /* -+ * We want to update the vtime delta and then look for the next -+ * cgroup to execute but the latter needs to be done in a loop -+ * and we can't keep the lock held. Oh well... -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ (cpuc->cur_at + cgrp_slice_ns - now) * -+ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); -+ bpf_spin_unlock(&cgv_tree_lock); -+ } else { -+ stat_inc(FCG_STAT_CNS_GONE); -+ } -+ -+ bpf_cgroup_release(cgrp); -+ -+pick_next_cgroup: -+ cpuc->cur_at = now; -+ -+ if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { -+ cpuc->cur_cgid = 0; -+ return; -+ } -+ -+ bpf_repeat(CGROUP_MAX_RETRIES) { -+ if (try_pick_next_cgroup(&cpuc->cur_cgid)) { -+ picked_next = true; -+ break; -+ } -+ } -+ -+ /* -+ * This only happens if try_pick_next_cgroup() races against enqueue -+ * path for more than CGROUP_MAX_RETRIES times, which is extremely -+ * unlikely and likely indicates an underlying bug. There shouldn't be -+ * any stall risk as the race is against enqueue. -+ */ -+ if (!picked_next) -+ stat_inc(FCG_STAT_PNC_FAIL); -+} -+ -+s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ struct fcg_task_ctx *taskc; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. -+ */ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!taskc) -+ return -ENOMEM; -+ -+ taskc->bypassed_at = 0; -+ -+ if (!(cgc = find_cgrp_ctx(args->cgroup))) -+ return -ENOENT; -+ -+ p->scx.dsq_vtime = cgc->tvtime_now; -+ -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ struct cgv_node *cgv_node; -+ struct cgv_node_stash empty_stash = {}, *stash; -+ u64 cgid = cgrp->kn->id; -+ int ret; -+ -+ /* -+ * Technically incorrect as cgroup ID is full 64bit while dq ID is -+ * 63bit. Should not be a problem in practice and easy to spot in the -+ * unlikely case that it breaks. 
-+ */ -+ ret = scx_bpf_create_dsq(cgid, -1); -+ if (ret) -+ return ret; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!cgc) { -+ ret = -ENOMEM; -+ goto err_destroy_dsq; -+ } -+ -+ cgc->weight = args->weight; -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ -+ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, -+ BPF_NOEXIST); -+ if (ret) { -+ if (ret != -ENOMEM) -+ scx_bpf_error("unexpected stash creation error (%d)", -+ ret); -+ goto err_destroy_dsq; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ scx_bpf_error("unexpected cgv_node stash lookup failure"); -+ ret = -ENOENT; -+ goto err_destroy_dsq; -+ } -+ -+ cgv_node = bpf_obj_new(struct cgv_node); -+ if (!cgv_node) { -+ ret = -ENOMEM; -+ goto err_del_cgv_node; -+ } -+ -+ cgv_node->cgid = cgid; -+ cgv_node->cvtime = cvtime_now; -+ -+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); -+ if (cgv_node) { -+ scx_bpf_error("unexpected !NULL cgv_node stash"); -+ ret = -EBUSY; -+ goto err_drop; -+ } -+ -+ return 0; -+ -+err_drop: -+ bpf_obj_drop(cgv_node); -+err_del_cgv_node: -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+err_destroy_dsq: -+ scx_bpf_destroy_dsq(cgid); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ -+ /* -+ * For now, there's no way find and remove the cgv_node if it's on the -+ * cgv_tree. Let's drain them in the dispatch path as they get popped -+ * off the front of the tree. -+ */ -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+ scx_bpf_destroy_dsq(cgid); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{ -+ struct fcg_cgrp_ctx *from_cgc, *to_cgc; -+ s64 vtime_delta; -+ -+ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ -+ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) -+ return; -+ -+ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; -+ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; -+} -+ -+void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(flatcg_ops, -+ .select_cpu = (void *)fcg_select_cpu, -+ .enqueue = (void *)fcg_enqueue, -+ .dispatch = (void *)fcg_dispatch, -+ .runnable = (void *)fcg_runnable, -+ .running = (void *)fcg_running, -+ .stopping = (void *)fcg_stopping, -+ .quiescent = (void *)fcg_quiescent, -+ .init_task = (void *)fcg_init_task, -+ .cgroup_set_weight = (void *)fcg_cgroup_set_weight, -+ .cgroup_init = (void *)fcg_cgroup_init, -+ .cgroup_exit = (void *)fcg_cgroup_exit, -+ .cgroup_move = (void *)fcg_cgroup_move, -+ .exit = (void *)fcg_exit, -+ .flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING, -+ .name = "flatcg"); -diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c -new file mode 100644 -index 000000000000..5d24ca9c29d9 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.c -@@ -0,0 +1,233 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_flatcg.h" -+#include "scx_flatcg.bpf.skel.h" -+ -+#ifndef FILEID_KERNFS -+#define FILEID_KERNFS 0xfe -+#endif -+ -+const char help_fmt[] = -+"A flattened cgroup hierarchy sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -i INTERVAL Report interval\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) -+{ -+ FILE *fp; -+ char buf[4096]; -+ char *line, *cur = NULL, *tok; -+ __u64 sum = 0, idle = 0; -+ __u64 delta_sum, delta_idle; -+ int idx; -+ -+ fp = fopen("/proc/stat", "r"); -+ if (!fp) { -+ perror("fopen(\"/proc/stat\")"); -+ return 0.0; -+ } -+ -+ if (!fgets(buf, sizeof(buf), fp)) { -+ perror("fgets(\"/proc/stat\")"); -+ fclose(fp); -+ return 0.0; -+ } -+ fclose(fp); -+ -+ line = buf; -+ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { -+ char *endp = NULL; -+ __u64 v; -+ -+ if (idx == 0) { -+ line = NULL; -+ continue; -+ } -+ v = strtoull(tok, &endp, 0); -+ if (!endp || *endp != '\0') { -+ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", -+ idx, tok); -+ continue; -+ } -+ sum += v; -+ if (idx == 4) -+ idle = v; -+ } -+ -+ delta_sum = sum - *last_sum; -+ delta_idle = idle - *last_idle; -+ *last_sum = sum; -+ *last_idle = idle; -+ -+ return delta_sum ? 
(float)(delta_sum - delta_idle) / delta_sum : 0.0; -+} -+ -+static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) -+{ -+ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); -+ -+ for (idx = 0; idx < FCG_NR_STATS; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_flatcg *skel; -+ struct bpf_link *link; -+ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; -+ bool dump_cgrps = false; -+ __u64 last_cpu_sum = 0, last_cpu_idle = 0; -+ __u64 last_stats[FCG_NR_STATS] = {}; -+ unsigned long seq = 0; -+ __s32 opt; -+ __u64 ecode; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { -+ double v; -+ -+ switch (opt) { -+ case 's': -+ v = strtod(optarg, NULL); -+ skel->rodata->cgrp_slice_ns = v * 1000; -+ break; -+ case 'i': -+ v = strtod(optarg, NULL); -+ intv_ts.tv_sec = v; -+ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; -+ break; -+ case 'd': -+ dump_cgrps = true; -+ break; -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ case 'h': -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", -+ (double)skel->rodata->cgrp_slice_ns / 1000000.0, -+ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, -+ dump_cgrps); -+ -+ SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); -+ link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ __u64 acc_stats[FCG_NR_STATS]; -+ __u64 stats[FCG_NR_STATS]; -+ float cpu_util; -+ int i; -+ -+ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); -+ -+ fcg_read_stats(skel, acc_stats); -+ for (i = 0; i < FCG_NR_STATS; i++) -+ stats[i] = acc_stats[i] - last_stats[i]; -+ -+ memcpy(last_stats, acc_stats, sizeof(acc_stats)); -+ -+ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", -+ seq++, cpu_util * 100.0, skel->data->hweight_gen); -+ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", -+ stats[FCG_STAT_ACT], -+ stats[FCG_STAT_DEACT], -+ stats[FCG_STAT_GLOBAL], -+ stats[FCG_STAT_LOCAL]); -+ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", -+ stats[FCG_STAT_HWT_CACHE], -+ stats[FCG_STAT_HWT_UPDATES], -+ stats[FCG_STAT_HWT_SKIP], -+ stats[FCG_STAT_HWT_RACE]); -+ printf("ENQ skip:%6llu race:%6llu\n", -+ stats[FCG_STAT_ENQ_SKIP], -+ stats[FCG_STAT_ENQ_RACE]); -+ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", -+ stats[FCG_STAT_CNS_KEEP], -+ stats[FCG_STAT_CNS_EXPIRE], -+ stats[FCG_STAT_CNS_EMPTY], -+ stats[FCG_STAT_CNS_GONE]); -+ printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", -+ stats[FCG_STAT_PNC_NEXT], -+ stats[FCG_STAT_PNC_EMPTY], -+ stats[FCG_STAT_PNC_NO_CGRP], -+ stats[FCG_STAT_PNC_GONE], -+ stats[FCG_STAT_PNC_RACE], -+ stats[FCG_STAT_PNC_FAIL]); -+ printf("BAD remove:%6llu\n", -+ acc_stats[FCG_STAT_BAD_REMOVAL]); -+ fflush(stdout); -+ -+ nanosleep(&intv_ts, NULL); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = 
UEI_REPORT(skel, uei); -+ scx_flatcg__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto restart; -+ return 0; -+} -diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h -new file mode 100644 -index 000000000000..6f2ea50acb1c ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.h -@@ -0,0 +1,51 @@ -+#ifndef __SCX_EXAMPLE_FLATCG_H -+#define __SCX_EXAMPLE_FLATCG_H -+ -+enum { -+ FCG_HWEIGHT_ONE = 1LLU << 16, -+}; -+ -+enum fcg_stat_idx { -+ FCG_STAT_ACT, -+ FCG_STAT_DEACT, -+ FCG_STAT_LOCAL, -+ FCG_STAT_GLOBAL, -+ -+ FCG_STAT_HWT_UPDATES, -+ FCG_STAT_HWT_CACHE, -+ FCG_STAT_HWT_SKIP, -+ FCG_STAT_HWT_RACE, -+ -+ FCG_STAT_ENQ_SKIP, -+ FCG_STAT_ENQ_RACE, -+ -+ FCG_STAT_CNS_KEEP, -+ FCG_STAT_CNS_EXPIRE, -+ FCG_STAT_CNS_EMPTY, -+ FCG_STAT_CNS_GONE, -+ -+ FCG_STAT_PNC_NO_CGRP, -+ FCG_STAT_PNC_NEXT, -+ FCG_STAT_PNC_EMPTY, -+ FCG_STAT_PNC_GONE, -+ FCG_STAT_PNC_RACE, -+ FCG_STAT_PNC_FAIL, -+ -+ FCG_STAT_BAD_REMOVAL, -+ -+ FCG_NR_STATS, -+}; -+ -+struct fcg_cgrp_ctx { -+ u32 nr_active; -+ u32 nr_runnable; -+ u32 queued; -+ u32 weight; -+ u32 hweight; -+ u64 child_weight_sum; -+ u64 hweight_gen; -+ s64 cvtime_delta; -+ u64 tvtime_now; -+}; -+ -+#endif /* __SCX_EXAMPLE_FLATCG_H */ -diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c -new file mode 100644 -index 000000000000..d74c5cf2a251 ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.bpf.c -@@ -0,0 +1,728 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple five-level FIFO queue scheduler. -+ * -+ * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets -+ * assigned to one depending on its compound weight. Each CPU round robins -+ * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from -+ * queue0, 2 from queue1, 4 from queue2 and so on. -+ * -+ * This scheduler demonstrates: -+ * -+ * - BPF-side queueing using PIDs. -+ * - Sleepable per-task storage allocation using ops.prep_enable(). -+ * - Using ops.cpu_release() to handle a higher priority scheduling class taking -+ * the CPU away. -+ * - Core-sched support. -+ * -+ * This scheduler is primarily for demonstration and testing of sched_ext -+ * features and unlikely to be useful for actual workloads. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+ -+enum consts { -+ ONE_SEC_IN_NS = 1000000000, -+ SHARED_DSQ = 0, -+}; -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile u32 stall_user_nth; -+const volatile u32 stall_kernel_nth; -+const volatile u32 dsp_inf_loop_after; -+const volatile u32 dsp_batch; -+const volatile bool print_shared_dsq; -+const volatile char exp_prefix[17]; -+const volatile s32 disallow_tgid; -+const volatile bool suppress_dump; -+const volatile bool switch_partial; -+ -+u32 test_error_cnt; -+ -+UEI_DEFINE(uei); -+ -+struct qmap { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, u32); -+} queue0 SEC(".maps"), -+ queue1 SEC(".maps"), -+ queue2 SEC(".maps"), -+ queue3 SEC(".maps"), -+ queue4 SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); -+ __uint(max_entries, 5); -+ __type(key, int); -+ __array(values, struct qmap); -+} queue_arr SEC(".maps") = { -+ .values = { -+ [0] = &queue0, -+ [1] = &queue1, -+ [2] = &queue2, -+ [3] = &queue3, -+ [4] = &queue4, -+ }, -+}; -+ -+/* -+ * If enabled, CPU performance target is set according to the queue index -+ * according to the following table. -+ */ -+static const u32 qidx_to_cpuperf_target[] = { -+ [0] = SCX_CPUPERF_ONE * 0 / 4, -+ [1] = SCX_CPUPERF_ONE * 1 / 4, -+ [2] = SCX_CPUPERF_ONE * 2 / 4, -+ [3] = SCX_CPUPERF_ONE * 3 / 4, -+ [4] = SCX_CPUPERF_ONE * 4 / 4, -+}; -+ -+/* -+ * Per-queue sequence numbers to implement core-sched ordering. -+ * -+ * Tail seq is assigned to each queued task and incremented. Head seq tracks the -+ * sequence number of the latest dispatched task. The distance between the a -+ * task's seq and the associated queue's head seq is called the queue distance -+ * and used when comparing two tasks for ordering. See qmap_core_sched_before(). -+ */ -+static u64 core_sched_head_seqs[5]; -+static u64 core_sched_tail_seqs[5]; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* Dispatch directly to local_dsq */ -+ u64 core_sched_seq; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+struct cpu_ctx { -+ u64 dsp_idx; /* dispatch index */ -+ u64 dsp_cnt; /* remaining count */ -+ u32 avg_weight; -+ u32 cpuperf_target; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct cpu_ctx); -+} cpu_ctx_stor SEC(".maps"); -+ -+/* Statistics */ -+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; -+u64 nr_core_sched_execed, nr_expedited; -+u32 cpuperf_min, cpuperf_avg, cpuperf_max; -+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; -+ -+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ struct task_ctx *tctx; -+ s32 cpu; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return -ESRCH; -+ } -+ -+ if (p->nr_cpus_allowed == 1 || -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ tctx->force_local = true; -+ return prev_cpu; -+ } -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ return cpu; -+ -+ return prev_cpu; -+} -+ -+static int weight_to_idx(u32 weight) -+{ -+ /* Coarsely map the compound weight to a FIFO. 
*/ -+ if (weight <= 25) -+ return 0; -+ else if (weight <= 50) -+ return 1; -+ else if (weight < 200) -+ return 2; -+ else if (weight < 400) -+ return 3; -+ else -+ return 4; -+} -+ -+void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ static u32 user_cnt, kernel_cnt; -+ struct task_ctx *tctx; -+ u32 pid = p->pid; -+ int idx = weight_to_idx(p->scx.weight); -+ void *ring; -+ -+ if (p->flags & PF_KTHREAD) { -+ if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) -+ return; -+ } else { -+ if (stall_user_nth && !(++user_cnt % stall_user_nth)) -+ return; -+ } -+ -+ if (test_error_cnt && !--test_error_cnt) -+ scx_bpf_error("test triggering error"); -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * All enqueued tasks must have their core_sched_seq updated for correct -+ * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in -+ * qmap_ops.flags. -+ */ -+ tctx->core_sched_seq = core_sched_tail_seqs[idx]++; -+ -+ /* -+ * If qmap_select_cpu() is telling us to or this is the last runnable -+ * task on the CPU, enqueue locally. -+ */ -+ if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { -+ tctx->force_local = false; -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * If the task was re-enqueued due to the CPU being preempted by a -+ * higher priority scheduling class, just re-enqueue the task directly -+ * on the global DSQ. As we want another CPU to pick it up, find and -+ * kick an idle CPU. -+ */ -+ if (enq_flags & SCX_ENQ_REENQ) { -+ s32 cpu; -+ -+ scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE); -+ return; -+ } -+ -+ ring = bpf_map_lookup_elem(&queue_arr, &idx); -+ if (!ring) { -+ scx_bpf_error("failed to find ring %d", idx); -+ return; -+ } -+ -+ /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ -+ if (bpf_map_push_elem(ring, &pid, 0)) { -+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_enqueued, 1); -+} -+ -+/* -+ * The BPF queue map doesn't support removal and sched_ext can handle spurious -+ * dispatches. qmap_dequeue() is only used to collect statistics. -+ */ -+void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) -+{ -+ __sync_fetch_and_add(&nr_dequeued, 1); -+ if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) -+ __sync_fetch_and_add(&nr_core_sched_execed, 1); -+} -+ -+static void update_core_sched_head_seq(struct task_struct *p) -+{ -+ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ int idx = weight_to_idx(p->scx.weight); -+ -+ if (tctx) -+ core_sched_head_seqs[idx] = tctx->core_sched_seq; -+ else -+ scx_bpf_error("task_ctx lookup failed"); -+} -+ -+static bool consume_shared_dsq(void) -+{ -+ struct task_struct *p; -+ bool consumed; -+ -+ if (exp_prefix[0] == '\0') -+ return scx_bpf_consume(SHARED_DSQ); -+ -+ /* -+ * To demonstrate the use of scx_bpf_consume_task(), implement silly -+ * selective priority boosting mechanism by scanning SHARED_DSQ looking -+ * for matching comms and consume them first. This makes difference only -+ * when dsp_batch is larger than 1. 
-+ */ -+ consumed = false; -+ __COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, 0) { -+ char comm[sizeof(exp_prefix)]; -+ -+ memcpy(comm, p->comm, sizeof(exp_prefix) - 1); -+ -+ if (!bpf_strncmp(comm, sizeof(exp_prefix), -+ (const char *)exp_prefix) && -+ __COMPAT_scx_bpf_consume_task(BPF_FOR_EACH_ITER, p)) { -+ consumed = true; -+ __sync_fetch_and_add(&nr_expedited, 1); -+ } -+ } -+ -+ return consumed || scx_bpf_consume(SHARED_DSQ); -+} -+ -+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct task_struct *p; -+ struct cpu_ctx *cpuc; -+ u32 zero = 0, batch = dsp_batch ?: 1; -+ void *fifo; -+ s32 i, pid; -+ -+ if (consume_shared_dsq()) -+ return; -+ -+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { -+ /* -+ * PID 2 should be kthreadd which should mostly be idle and off -+ * the scheduler. Let's keep dispatching it to force the kernel -+ * to call this function over and over again. -+ */ -+ p = bpf_task_from_pid(2); -+ if (p) { -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); -+ bpf_task_release(p); -+ return; -+ } -+ } -+ -+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ return; -+ } -+ -+ for (i = 0; i < 5; i++) { -+ /* Advance the dispatch cursor and pick the fifo. */ -+ if (!cpuc->dsp_cnt) { -+ cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; -+ cpuc->dsp_cnt = 1 << cpuc->dsp_idx; -+ } -+ -+ fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); -+ if (!fifo) { -+ scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); -+ return; -+ } -+ -+ /* Dispatch or advance. */ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (bpf_map_pop_elem(fifo, &pid)) -+ break; -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) -+ continue; -+ -+ update_core_sched_head_seq(p); -+ __sync_fetch_and_add(&nr_dispatched, 1); -+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); -+ bpf_task_release(p); -+ batch--; -+ cpuc->dsp_cnt--; -+ if (!batch || !scx_bpf_dispatch_nr_slots()) { -+ consume_shared_dsq(); -+ return; -+ } -+ if (!cpuc->dsp_cnt) -+ break; -+ } -+ -+ cpuc->dsp_cnt = 0; -+ } -+} -+ -+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) -+{ -+ struct cpu_ctx *cpuc; -+ u32 zero = 0; -+ int idx; -+ -+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ return; -+ } -+ -+ /* -+ * Use the running avg of weights to select the target cpuperf level. -+ * This is a demonstration of the cpuperf feature rather than a -+ * practical strategy to regulate CPU frequency. -+ */ -+ cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; -+ idx = weight_to_idx(cpuc->avg_weight); -+ cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; -+ -+ scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); -+} -+ -+/* -+ * The distance from the head of the queue scaled by the weight of the queue. -+ * The lower the number, the older the task and the higher the priority. -+ */ -+static s64 task_qdist(struct task_struct *p) -+{ -+ int idx = weight_to_idx(p->scx.weight); -+ struct task_ctx *tctx; -+ s64 qdist; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return 0; -+ } -+ -+ qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; -+ -+ /* -+ * As queue index increments, the priority doubles. The queue w/ index 3 -+ * is dispatched twice more frequently than 2. Reflect the difference by -+ * scaling qdists accordingly. 
Note that the shift amount needs to be -+ * flipped depending on the sign to avoid flipping priority direction. -+ */ -+ if (qdist >= 0) -+ return qdist << (4 - idx); -+ else -+ return qdist << idx; -+} -+ -+/* -+ * This is called to determine the task ordering when core-sched is picking -+ * tasks to execute on SMT siblings and should encode about the same ordering as -+ * the regular scheduling path. Use the priority-scaled distances from the head -+ * of the queues to compare the two tasks which should be consistent with the -+ * dispatch path behavior. -+ */ -+bool BPF_STRUCT_OPS(qmap_core_sched_before, -+ struct task_struct *a, struct task_struct *b) -+{ -+ return task_qdist(a) > task_qdist(b); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) -+{ -+ u32 cnt; -+ -+ /* -+ * Called when @cpu is taken by a higher priority scheduling class. This -+ * makes @cpu no longer available for executing sched_ext tasks. As we -+ * don't want the tasks in @cpu's local dsq to sit there until @cpu -+ * becomes available again, re-enqueue them into the global dsq. See -+ * %SCX_ENQ_REENQ handling in qmap_enqueue(). -+ */ -+ cnt = scx_bpf_reenqueue_local(); -+ if (cnt) -+ __sync_fetch_and_add(&nr_reenqueued, cnt); -+} -+ -+s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ if (p->tgid == disallow_tgid) -+ p->scx.disallow = true; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. -+ */ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) -+{ -+ s32 i, pid; -+ -+ if (suppress_dump) -+ return; -+ -+ bpf_for(i, 0, 5) { -+ void *fifo; -+ -+ if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) -+ return; -+ -+ __COMPAT_scx_bpf_dump("QMAP FIFO[%d]:", i); -+ bpf_repeat(4096) { -+ if (bpf_map_pop_elem(fifo, &pid)) -+ break; -+ __COMPAT_scx_bpf_dump(" %d", pid); -+ } -+ __COMPAT_scx_bpf_dump("\n"); -+ } -+} -+ -+void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) -+{ -+ u32 zero = 0; -+ struct cpu_ctx *cpuc; -+ -+ if (suppress_dump || idle) -+ return; -+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) -+ return; -+ -+ __COMPAT_scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", -+ cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, -+ cpuc->cpuperf_target); -+} -+ -+void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) -+{ -+ struct task_ctx *taskc; -+ -+ if (suppress_dump) -+ return; -+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) -+ return; -+ -+ __COMPAT_scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", -+ taskc->force_local, taskc->core_sched_seq); -+} -+ -+/* -+ * Print out the online and possible CPU map using bpf_printk() as a -+ * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). 
-+ */ -+static void print_cpus(void) -+{ -+ const struct cpumask *possible, *online; -+ s32 cpu; -+ char buf[128] = "", *p; -+ int idx; -+ -+ if (!__COMPAT_HAS_CPUMASKS) -+ return; -+ -+ possible = scx_bpf_get_possible_cpumask(); -+ online = scx_bpf_get_online_cpumask(); -+ -+ idx = 0; -+ bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { -+ if (!(p = MEMBER_VPTR(buf, [idx++]))) -+ break; -+ if (bpf_cpumask_test_cpu(cpu, online)) -+ *p++ = 'O'; -+ else if (bpf_cpumask_test_cpu(cpu, possible)) -+ *p++ = 'X'; -+ else -+ *p++ = ' '; -+ -+ if ((cpu & 7) == 7) { -+ if (!(p = MEMBER_VPTR(buf, [idx++]))) -+ break; -+ *p++ = '|'; -+ } -+ } -+ buf[sizeof(buf) - 1] = '\0'; -+ -+ scx_bpf_put_cpumask(online); -+ scx_bpf_put_cpumask(possible); -+ -+ bpf_printk("CPUS: |%s", buf); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) -+{ -+ bpf_printk("CPU %d coming online", cpu); -+ /* @cpu is already online at this point */ -+ print_cpus(); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) -+{ -+ bpf_printk("CPU %d going offline", cpu); -+ /* @cpu is still online at this point */ -+ print_cpus(); -+} -+ -+struct monitor_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct monitor_timer); -+} central_timer SEC(".maps"); -+ -+/* -+ * Print out the min, avg and max performance levels of CPUs every second to -+ * demonstrate the cpuperf interface. -+ */ -+static void monitor_cpuperf(void) -+{ -+ u32 zero = 0, nr_cpu_ids; -+ u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; -+ u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; -+ const struct cpumask *online; -+ int i, nr_online_cpus = 0; -+ -+ if (!__COMPAT_HAS_CPUMASKS) -+ return; -+ -+ nr_cpu_ids = scx_bpf_nr_cpu_ids(); -+ online = scx_bpf_get_online_cpumask(); -+ -+ bpf_for(i, 0, nr_cpu_ids) { -+ struct cpu_ctx *cpuc; -+ u32 cap, cur; -+ -+ if (!bpf_cpumask_test_cpu(i, online)) -+ continue; -+ nr_online_cpus++; -+ -+ /* collect the capacity and current cpuperf */ -+ cap = scx_bpf_cpuperf_cap(i); -+ cur = scx_bpf_cpuperf_cur(i); -+ -+ cur_min = cur < cur_min ? cur : cur_min; -+ cur_max = cur > cur_max ? cur : cur_max; -+ -+ /* -+ * $cur is relative to $cap. Scale it down accordingly so that -+ * it's in the same scale as other CPUs and $cur_sum/$cap_sum -+ * makes sense. -+ */ -+ cur_sum += cur * cap / SCX_CPUPERF_ONE; -+ cap_sum += cap; -+ -+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ goto out; -+ } -+ -+ /* collect target */ -+ cur = cpuc->cpuperf_target; -+ target_sum += cur; -+ target_min = cur < target_min ? cur : target_min; -+ target_max = cur > target_max ? cur : target_max; -+ } -+ -+ cpuperf_min = cur_min; -+ cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; -+ cpuperf_max = cur_max; -+ -+ cpuperf_target_min = target_min; -+ cpuperf_target_avg = target_sum / nr_online_cpus; -+ cpuperf_target_max = target_max; -+out: -+ scx_bpf_put_cpumask(online); -+} -+ -+/* -+ * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of -+ * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to -+ * see meaningful dumps in the trace pipe. 
-+ */ -+static void dump_shared_dsq(void) -+{ -+ struct task_struct *p; -+ s32 nr; -+ -+ if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ))) -+ return; -+ -+ bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr); -+ -+ bpf_rcu_read_lock(); -+ __COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, SCX_DSQ_ITER_REV) -+ bpf_printk("%s[%d]", p->comm, p->pid); -+ bpf_rcu_read_unlock(); -+} -+ -+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ monitor_cpuperf(); -+ -+ if (print_shared_dsq) -+ dump_shared_dsq(); -+ -+ bpf_timer_start(timer, ONE_SEC_IN_NS, 0); -+ return 0; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) -+{ -+ u32 key = 0; -+ struct bpf_timer *timer; -+ s32 ret; -+ -+ if (!switch_partial) -+ __COMPAT_scx_bpf_switch_all(); -+ -+ print_cpus(); -+ -+ ret = scx_bpf_create_dsq(SHARED_DSQ, -1); -+ if (ret) -+ return ret; -+ -+ timer = bpf_map_lookup_elem(¢ral_timer, &key); -+ if (!timer) -+ return -ESRCH; -+ -+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); -+ bpf_timer_set_callback(timer, monitor_timerfn); -+ -+ return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); -+} -+ -+void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(qmap_ops, -+ .select_cpu = (void *)qmap_select_cpu, -+ .enqueue = (void *)qmap_enqueue, -+ .dequeue = (void *)qmap_dequeue, -+ .dispatch = (void *)qmap_dispatch, -+ .tick = (void *)qmap_tick, -+ .core_sched_before = (void *)qmap_core_sched_before, -+ .cpu_release = (void *)qmap_cpu_release, -+ .init_task = (void *)qmap_init_task, -+ .dump = (void *)qmap_dump, -+ .dump_cpu = (void *)qmap_dump_cpu, -+ .dump_task = (void *)qmap_dump_task, -+ .cpu_online = (void *)qmap_cpu_online, -+ .cpu_offline = (void *)qmap_cpu_offline, -+ .init = (void *)qmap_init, -+ .exit = (void *)qmap_exit, -+ .flags = SCX_OPS_ENQ_LAST, -+ .timeout_ms = 5000U, -+ .name = "qmap"); -diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c -new file mode 100644 -index 000000000000..e10ceb170793 ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.c -@@ -0,0 +1,154 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_qmap.bpf.skel.h" -+ -+const char help_fmt[] = -+"A simple five-level FIFO queue sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -+" [-P] [-E PREFIX] [-d PID] [-D LEN] [-p] [-v]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" -+" -t COUNT Stall every COUNT'th user thread\n" -+" -T COUNT Stall every COUNT'th kernel thread\n" -+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" -+" -b COUNT Dispatch upto COUNT tasks together\n" -+" -P Print out DSQ content to trace_pipe every second, use with -b\n" -+" -E PREFIX Expedite consumption of threads w/ matching comm, use with -b\n" -+" (e.g. 
match shell on a loaded system)\n" -+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" -+" -D LEN Set scx_exit_info.dump buffer length\n" -+" -S Suppress qmap-specific debug dump\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_qmap *skel; -+ struct bpf_link *link; -+ int opt; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); -+ -+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PE:d:D:Spvh")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'e': -+ skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); -+ break; -+ case 't': -+ skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'T': -+ skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'l': -+ skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); -+ break; -+ case 'b': -+ skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); -+ break; -+ case 'P': -+ skel->rodata->print_shared_dsq = true; -+ break; -+ case 'E': -+ strncpy(skel->rodata->exp_prefix, optarg, -+ sizeof(skel->rodata->exp_prefix) - 1); -+ break; -+ case 'd': -+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); -+ if (skel->rodata->disallow_tgid < 0) -+ skel->rodata->disallow_tgid = getpid(); -+ break; -+ case 'D': -+ skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); -+ break; -+ case 'S': -+ skel->rodata->suppress_dump = true; -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ if (!__COMPAT_HAS_DSQ_ITER && -+ (skel->rodata->print_shared_dsq || strlen(skel->rodata->exp_prefix))) -+ fprintf(stderr, "kernel doesn't support DSQ iteration\n"); -+ -+ SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); -+ link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ long nr_enqueued = skel->bss->nr_enqueued; -+ long nr_dispatched = skel->bss->nr_dispatched; -+ -+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" exp=%"PRIu64"\n", -+ nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, -+ skel->bss->nr_reenqueued, skel->bss->nr_dequeued, -+ skel->bss->nr_core_sched_execed, skel->bss->nr_expedited); -+ if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) -+ printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", -+ skel->bss->cpuperf_min, -+ skel->bss->cpuperf_avg, -+ skel->bss->cpuperf_max, -+ skel->bss->cpuperf_target_min, -+ skel->bss->cpuperf_target_avg, -+ skel->bss->cpuperf_target_max); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ UEI_REPORT(skel, uei); -+ scx_qmap__destroy(skel); -+ /* -+ * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart -+ * on CPU hotplug events. 
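Contrast this with scx_simple.c further down: simple does not implement cpu_online()/cpu_offline(), so its main() checks UEI_ECODE_RESTART(ecode) after UEI_REPORT() and jumps back to its restart label to re-open, re-load and re-attach the skeleton whenever the kernel requests a restart, for example after a CPU hotplug event.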
-+ */ -+ return 0; -+} -diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py -new file mode 100644 -index 000000000000..d457d2a74e1e ---- /dev/null -+++ b/tools/sched_ext/scx_show_state.py -@@ -0,0 +1,39 @@ -+#!/usr/bin/env drgn -+# -+# Copyright (C) 2024 Tejun Heo -+# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. -+ -+desc = """ -+This is a drgn script to show the current sched_ext state. -+For more info on drgn, visit https://github.com/osandov/drgn. -+""" -+ -+import drgn -+import sys -+ -+def err(s): -+ print(s, file=sys.stderr, flush=True) -+ sys.exit(1) -+ -+def read_int(name): -+ return int(prog[name].value_()) -+ -+def read_atomic(name): -+ return prog[name].counter.value_() -+ -+def read_static_key(name): -+ return prog[name].key.enabled.counter.value_() -+ -+def ops_state_str(state): -+ return prog['scx_ops_enable_state_str'][state].string_().decode() -+ -+ops = prog['scx_ops'] -+enable_state = read_atomic("scx_ops_enable_state_var") -+ -+print(f'ops : {ops.name.string_().decode()}') -+print(f'enabled : {read_static_key("__scx_ops_enabled")}') -+print(f'switching_all : {read_int("scx_switching_all")}') -+print(f'switched_all : {read_static_key("__scx_switched_all")}') -+print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') -+print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') -+print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') -diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c -new file mode 100644 -index 000000000000..6fc66ab9877a ---- /dev/null -+++ b/tools/sched_ext/scx_simple.bpf.c -@@ -0,0 +1,157 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple scheduler. -+ * -+ * By default, it operates as a simple global weighted vtime scheduler and can -+ * be switched to FIFO scheduling. It also demonstrates the following niceties. -+ * -+ * - Statistics tracking how many tasks are queued to local and global dsq's. -+ * - Termination notification for userspace. -+ * -+ * While very simple, this scheduler should work reasonably well on CPUs with a -+ * uniform L3 cache topology. While preemption is not implemented, the fact that -+ * the scheduling queue is shared across all CPUs means that whatever is at the -+ * front of the queue is likely to be executed fairly quickly given enough -+ * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads -+ * but comes with the usual problems with FIFO scheduling where saturating -+ * threads can easily drown out interactive ones. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool fifo_sched; -+ -+static u64 vtime_now; -+UEI_DEFINE(uei); -+ -+/* -+ * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues -+ * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We -+ * therefore create a separate DSQ with ID 0 that we dispatch to and consume -+ * from. If scx_simple only supported global FIFO scheduling, then we could -+ * just use SCX_DSQ_GLOBAL. 
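The weighting itself happens in simple_stopping() below, which charges a task vtime for the CPU time it consumed, scaled by the inverse of its weight: dsq_vtime += (SCX_SLICE_DFL - slice) * 100 / weight. Assuming the default slice SCX_SLICE_DFL is 20ms and a weight of 100 corresponds to nice 0, a weight-100 task that burns its whole slice is charged 20ms of vtime while a weight-200 task doing the same is charged only 10ms, so the heavier task comes back to the front of the vtime-ordered shared DSQ about twice as often.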
-+ */ -+#define SHARED_DSQ 0 -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, 2); /* [local, global] */ -+} stats SEC(".maps"); -+ -+static void stat_inc(u32 idx) -+{ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ bool is_idle = false; -+ s32 cpu; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); -+ if (is_idle) { -+ stat_inc(0); /* count local queueing */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ } -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ stat_inc(1); /* count global queueing */ -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 vtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) -+ vtime = vtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, -+ enq_flags); -+ } -+} -+ -+void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ scx_bpf_consume(SHARED_DSQ); -+} -+ -+void BPF_STRUCT_OPS(simple_running, struct task_struct *p) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. -+ */ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. -+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. -+ */ -+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) -+{ -+ __COMPAT_scx_bpf_switch_all(); -+ return scx_bpf_create_dsq(SHARED_DSQ, -1); -+} -+ -+void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(simple_ops, -+ .select_cpu = (void *)simple_select_cpu, -+ .enqueue = (void *)simple_enqueue, -+ .dispatch = (void *)simple_dispatch, -+ .running = (void *)simple_running, -+ .stopping = (void *)simple_stopping, -+ .enable = (void *)simple_enable, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple"); -diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c -new file mode 100644 -index 000000000000..76d83199545c ---- /dev/null -+++ b/tools/sched_ext/scx_simple.c -@@ -0,0 +1,107 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_simple.bpf.skel.h" -+ -+const char help_fmt[] = -+"A simple sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-f] [-v]\n" -+"\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int simple) -+{ -+ exit_req = 1; -+} -+ -+static void read_stats(struct scx_simple *skel, __u64 *stats) -+{ -+ int nr_cpus = libbpf_num_possible_cpus(); -+ __u64 cnts[2][nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * 2); -+ -+ for (idx = 0; idx < 2; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_simple *skel; -+ struct bpf_link *link; -+ __u32 opt; -+ __u64 ecode; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(simple_ops, scx_simple); -+ -+ while ((opt = getopt(argc, argv, "fvh")) != -1) { -+ switch (opt) { -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); -+ link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ __u64 stats[2]; -+ -+ read_stats(skel, stats); -+ printf("local=%llu global=%llu\n", stats[0], stats[1]); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = UEI_REPORT(skel, uei); -+ scx_simple__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto restart; -+ return 0; -+} -diff --git a/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c b/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c -index b1dd889d5d7d..948eb3962732 100644 ---- a/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c -+++ b/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c -@@ -22,12 +22,12 @@ static int dummy_init_member(const struct btf_type *t, - return 0; - } - --static int dummy_reg(void *kdata) -+static int dummy_reg(void *kdata, struct bpf_link *link) - { - return 0; - } - --static void dummy_unreg(void *kdata) -+static void dummy_unreg(void *kdata, struct bpf_link *link) - { - } - -diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c -index 2a18bd320e92..0a09732cde4b 100644 ---- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c -+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c -@@ -820,7 +820,7 @@ static const struct bpf_verifier_ops bpf_testmod_verifier_ops = { - .is_valid_access = bpf_testmod_ops_is_valid_access, - }; - --static int bpf_dummy_reg(void *kdata) -+static int bpf_dummy_reg(void *kdata, struct bpf_link *link) - { - struct bpf_testmod_ops *ops = kdata; - -@@ -835,7 +835,7 
@@ static int bpf_dummy_reg(void *kdata) - return 0; - } - --static void bpf_dummy_unreg(void *kdata) -+static void bpf_dummy_unreg(void *kdata, struct bpf_link *link) - { - } - -@@ -871,7 +871,7 @@ struct bpf_struct_ops bpf_bpf_testmod_ops = { - .owner = THIS_MODULE, - }; - --static int bpf_dummy_reg2(void *kdata) -+static int bpf_dummy_reg2(void *kdata, struct bpf_link *link) - { - struct bpf_testmod_ops2 *ops = kdata; - -diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config -index eeabd798bc3a..2fb16da78dce 100644 ---- a/tools/testing/selftests/bpf/config -+++ b/tools/testing/selftests/bpf/config -@@ -80,6 +80,7 @@ CONFIG_NETFILTER_XT_TARGET_CT=y - CONFIG_NETKIT=y - CONFIG_NF_CONNTRACK=y - CONFIG_NF_CONNTRACK_MARK=y -+CONFIG_NF_CONNTRACK_ZONES=y - CONFIG_NF_DEFRAG_IPV4=y - CONFIG_NF_DEFRAG_IPV6=y - CONFIG_NF_NAT=y -diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c -index 35250e6cde7f..e20caef06aae 100644 ---- a/tools/testing/selftests/bpf/network_helpers.c -+++ b/tools/testing/selftests/bpf/network_helpers.c -@@ -94,7 +94,8 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl - if (settimeo(fd, opts->timeout_ms)) - goto error_close; - -- if (opts->post_socket_cb && opts->post_socket_cb(fd, NULL)) { -+ if (opts->post_socket_cb && -+ opts->post_socket_cb(fd, opts->cb_opts)) { - log_err("Failed to call post_socket_cb"); - goto error_close; - } -@@ -118,22 +119,32 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl - return -1; - } - --int start_server(int family, int type, const char *addr_str, __u16 port, -- int timeout_ms) -+int start_server_str(int family, int type, const char *addr_str, __u16 port, -+ const struct network_helper_opts *opts) - { -- struct network_helper_opts opts = { -- .timeout_ms = timeout_ms, -- }; - struct sockaddr_storage addr; - socklen_t addrlen; - -+ if (!opts) -+ opts = &default_opts; -+ - if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) - return -1; - -- return __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); -+ return __start_server(type, (struct sockaddr *)&addr, addrlen, opts); -+} -+ -+int start_server(int family, int type, const char *addr_str, __u16 port, -+ int timeout_ms) -+{ -+ struct network_helper_opts opts = { -+ .timeout_ms = timeout_ms, -+ }; -+ -+ return start_server_str(family, type, addr_str, port, &opts); - } - --static int reuseport_cb(int fd, const struct post_socket_opts *opts) -+static int reuseport_cb(int fd, void *opts) - { - int on = 1; - -@@ -338,9 +349,8 @@ int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) - if (settimeo(fd, opts->timeout_ms)) - goto error_close; - -- if (opts->cc && opts->cc[0] && -- setsockopt(fd, SOL_TCP, TCP_CONGESTION, opts->cc, -- strlen(opts->cc) + 1)) -+ if (opts->post_socket_cb && -+ opts->post_socket_cb(fd, opts->cb_opts)) - goto error_close; - - if (!opts->noconnect) -diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h -index 883c7ea9d8d5..11eea8e2e4f1 100644 ---- a/tools/testing/selftests/bpf/network_helpers.h -+++ b/tools/testing/selftests/bpf/network_helpers.h -@@ -21,16 +21,14 @@ typedef __u16 __sum16; - #define VIP_NUM 5 - #define MAGIC_BYTES 123 - --struct post_socket_opts {}; -- - struct network_helper_opts { -- const char *cc; - int timeout_ms; - bool must_fail; - bool noconnect; - int type; - int proto; -- int (*post_socket_cb)(int fd, const 
struct post_socket_opts *opts); -+ int (*post_socket_cb)(int fd, void *opts); -+ void *cb_opts; - }; - - /* ipv4 test vector */ -@@ -50,6 +48,8 @@ struct ipv6_packet { - extern struct ipv6_packet pkt_v6; - - int settimeo(int fd, int timeout_ms); -+int start_server_str(int family, int type, const char *addr_str, __u16 port, -+ const struct network_helper_opts *opts); - int start_server(int family, int type, const char *addr, __u16 port, - int timeout_ms); - int *start_reuseport_server(int family, int type, const char *addr_str, -diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c -index b30ff6b3b81a..a4a1f93878d4 100644 ---- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c -+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c -@@ -104,6 +104,7 @@ static void test_bpf_nf_ct(int mode) - - ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple"); - ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0"); -+ ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0"); - ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1"); - ASSERT_EQ(skel->bss->test_einval_len_opts, -EINVAL, "Test EINVAL for len__opts != NF_BPF_CT_OPTS_SZ"); - ASSERT_EQ(skel->bss->test_eproto_l4proto, -EPROTO, "Test EPROTO for l4proto != TCP or UDP"); -@@ -122,6 +123,12 @@ static void test_bpf_nf_ct(int mode) - ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark"); - ASSERT_EQ(skel->data->test_snat_addr, 0, "Test for source natting"); - ASSERT_EQ(skel->data->test_dnat_addr, 0, "Test for destination natting"); -+ ASSERT_EQ(skel->data->test_ct_zone_id_alloc_entry, 0, "Test for alloc new entry in specified ct zone"); -+ ASSERT_EQ(skel->data->test_ct_zone_id_insert_entry, 0, "Test for insert new entry in specified ct zone"); -+ ASSERT_EQ(skel->data->test_ct_zone_id_succ_lookup, 0, "Test for successful lookup in specified ct_zone"); -+ ASSERT_EQ(skel->bss->test_ct_zone_dir_enoent_lookup, -ENOENT, "Test ENOENT for lookup with wrong ct zone dir"); -+ ASSERT_EQ(skel->bss->test_ct_zone_id_enoent_lookup, -ENOENT, "Test ENOENT for lookup in wrong ct zone"); -+ - end: - if (client_fd != -1) - close(client_fd); -diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c -index 0aca02532794..ebc7d4616880 100644 ---- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c -+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c -@@ -23,6 +23,10 @@ - static const unsigned int total_bytes = 10 * 1024 * 1024; - static int expected_stg = 0xeB9F; - -+struct cb_opts { -+ const char *cc; -+}; -+ - static int settcpca(int fd, const char *tcp_ca) - { - int err; -@@ -34,12 +38,14 @@ static int settcpca(int fd, const char *tcp_ca) - return 0; - } - --static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) -+static void do_test(const struct network_helper_opts *opts, -+ const struct bpf_map *sk_stg_map) - { -+ struct cb_opts *cb_opts = (struct cb_opts *)opts->cb_opts; - int lfd = -1, fd = -1; - int err; - -- lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); -+ lfd = start_server_str(AF_INET6, SOCK_STREAM, NULL, 0, opts); - if (!ASSERT_NEQ(lfd, -1, "socket")) - return; - -@@ -49,7 +55,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) - return; - } - -- if (settcpca(lfd, tcp_ca) || settcpca(fd, 
tcp_ca)) -+ if (settcpca(fd, cb_opts->cc)) - goto done; - - if (sk_stg_map) { -@@ -81,8 +87,22 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) - close(fd); - } - -+static int cc_cb(int fd, void *opts) -+{ -+ struct cb_opts *cb_opts = (struct cb_opts *)opts; -+ -+ return settcpca(fd, cb_opts->cc); -+} -+ - static void test_cubic(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "bpf_cubic", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct bpf_cubic *cubic_skel; - struct bpf_link *link; - -@@ -96,7 +116,7 @@ static void test_cubic(void) - return; - } - -- do_test("bpf_cubic", NULL); -+ do_test(&opts, NULL); - - ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called"); - -@@ -106,6 +126,13 @@ static void test_cubic(void) - - static void test_dctcp(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "bpf_dctcp", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct bpf_dctcp *dctcp_skel; - struct bpf_link *link; - -@@ -119,7 +146,7 @@ static void test_dctcp(void) - return; - } - -- do_test("bpf_dctcp", dctcp_skel->maps.sk_stg_map); -+ do_test(&opts, dctcp_skel->maps.sk_stg_map); - ASSERT_EQ(dctcp_skel->bss->stg_result, expected_stg, "stg_result"); - - bpf_link__destroy(link); -@@ -172,10 +199,16 @@ static void test_dctcp_fallback(void) - { - int err, lfd = -1, cli_fd = -1, srv_fd = -1; - struct network_helper_opts opts = { -- .cc = "cubic", -+ .post_socket_cb = cc_cb, - }; - struct bpf_dctcp *dctcp_skel; - struct bpf_link *link = NULL; -+ struct cb_opts dctcp = { -+ .cc = "bpf_dctcp", -+ }; -+ struct cb_opts cubic = { -+ .cc = "cubic", -+ }; - char srv_cc[16]; - socklen_t cc_len = sizeof(srv_cc); - -@@ -190,11 +223,12 @@ static void test_dctcp_fallback(void) - if (!ASSERT_OK_PTR(link, "dctcp link")) - goto done; - -- lfd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); -- if (!ASSERT_GE(lfd, 0, "lfd") || -- !ASSERT_OK(settcpca(lfd, "bpf_dctcp"), "lfd=>bpf_dctcp")) -+ opts.cb_opts = &dctcp; -+ lfd = start_server_str(AF_INET6, SOCK_STREAM, "::1", 0, &opts); -+ if (!ASSERT_GE(lfd, 0, "lfd")) - goto done; - -+ opts.cb_opts = &cubic; - cli_fd = connect_to_fd_opts(lfd, &opts); - if (!ASSERT_GE(cli_fd, 0, "cli_fd")) - goto done; -@@ -297,6 +331,13 @@ static void test_unsupp_cong_op(void) - - static void test_update_ca(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "tcp_ca_update", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct tcp_ca_update *skel; - struct bpf_link *link; - int saved_ca1_cnt; -@@ -309,14 +350,14 @@ static void test_update_ca(void) - link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); - ASSERT_OK_PTR(link, "attach_struct_ops"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - saved_ca1_cnt = skel->bss->ca1_cnt; - ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt"); - - err = bpf_link__update_map(link, skel->maps.ca_update_2); - ASSERT_OK(err, "update_map"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt"); - ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt"); - -@@ -326,6 +367,13 @@ static void test_update_ca(void) - - static void test_update_wrong(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "tcp_ca_update", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct tcp_ca_update *skel; - struct bpf_link *link; - 
int saved_ca1_cnt; -@@ -338,14 +386,14 @@ static void test_update_wrong(void) - link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); - ASSERT_OK_PTR(link, "attach_struct_ops"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - saved_ca1_cnt = skel->bss->ca1_cnt; - ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt"); - - err = bpf_link__update_map(link, skel->maps.ca_wrong); - ASSERT_ERR(err, "update_map"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - ASSERT_GT(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt"); - - bpf_link__destroy(link); -@@ -354,6 +402,13 @@ static void test_update_wrong(void) - - static void test_mixed_links(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "tcp_ca_update", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct tcp_ca_update *skel; - struct bpf_link *link, *link_nl; - int err; -@@ -368,7 +423,7 @@ static void test_mixed_links(void) - link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); - ASSERT_OK_PTR(link, "attach_struct_ops"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - ASSERT_GT(skel->bss->ca1_cnt, 0, "ca1_ca1_cnt"); - - err = bpf_link__update_map(link, skel->maps.ca_no_link); -@@ -455,6 +510,13 @@ static void test_tcp_ca_kfunc(void) - - static void test_cc_cubic(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "bpf_cc_cubic", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct bpf_cc_cubic *cc_cubic_skel; - struct bpf_link *link; - -@@ -468,7 +530,7 @@ static void test_cc_cubic(void) - return; - } - -- do_test("bpf_cc_cubic", NULL); -+ do_test(&opts, NULL); - - bpf_link__destroy(link); - bpf_cc_cubic__destroy(cc_cubic_skel); -diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c -index 4c6ada5b270b..73f669014b69 100644 ---- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c -+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c -@@ -45,12 +45,6 @@ static int check_load(const char *file, enum bpf_prog_type type) - return err; - } - --struct scale_test_def { -- const char *file; -- enum bpf_prog_type attach_type; -- bool fails; --}; -- - static void scale_test(const char *file, - enum bpf_prog_type attach_type, - bool should_fail) -diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c -index 3b7c57fe55a5..08b6391f2f56 100644 ---- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c -+++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c -@@ -69,15 +69,17 @@ static struct test_case test_cases[] = { - { - N(SCHED_CLS, struct __sk_buff, tstamp), - .read = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" -- "w11 &= 3;" -- "if w11 != 0x3 goto pc+2;" -+ "if w11 & 0x4 goto pc+1;" -+ "goto pc+4;" -+ "if w11 & 0x3 goto pc+1;" -+ "goto pc+2;" - "$dst = 0;" - "goto pc+1;" - "$dst = *(u64 *)($ctx + sk_buff::tstamp);", - .write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" -- "if w11 & 0x2 goto pc+1;" -+ "if w11 & 0x4 goto pc+1;" - "goto pc+2;" -- "w11 &= -2;" -+ "w11 &= -4;" - "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;" - "*(u64 *)($ctx + sk_buff::tstamp) = $src;", - }, -diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c -index 1d3a20f01b60..7cd8be2780ca 100644 ---- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c -+++ 
b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c -@@ -70,7 +70,7 @@ static void *server_thread(void *arg) - return (void *)(long)err; - } - --static int custom_cb(int fd, const struct post_socket_opts *opts) -+static int custom_cb(int fd, void *opts) - { - char buf; - int err; -diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c -index b1073d36d77a..327d51f59142 100644 ---- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c -+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c -@@ -890,9 +890,6 @@ static void test_udp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd) - - ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0, - dtime_cnt_str(t, INGRESS_FWDNS_P100)); -- /* non mono delivery time is not forwarded */ -- ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0, -- dtime_cnt_str(t, INGRESS_FWDNS_P101)); - for (i = EGRESS_FWDNS_P100; i < SET_DTIME; i++) - ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i)); - -diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c -index 29e183a80f49..bbcf12696a6b 100644 ---- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c -+++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c -@@ -3,9 +3,12 @@ - #include - #include - -+#include -+ - #include "struct_ops_module.skel.h" - #include "struct_ops_nulled_out_cb.skel.h" - #include "struct_ops_forgotten_cb.skel.h" -+#include "struct_ops_detach.skel.h" - - static void check_map_info(struct bpf_map_info *info) - { -@@ -242,6 +245,58 @@ static void test_struct_ops_forgotten_cb(void) - struct_ops_forgotten_cb__destroy(skel); - } - -+/* Detach a link from a user space program */ -+static void test_detach_link(void) -+{ -+ struct epoll_event ev, events[2]; -+ struct struct_ops_detach *skel; -+ struct bpf_link *link = NULL; -+ int fd, epollfd = -1, nfds; -+ int err; -+ -+ skel = struct_ops_detach__open_and_load(); -+ if (!ASSERT_OK_PTR(skel, "struct_ops_detach__open_and_load")) -+ return; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.testmod_do_detach); -+ if (!ASSERT_OK_PTR(link, "attach_struct_ops")) -+ goto cleanup; -+ -+ fd = bpf_link__fd(link); -+ if (!ASSERT_GE(fd, 0, "link_fd")) -+ goto cleanup; -+ -+ epollfd = epoll_create1(0); -+ if (!ASSERT_GE(epollfd, 0, "epoll_create1")) -+ goto cleanup; -+ -+ ev.events = EPOLLHUP; -+ ev.data.fd = fd; -+ err = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev); -+ if (!ASSERT_OK(err, "epoll_ctl")) -+ goto cleanup; -+ -+ err = bpf_link__detach(link); -+ if (!ASSERT_OK(err, "detach_link")) -+ goto cleanup; -+ -+ /* Wait for EPOLLHUP */ -+ nfds = epoll_wait(epollfd, events, 2, 500); -+ if (!ASSERT_EQ(nfds, 1, "epoll_wait")) -+ goto cleanup; -+ -+ if (!ASSERT_EQ(events[0].data.fd, fd, "epoll_wait_fd")) -+ goto cleanup; -+ if (!ASSERT_TRUE(events[0].events & EPOLLHUP, "events[0].events")) -+ goto cleanup; -+ -+cleanup: -+ if (epollfd >= 0) -+ close(epollfd); -+ bpf_link__destroy(link); -+ struct_ops_detach__destroy(skel); -+} -+ - void serial_test_struct_ops_module(void) - { - if (test__start_subtest("struct_ops_load")) -@@ -254,5 +309,7 @@ void serial_test_struct_ops_module(void) - test_struct_ops_nulled_out_cb(); - if (test__start_subtest("struct_ops_forgotten_cb")) - test_struct_ops_forgotten_cb(); -+ if (test__start_subtest("test_detach_link")) -+ test_detach_link(); - } - -diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c 
-index 1c9c4ec1be11..6816ff064516 100644 ---- a/tools/testing/selftests/bpf/prog_tests/verifier.c -+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c -@@ -86,6 +86,7 @@ - #include "verifier_xadd.skel.h" - #include "verifier_xdp.skel.h" - #include "verifier_xdp_direct_packet_access.skel.h" -+#include "verifier_bits_iter.skel.h" - - #define MAX_ENTRIES 11 - -@@ -202,6 +203,7 @@ void test_verifier_var_off(void) { RUN(verifier_var_off); } - void test_verifier_xadd(void) { RUN(verifier_xadd); } - void test_verifier_xdp(void) { RUN(verifier_xdp); } - void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } -+void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } - - static int init_test_val_map(struct bpf_object *obj, char *map_name) - { -diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c -index c5969ca6f26b..564835ba7d51 100644 ---- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c -+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c -@@ -6,12 +6,6 @@ - - char _license[] SEC("license") = "GPL"; - --struct key_t { -- int a; -- int b; -- int c; --}; -- - struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 3); -diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c -index 85fa710fad90..9f0e0705b2bf 100644 ---- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c -+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c -@@ -6,12 +6,6 @@ - - char _license[] SEC("license") = "GPL"; - --struct key_t { -- int a; -- int b; -- int c; --}; -- - struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, 3); -diff --git a/tools/testing/selftests/bpf/progs/struct_ops_detach.c b/tools/testing/selftests/bpf/progs/struct_ops_detach.c -new file mode 100644 -index 000000000000..56b787a89876 ---- /dev/null -+++ b/tools/testing/selftests/bpf/progs/struct_ops_detach.c -@@ -0,0 +1,10 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ -+#include -+#include -+#include "../bpf_testmod/bpf_testmod.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC(".struct_ops.link") -+struct bpf_testmod_ops testmod_do_detach; -diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c -index 77ad8adf68da..0289d8ce2b80 100644 ---- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c -+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c -@@ -9,10 +9,14 @@ - #define EINVAL 22 - #define ENOENT 2 - -+#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) -+#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) -+ - extern unsigned long CONFIG_HZ __kconfig; - - int test_einval_bpf_tuple = 0; - int test_einval_reserved = 0; -+int test_einval_reserved_new = 0; - int test_einval_netns_id = 0; - int test_einval_len_opts = 0; - int test_eproto_l4proto = 0; -@@ -22,6 +26,11 @@ int test_eafnosupport = 0; - int test_alloc_entry = -EINVAL; - int test_insert_entry = -EAFNOSUPPORT; - int test_succ_lookup = -ENOENT; -+int test_ct_zone_id_alloc_entry = -EINVAL; -+int test_ct_zone_id_insert_entry = -EAFNOSUPPORT; -+int test_ct_zone_id_succ_lookup = -ENOENT; -+int test_ct_zone_dir_enoent_lookup = 0; -+int test_ct_zone_id_enoent_lookup = 0; - u32 test_delta_timeout = 0; - u32 test_status = 0; - u32 test_insert_lookup_mark = 0; -@@ -45,6 +54,17 @@ struct bpf_ct_opts___local { - s32 netns_id; - s32 error; - u8 l4proto; -+ u8 dir; -+ u8 reserved[2]; -+}; -+ -+struct bpf_ct_opts___new { -+ s32 netns_id; -+ s32 error; -+ u8 l4proto; -+ u8 dir; -+ u16 ct_zone_id; -+ u8 ct_zone_dir; - u8 reserved[3]; - } __attribute__((preserve_access_index)); - -@@ -220,10 +240,97 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, - } - } - -+static __always_inline void -+nf_ct_opts_new_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, -+ struct bpf_ct_opts___new *, u32), -+ struct nf_conn *(*alloc_fn)(void *, struct bpf_sock_tuple *, u32, -+ struct bpf_ct_opts___new *, u32), -+ void *ctx) -+{ -+ struct bpf_ct_opts___new opts_def = { .l4proto = IPPROTO_TCP, .netns_id = -1 }; -+ struct bpf_sock_tuple bpf_tuple; -+ struct nf_conn *ct; -+ -+ __builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4)); -+ -+ opts_def.reserved[0] = 1; -+ ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, -+ sizeof(opts_def)); -+ opts_def.reserved[0] = 0; -+ if (ct) -+ bpf_ct_release(ct); -+ else -+ test_einval_reserved_new = opts_def.error; -+ -+ bpf_tuple.ipv4.saddr = bpf_get_prandom_u32(); /* src IP */ -+ bpf_tuple.ipv4.daddr = bpf_get_prandom_u32(); /* dst IP */ -+ bpf_tuple.ipv4.sport = bpf_get_prandom_u32(); /* src port */ -+ bpf_tuple.ipv4.dport = bpf_get_prandom_u32(); /* dst port */ -+ -+ /* use non-default ct zone */ -+ opts_def.ct_zone_id = 10; -+ opts_def.ct_zone_dir = NF_CT_ZONE_DIR_ORIG; -+ ct = alloc_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, -+ sizeof(opts_def)); -+ if (ct) { -+ __u16 sport = bpf_get_prandom_u32(); -+ __u16 dport = bpf_get_prandom_u32(); -+ union nf_inet_addr saddr = {}; -+ union nf_inet_addr daddr = {}; -+ struct nf_conn *ct_ins; -+ -+ bpf_ct_set_timeout(ct, 10000); -+ -+ /* snat */ -+ saddr.ip = bpf_get_prandom_u32(); -+ bpf_ct_set_nat_info(ct, &saddr, sport, NF_NAT_MANIP_SRC___local); -+ /* dnat */ -+ daddr.ip = bpf_get_prandom_u32(); -+ bpf_ct_set_nat_info(ct, &daddr, dport, NF_NAT_MANIP_DST___local); -+ -+ ct_ins = bpf_ct_insert_entry(ct); -+ if (ct_ins) { -+ struct nf_conn *ct_lk; -+ -+ /* entry should exist in same ct zone we inserted it 
*/ -+ ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), -+ &opts_def, sizeof(opts_def)); -+ if (ct_lk) { -+ bpf_ct_release(ct_lk); -+ test_ct_zone_id_succ_lookup = 0; -+ } -+ -+ /* entry should not exist with wrong direction */ -+ opts_def.ct_zone_dir = NF_CT_ZONE_DIR_REPL; -+ ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), -+ &opts_def, sizeof(opts_def)); -+ opts_def.ct_zone_dir = NF_CT_ZONE_DIR_ORIG; -+ if (ct_lk) -+ bpf_ct_release(ct_lk); -+ else -+ test_ct_zone_dir_enoent_lookup = opts_def.error; -+ -+ /* entry should not exist in default ct zone */ -+ opts_def.ct_zone_id = 0; -+ ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), -+ &opts_def, sizeof(opts_def)); -+ if (ct_lk) -+ bpf_ct_release(ct_lk); -+ else -+ test_ct_zone_id_enoent_lookup = opts_def.error; -+ -+ bpf_ct_release(ct_ins); -+ test_ct_zone_id_insert_entry = 0; -+ } -+ test_ct_zone_id_alloc_entry = 0; -+ } -+} -+ - SEC("xdp") - int nf_xdp_ct_test(struct xdp_md *ctx) - { - nf_ct_test((void *)bpf_xdp_ct_lookup, (void *)bpf_xdp_ct_alloc, ctx); -+ nf_ct_opts_new_test((void *)bpf_xdp_ct_lookup, (void *)bpf_xdp_ct_alloc, ctx); - return 0; - } - -@@ -231,6 +338,7 @@ SEC("tc") - int nf_skb_ct_test(struct __sk_buff *ctx) - { - nf_ct_test((void *)bpf_skb_ct_lookup, (void *)bpf_skb_ct_alloc, ctx); -+ nf_ct_opts_new_test((void *)bpf_skb_ct_lookup, (void *)bpf_skb_ct_alloc, ctx); - return 0; - } - -diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h -index 99d2ea9fb658..f48f85f1bd70 100644 ---- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h -+++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h -@@ -92,7 +92,7 @@ struct { - __uint(value_size, sizeof(int)); - } tls_sock_map SEC(".maps"); - --SEC("sk_skb1") -+SEC("sk_skb/stream_parser") - int bpf_prog1(struct __sk_buff *skb) - { - int *f, two = 2; -@@ -104,7 +104,7 @@ int bpf_prog1(struct __sk_buff *skb) - return skb->len; - } - --SEC("sk_skb2") -+SEC("sk_skb/stream_verdict") - int bpf_prog2(struct __sk_buff *skb) - { - __u32 lport = skb->local_port; -@@ -151,7 +151,7 @@ static inline void bpf_write_pass(struct __sk_buff *skb, int offset) - memcpy(c + offset, "PASS", 4); - } - --SEC("sk_skb3") -+SEC("sk_skb/stream_verdict") - int bpf_prog3(struct __sk_buff *skb) - { - int err, *f, ret = SK_PASS; -@@ -177,9 +177,6 @@ int bpf_prog3(struct __sk_buff *skb) - return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); - #endif - } -- f = bpf_map_lookup_elem(&sock_skb_opts, &one); -- if (f && *f) -- ret = SK_DROP; - err = bpf_skb_adjust_room(skb, 4, 0, 0); - if (err) - return SK_DROP; -@@ -233,7 +230,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops) - return 0; - } - --SEC("sk_msg1") -+SEC("sk_msg") - int bpf_prog4(struct sk_msg_md *msg) - { - int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; -@@ -263,7 +260,7 @@ int bpf_prog4(struct sk_msg_md *msg) - return SK_PASS; - } - --SEC("sk_msg2") -+SEC("sk_msg") - int bpf_prog6(struct sk_msg_md *msg) - { - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; -@@ -308,7 +305,7 @@ int bpf_prog6(struct sk_msg_md *msg) - #endif - } - --SEC("sk_msg3") -+SEC("sk_msg") - int bpf_prog8(struct sk_msg_md *msg) - { - void *data_end = (void *)(long) msg->data_end; -@@ -329,7 +326,8 @@ int bpf_prog8(struct sk_msg_md *msg) - - return SK_PASS; - } --SEC("sk_msg4") -+ -+SEC("sk_msg") - int bpf_prog9(struct sk_msg_md *msg) - { - void *data_end = (void *)(long) msg->data_end; -@@ -347,7 +345,7 @@ int 
bpf_prog9(struct sk_msg_md *msg) - return SK_PASS; - } - --SEC("sk_msg5") -+SEC("sk_msg") - int bpf_prog10(struct sk_msg_md *msg) - { - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; -diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c -index 74ec09f040b7..ca8e8734d901 100644 ---- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c -+++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c -@@ -222,17 +222,21 @@ int egress_host(struct __sk_buff *skb) - return TC_ACT_OK; - - if (skb_proto(skb_type) == IPPROTO_TCP) { -- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO && -+ if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC && - skb->tstamp) - inc_dtimes(EGRESS_ENDHOST); - else - inc_errs(EGRESS_ENDHOST); -- } else { -- if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC && -+ } else if (skb_proto(skb_type) == IPPROTO_UDP) { -+ if (skb->tstamp_type == BPF_SKB_CLOCK_TAI && - skb->tstamp) - inc_dtimes(EGRESS_ENDHOST); - else - inc_errs(EGRESS_ENDHOST); -+ } else { -+ if (skb->tstamp_type == BPF_SKB_CLOCK_REALTIME && -+ skb->tstamp) -+ inc_errs(EGRESS_ENDHOST); - } - - skb->tstamp = EGRESS_ENDHOST_MAGIC; -@@ -252,7 +256,7 @@ int ingress_host(struct __sk_buff *skb) - if (!skb_type) - return TC_ACT_OK; - -- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO && -+ if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC && - skb->tstamp == EGRESS_FWDNS_MAGIC) - inc_dtimes(INGRESS_ENDHOST); - else -@@ -315,7 +319,6 @@ int egress_fwdns_prio100(struct __sk_buff *skb) - SEC("tc") - int ingress_fwdns_prio101(struct __sk_buff *skb) - { -- __u64 expected_dtime = EGRESS_ENDHOST_MAGIC; - int skb_type; - - skb_type = skb_get_type(skb); -@@ -323,29 +326,24 @@ int ingress_fwdns_prio101(struct __sk_buff *skb) - /* Should have handled in prio100 */ - return TC_ACT_SHOT; - -- if (skb_proto(skb_type) == IPPROTO_UDP) -- expected_dtime = 0; -- - if (skb->tstamp_type) { - if (fwdns_clear_dtime() || -- skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO || -- skb->tstamp != expected_dtime) -+ (skb->tstamp_type != BPF_SKB_CLOCK_MONOTONIC && -+ skb->tstamp_type != BPF_SKB_CLOCK_TAI) || -+ skb->tstamp != EGRESS_ENDHOST_MAGIC) - inc_errs(INGRESS_FWDNS_P101); - else - inc_dtimes(INGRESS_FWDNS_P101); - } else { -- if (!fwdns_clear_dtime() && expected_dtime) -+ if (!fwdns_clear_dtime()) - inc_errs(INGRESS_FWDNS_P101); - } - -- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) { -+ if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC) { - skb->tstamp = INGRESS_FWDNS_MAGIC; - } else { - if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, -- BPF_SKB_TSTAMP_DELIVERY_MONO)) -- inc_errs(SET_DTIME); -- if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, -- BPF_SKB_TSTAMP_UNSPEC)) -+ BPF_SKB_CLOCK_MONOTONIC)) - inc_errs(SET_DTIME); - } - -@@ -370,7 +368,7 @@ int egress_fwdns_prio101(struct __sk_buff *skb) - - if (skb->tstamp_type) { - if (fwdns_clear_dtime() || -- skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO || -+ skb->tstamp_type != BPF_SKB_CLOCK_MONOTONIC || - skb->tstamp != INGRESS_FWDNS_MAGIC) - inc_errs(EGRESS_FWDNS_P101); - else -@@ -380,14 +378,11 @@ int egress_fwdns_prio101(struct __sk_buff *skb) - inc_errs(EGRESS_FWDNS_P101); - } - -- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) { -+ if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC) { - skb->tstamp = EGRESS_FWDNS_MAGIC; - } else { - if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC, -- BPF_SKB_TSTAMP_DELIVERY_MONO)) -- inc_errs(SET_DTIME); -- if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, -- 
BPF_SKB_TSTAMP_UNSPEC)) -+ BPF_SKB_CLOCK_MONOTONIC)) - inc_errs(SET_DTIME); - } - -diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c -new file mode 100644 -index 000000000000..716113c2bce2 ---- /dev/null -+++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c -@@ -0,0 +1,153 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+/* Copyright (c) 2024 Yafang Shao */ -+ -+#include "vmlinux.h" -+#include -+#include -+ -+#include "bpf_misc.h" -+#include "task_kfunc_common.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, -+ u32 nr_bits) __ksym __weak; -+int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym __weak; -+void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym __weak; -+ -+SEC("iter.s/cgroup") -+__description("bits iter without destroy") -+__failure __msg("Unreleased reference") -+int BPF_PROG(no_destroy, struct bpf_iter_meta *meta, struct cgroup *cgrp) -+{ -+ struct bpf_iter_bits it; -+ u64 data = 1; -+ -+ bpf_iter_bits_new(&it, &data, 1); -+ bpf_iter_bits_next(&it); -+ return 0; -+} -+ -+SEC("iter/cgroup") -+__description("uninitialized iter in ->next()") -+__failure __msg("expected an initialized iter_bits as arg #1") -+int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) -+{ -+ struct bpf_iter_bits *it = NULL; -+ -+ bpf_iter_bits_next(it); -+ return 0; -+} -+ -+SEC("iter/cgroup") -+__description("uninitialized iter in ->destroy()") -+__failure __msg("expected an initialized iter_bits as arg #1") -+int BPF_PROG(destroy_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) -+{ -+ struct bpf_iter_bits it = {}; -+ -+ bpf_iter_bits_destroy(&it); -+ return 0; -+} -+ -+SEC("syscall") -+__description("null pointer") -+__success __retval(0) -+int null_pointer(void) -+{ -+ int nr = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, NULL, 1) -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("bits copy") -+__success __retval(10) -+int bits_copy(void) -+{ -+ u64 data = 0xf7310UL; /* 4 + 3 + 2 + 1 + 0*/ -+ int nr = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, &data, 1) -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("bits memalloc") -+__success __retval(64) -+int bits_memalloc(void) -+{ -+ u64 data[2]; -+ int nr = 0; -+ int *bit; -+ -+ __builtin_memset(&data, 0xf0, sizeof(data)); /* 4 * 16 */ -+ bpf_for_each(bits, bit, &data[0], sizeof(data) / sizeof(u64)) -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("bit index") -+__success __retval(8) -+int bit_index(void) -+{ -+ u64 data = 0x100; -+ int bit_idx = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, &data, 1) { -+ if (*bit == 0) -+ continue; -+ bit_idx = *bit; -+ } -+ return bit_idx; -+} -+ -+SEC("syscall") -+__description("bits nomem") -+__success __retval(0) -+int bits_nomem(void) -+{ -+ u64 data[4]; -+ int nr = 0; -+ int *bit; -+ -+ __builtin_memset(&data, 0xff, sizeof(data)); -+ bpf_for_each(bits, bit, &data[0], 513) /* Be greater than 512 */ -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("fewer words") -+__success __retval(1) -+int fewer_words(void) -+{ -+ u64 data[2] = {0x1, 0xff}; -+ int nr = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, &data[0], 1) -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("zero words") -+__success __retval(0) -+int zero_words(void) -+{ -+ u64 data[2] = {0x1, 0xff}; -+ int nr = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, &data[0], 0) -+ nr++; -+ return nr; -+} 
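For reference, the usage pattern these bits-iterator tests exercise, pulled out into one place: the third argument to bpf_iter_bits_new() is a count of u64 words (the bits_nomem case above shows that asking for more than 512 words simply yields no iterations), bpf_for_each(bits, ...) wraps the new/next/destroy kfuncs, and each iteration yields the index of the next set bit. A minimal sketch, assuming the same includes and helper declarations as the tests above; the program name and expected return value are illustrative:

SEC("syscall")
__description("highest set bit")
__success __retval(127)
int highest_set_bit(void)
{
	u64 words[2] = { 0, 0x8000000000000000ULL };	/* only bit 127 is set */
	int last = -1;
	int *bit;

	/* *bit is the bit index; with a single set bit the loop body runs once */
	bpf_for_each(bits, bit, &words[0], 2)
		last = *bit;

	return last;
}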
-diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c -index 92752f5eeded..9cba4ec844a5 100644 ---- a/tools/testing/selftests/bpf/test_sockmap.c -+++ b/tools/testing/selftests/bpf/test_sockmap.c -@@ -63,7 +63,8 @@ int passed; - int failed; - int map_fd[9]; - struct bpf_map *maps[9]; --int prog_fd[11]; -+struct bpf_program *progs[9]; -+struct bpf_link *links[9]; - - int txmsg_pass; - int txmsg_redir; -@@ -680,7 +681,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, - } - } - -- s->bytes_recvd += recv; -+ if (recv > 0) -+ s->bytes_recvd += recv; - - if (opt->check_recved_len && s->bytes_recvd > total_bytes) { - errno = EMSGSIZE; -@@ -952,7 +954,8 @@ enum { - - static int run_options(struct sockmap_options *options, int cg_fd, int test) - { -- int i, key, next_key, err, tx_prog_fd = -1, zero = 0; -+ int i, key, next_key, err, zero = 0; -+ struct bpf_program *tx_prog; - - /* If base test skip BPF setup */ - if (test == BASE || test == BASE_SENDPAGE) -@@ -960,48 +963,44 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) - - /* Attach programs to sockmap */ - if (!txmsg_omit_skb_parser) { -- err = bpf_prog_attach(prog_fd[0], map_fd[0], -- BPF_SK_SKB_STREAM_PARSER, 0); -- if (err) { -+ links[0] = bpf_program__attach_sockmap(progs[0], map_fd[0]); -+ if (!links[0]) { - fprintf(stderr, -- "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", -- prog_fd[0], map_fd[0], err, strerror(errno)); -- return err; -+ "ERROR: bpf_program__attach_sockmap (sockmap %i->%i): (%s)\n", -+ bpf_program__fd(progs[0]), map_fd[0], strerror(errno)); -+ return -1; - } - } - -- err = bpf_prog_attach(prog_fd[1], map_fd[0], -- BPF_SK_SKB_STREAM_VERDICT, 0); -- if (err) { -- fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", -- err, strerror(errno)); -- return err; -+ links[1] = bpf_program__attach_sockmap(progs[1], map_fd[0]); -+ if (!links[1]) { -+ fprintf(stderr, "ERROR: bpf_program__attach_sockmap (sockmap): (%s)\n", -+ strerror(errno)); -+ return -1; - } - - /* Attach programs to TLS sockmap */ - if (txmsg_ktls_skb) { - if (!txmsg_omit_skb_parser) { -- err = bpf_prog_attach(prog_fd[0], map_fd[8], -- BPF_SK_SKB_STREAM_PARSER, 0); -- if (err) { -+ links[2] = bpf_program__attach_sockmap(progs[0], map_fd[8]); -+ if (!links[2]) { - fprintf(stderr, -- "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", -- prog_fd[0], map_fd[8], err, strerror(errno)); -- return err; -+ "ERROR: bpf_program__attach_sockmap (TLS sockmap %i->%i): (%s)\n", -+ bpf_program__fd(progs[0]), map_fd[8], strerror(errno)); -+ return -1; - } - } - -- err = bpf_prog_attach(prog_fd[2], map_fd[8], -- BPF_SK_SKB_STREAM_VERDICT, 0); -- if (err) { -- fprintf(stderr, "ERROR: bpf_prog_attach (TLS sockmap): %d (%s)\n", -- err, strerror(errno)); -- return err; -+ links[3] = bpf_program__attach_sockmap(progs[2], map_fd[8]); -+ if (!links[3]) { -+ fprintf(stderr, "ERROR: bpf_program__attach_sockmap (TLS sockmap): (%s)\n", -+ strerror(errno)); -+ return -1; - } - } - - /* Attach to cgroups */ -- err = bpf_prog_attach(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS, 0); -+ err = bpf_prog_attach(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS, 0); - if (err) { - fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", - err, strerror(errno)); -@@ -1017,30 +1016,31 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) - - /* Attach txmsg program to sockmap */ - if (txmsg_pass) -- tx_prog_fd = prog_fd[4]; -+ tx_prog = progs[4]; - 
else if (txmsg_redir) -- tx_prog_fd = prog_fd[5]; -+ tx_prog = progs[5]; - else if (txmsg_apply) -- tx_prog_fd = prog_fd[6]; -+ tx_prog = progs[6]; - else if (txmsg_cork) -- tx_prog_fd = prog_fd[7]; -+ tx_prog = progs[7]; - else if (txmsg_drop) -- tx_prog_fd = prog_fd[8]; -+ tx_prog = progs[8]; - else -- tx_prog_fd = 0; -+ tx_prog = NULL; - -- if (tx_prog_fd) { -- int redir_fd, i = 0; -+ if (tx_prog) { -+ int redir_fd; - -- err = bpf_prog_attach(tx_prog_fd, -- map_fd[1], BPF_SK_MSG_VERDICT, 0); -- if (err) { -+ links[4] = bpf_program__attach_sockmap(tx_prog, map_fd[1]); -+ if (!links[4]) { - fprintf(stderr, -- "ERROR: bpf_prog_attach (txmsg): %d (%s)\n", -- err, strerror(errno)); -+ "ERROR: bpf_program__attach_sockmap (txmsg): (%s)\n", -+ strerror(errno)); -+ err = -1; - goto out; - } - -+ i = 0; - err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY); - if (err) { - fprintf(stderr, -@@ -1279,16 +1279,14 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) - fprintf(stderr, "unknown test\n"); - out: - /* Detatch and zero all the maps */ -- bpf_prog_detach2(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS); -- bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER); -- bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT); -- bpf_prog_detach2(prog_fd[0], map_fd[8], BPF_SK_SKB_STREAM_PARSER); -- bpf_prog_detach2(prog_fd[2], map_fd[8], BPF_SK_SKB_STREAM_VERDICT); -+ bpf_prog_detach2(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS); - -- if (tx_prog_fd >= 0) -- bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT); -+ for (i = 0; i < ARRAY_SIZE(links); i++) { -+ if (links[i]) -+ bpf_link__detach(links[i]); -+ } - -- for (i = 0; i < 8; i++) { -+ for (i = 0; i < ARRAY_SIZE(map_fd); i++) { - key = next_key = 0; - bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY); - while (bpf_map_get_next_key(map_fd[i], &key, &next_key) == 0) { -@@ -1783,34 +1781,6 @@ char *map_names[] = { - "tls_sock_map", - }; - --int prog_attach_type[] = { -- BPF_SK_SKB_STREAM_PARSER, -- BPF_SK_SKB_STREAM_VERDICT, -- BPF_SK_SKB_STREAM_VERDICT, -- BPF_CGROUP_SOCK_OPS, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, --}; -- --int prog_type[] = { -- BPF_PROG_TYPE_SK_SKB, -- BPF_PROG_TYPE_SK_SKB, -- BPF_PROG_TYPE_SK_SKB, -- BPF_PROG_TYPE_SOCK_OPS, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, --}; -- - static int populate_progs(char *bpf_file) - { - struct bpf_program *prog; -@@ -1829,17 +1799,10 @@ static int populate_progs(char *bpf_file) - return -1; - } - -- bpf_object__for_each_program(prog, obj) { -- bpf_program__set_type(prog, prog_type[i]); -- bpf_program__set_expected_attach_type(prog, -- prog_attach_type[i]); -- i++; -- } -- - i = bpf_object__load(obj); - i = 0; - bpf_object__for_each_program(prog, obj) { -- prog_fd[i] = bpf_program__fd(prog); -+ progs[i] = prog; - i++; - } - -@@ -1853,6 +1816,9 @@ static int populate_progs(char *bpf_file) - } - } - -+ for (i = 0; i < ARRAY_SIZE(links); i++) -+ links[i] = NULL; -+ - return 0; - } - -diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c -index 7b5fc98838cd..aebc58c24dc5 100644 ---- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c -+++ 
b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c -@@ -139,14 +139,14 @@ static int run_test(int server_fd, int results_fd, bool xdp) - return ret; - } - --static int v6only_true(int fd, const struct post_socket_opts *opts) -+static int v6only_true(int fd, void *opts) - { - int mode = true; - - return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); - } - --static int v6only_false(int fd, const struct post_socket_opts *opts) -+static int v6only_false(int fd, void *opts) - { - int mode = false; - -diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c -index df04bda1c927..610392dfc4fb 100644 ---- a/tools/testing/selftests/bpf/test_verifier.c -+++ b/tools/testing/selftests/bpf/test_verifier.c -@@ -1237,11 +1237,6 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, - fixup_prog_kfuncs(prog, fd_array, test->fixup_kfunc_btf_id); - } - --struct libcap { -- struct __user_cap_header_struct hdr; -- struct __user_cap_data_struct data[2]; --}; -- - static int set_admin(bool admin) - { - int err; -diff --git a/tools/testing/selftests/sched_ext/.gitignore b/tools/testing/selftests/sched_ext/.gitignore -new file mode 100644 -index 000000000000..ae5491a114c0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/.gitignore -@@ -0,0 +1,6 @@ -+* -+!*.c -+!*.h -+!Makefile -+!.gitignore -+!config -diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile -new file mode 100644 -index 000000000000..0754a2c110a1 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/Makefile -@@ -0,0 +1,218 @@ -+# SPDX-License-Identifier: GPL-2.0 -+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+include ../../../build/Build.include -+include ../../../scripts/Makefile.arch -+include ../../../scripts/Makefile.include -+include ../lib.mk -+ -+ifneq ($(LLVM),) -+ifneq ($(filter %/,$(LLVM)),) -+LLVM_PREFIX := $(LLVM) -+else ifneq ($(filter -%,$(LLVM)),) -+LLVM_SUFFIX := $(LLVM) -+endif -+ -+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -+else -+CC := gcc -+endif # LLVM -+ -+ifneq ($(CROSS_COMPILE),) -+$(error CROSS_COMPILE not supported for scx selftests) -+endif # CROSS_COMPILE -+ -+CURDIR := $(abspath .) -+REPOROOT := $(abspath ../../../..) 
-+TOOLSDIR := $(REPOROOT)/tools -+LIBDIR := $(TOOLSDIR)/lib -+BPFDIR := $(LIBDIR)/bpf -+TOOLSINCDIR := $(TOOLSDIR)/include -+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool -+APIDIR := $(TOOLSINCDIR)/uapi -+GENDIR := $(REPOROOT)/include/generated -+GENHDR := $(GENDIR)/autoconf.h -+SCXTOOLSDIR := $(TOOLSDIR)/sched_ext -+SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include -+ -+OUTPUT_DIR := $(CURDIR)/build -+OBJ_DIR := $(OUTPUT_DIR)/obj -+INCLUDE_DIR := $(OUTPUT_DIR)/include -+BPFOBJ_DIR := $(OBJ_DIR)/libbpf -+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext -+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a -+LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a -+DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool -+HOST_BUILD_DIR := $(OBJ_DIR) -+HOST_OUTPUT_DIR := $(OUTPUT_DIR) -+ -+VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ -+ /sys/kernel/btf/vmlinux \ -+ /boot/vmlinux-$(shell uname -r) -+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -+ifeq ($(VMLINUX_BTF),) -+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -+endif -+ -+BPFTOOL ?= $(DEFAULT_BPFTOOL) -+ -+ifneq ($(wildcard $(GENHDR)),) -+ GENFLAGS := -DHAVE_GENHDR -+endif -+ -+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -+ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) -+ -+# Silence some warnings when compiled with clang -+ifneq ($(LLVM),) -+CFLAGS += -Wno-unused-command-line-argument -+endif -+ -+LDFLAGS = -lelf -lz -lpthread -lzstd -+ -+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ -+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -+$(shell $(1) -dM -E - $@ -+else -+ $(call msg,CP,,$@) -+ $(Q)cp "$(VMLINUX_H)" $@ -+endif -+ -+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) -+ $(call msg,CLNG-BPF,,$(notdir $@)) -+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -+ -+$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR) -+ $(eval sched=$(notdir $@)) -+ $(call msg,GEN-SKEL,,$(sched)) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) -+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) -+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ -+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -+ -+################ -+# C schedulers # -+################ -+ -+override define CLEAN -+ rm -rf $(OUTPUT_DIR) -+ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h -+ rm -f $(TEST_GEN_PROGS) -+ rm -f runner -+endef -+ -+# Every testcase takes all of the BPF progs are dependencies by default. This -+# allows testcases to load any BPF scheduler, which is useful for testcases -+# that don't need their own prog to run their test. 
-+all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog))) -+ -+auto-test-targets := \ -+ create_dsq \ -+ enq_last_no_enq_fails \ -+ enq_select_cpu_fails \ -+ ddsp_bogus_dsq_fail \ -+ ddsp_vtimelocal_fail \ -+ dsp_local_on \ -+ exit \ -+ hotplug \ -+ init_enable_count \ -+ maximal \ -+ maybe_null \ -+ minimal \ -+ prog_run \ -+ reload_loop \ -+ select_cpu_dfl \ -+ select_cpu_dfl_nodispatch \ -+ select_cpu_dispatch \ -+ select_cpu_dispatch_bad_dsq \ -+ select_cpu_dispatch_dbl_dsp \ -+ select_cpu_vtime \ -+ test_example \ -+ -+testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) -+ -+$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) -+ $(CC) $(CFLAGS) -c $< -o $@ -+ -+# Create all of the test targets object files, whose testcase objects will be -+# registered into the runner in ELF constructors. -+# -+# Note that we must do double expansion here in order to support conditionally -+# compiling BPF object files only if one is present, as the wildcard Make -+# function doesn't support using implicit rules otherwise. -+$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR) -+ $(eval test=$(patsubst %.o,%.c,$(notdir $@))) -+ $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o -+ -+$(SCXOBJ_DIR)/util.o: util.c | $(SCXOBJ_DIR) -+ $(CC) $(CFLAGS) -c $< -o $@ -+ -+runner: $(SCXOBJ_DIR)/runner.o $(SCXOBJ_DIR)/util.o $(BPFOBJ) $(testcase-targets) -+ @echo "$(testcase-targets)" -+ $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) -+ -+TEST_GEN_PROGS := runner -+ -+all: runner -+ -+.PHONY: all clean help -+ -+.DEFAULT_GOAL := all -+ -+.DELETE_ON_ERROR: -+ -+.SECONDARY: -diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config -new file mode 100644 -index 000000000000..0de9b4ee249d ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/config -@@ -0,0 +1,9 @@ -+CONFIG_SCHED_DEBUG=y -+CONFIG_SCHED_CLASS_EXT=y -+CONFIG_CGROUPS=y -+CONFIG_CGROUP_SCHED=y -+CONFIG_EXT_GROUP_SCHED=y -+CONFIG_BPF=y -+CONFIG_BPF_SYSCALL=y -+CONFIG_DEBUG_INFO=y -+CONFIG_DEBUG_INFO_BTF=y -diff --git a/tools/testing/selftests/sched_ext/create_dsq.bpf.c b/tools/testing/selftests/sched_ext/create_dsq.bpf.c -new file mode 100644 -index 000000000000..23f79ed343f0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/create_dsq.bpf.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Create and destroy DSQs in a loop. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+void BPF_STRUCT_OPS(create_dsq_exit_task, struct task_struct *p, -+ struct scx_exit_task_args *args) -+{ -+ scx_bpf_destroy_dsq(p->pid); -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ s32 err; -+ -+ err = scx_bpf_create_dsq(p->pid, -1); -+ if (err) -+ scx_bpf_error("Failed to create DSQ for %s[%d]", -+ p->comm, p->pid); -+ -+ return err; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init) -+{ -+ u32 i; -+ s32 err; -+ -+ bpf_for(i, 0, 1024) { -+ err = scx_bpf_create_dsq(i, -1); -+ if (err) { -+ scx_bpf_error("Failed to create DSQ %d", i); -+ return 0; -+ } -+ } -+ -+ bpf_for(i, 0, 1024) { -+ scx_bpf_destroy_dsq(i); -+ } -+ -+ return 0; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops create_dsq_ops = { -+ .init_task = create_dsq_init_task, -+ .exit_task = create_dsq_exit_task, -+ .init = create_dsq_init, -+ .name = "create_dsq", -+}; -diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c -new file mode 100644 -index 000000000000..fa946d9146d4 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/create_dsq.c -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include "create_dsq.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct create_dsq *skel; -+ -+ skel = create_dsq__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct create_dsq *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.create_dsq_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct create_dsq *skel = ctx; -+ -+ create_dsq__destroy(skel); -+} -+ -+struct scx_test create_dsq = { -+ .name = "create_dsq", -+ .description = "Create and destroy a dsq in a loop", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&create_dsq) -diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c -new file mode 100644 -index 000000000000..e97ad41d354a ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c -@@ -0,0 +1,42 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ -+ if (cpu >= 0) { -+ /* -+ * If we dispatch to a bogus DSQ that will fall back to the -+ * builtin global DSQ, we fail gracefully. 
-+ */ -+ scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, -+ p->scx.dsq_vtime, 0); -+ return cpu; -+ } -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { -+ .select_cpu = ddsp_bogus_dsq_fail_select_cpu, -+ .exit = ddsp_bogus_dsq_fail_exit, -+ .name = "ddsp_bogus_dsq_fail", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c -new file mode 100644 -index 000000000000..e65d22f23f3b ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "ddsp_bogus_dsq_fail.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel; -+ -+ skel = ddsp_bogus_dsq_fail__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel = ctx; -+ -+ ddsp_bogus_dsq_fail__destroy(skel); -+} -+ -+struct scx_test ddsp_bogus_dsq_fail = { -+ .name = "ddsp_bogus_dsq_fail", -+ .description = "Verify we gracefully fail, and fall back to using a " -+ "built-in DSQ, if we do a direct dispatch to an invalid" -+ " DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) -diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c -new file mode 100644 -index 000000000000..dde7e7dafbfb ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c -@@ -0,0 +1,39 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ -+ if (cpu >= 0) { -+ /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. 
*/ -+ scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, -+ p->scx.dsq_vtime, 0); -+ return cpu; -+ } -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops ddsp_vtimelocal_fail_ops = { -+ .select_cpu = ddsp_vtimelocal_fail_select_cpu, -+ .exit = ddsp_vtimelocal_fail_exit, -+ .name = "ddsp_vtimelocal_fail", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c -new file mode 100644 -index 000000000000..abafee587cd6 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include "ddsp_vtimelocal_fail.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel; -+ -+ skel = ddsp_vtimelocal_fail__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel = ctx; -+ -+ ddsp_vtimelocal_fail__destroy(skel); -+} -+ -+struct scx_test ddsp_vtimelocal_fail = { -+ .name = "ddsp_vtimelocal_fail", -+ .description = "Verify we gracefully fail, and fall back to using a " -+ "built-in DSQ, if we do a direct vtime dispatch to a " -+ "built-in DSQ from DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) -diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c -new file mode 100644 -index 000000000000..efb4672decb4 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c -@@ -0,0 +1,65 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+const volatile s32 nr_cpus; -+ -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 8192); -+ __type(value, s32); -+} queue SEC(".maps"); -+ -+s32 BPF_STRUCT_OPS(dsp_local_on_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ s32 pid = p->pid; -+ -+ if (bpf_map_push_elem(&queue, &pid, 0)) -+ scx_bpf_error("Failed to enqueue %s[%d]", p->comm, p->pid); -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ s32 pid, target; -+ struct task_struct *p; -+ -+ if (bpf_map_pop_elem(&queue, &pid)) -+ return; -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) -+ return; -+ -+ target = bpf_get_prandom_u32() % nr_cpus; -+ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops dsp_local_on_ops = { -+ .select_cpu = dsp_local_on_select_cpu, -+ .enqueue = dsp_local_on_enqueue, -+ .dispatch = dsp_local_on_dispatch, -+ .exit = dsp_local_on_exit, -+ .name = "dsp_local_on", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c -new file mode 100644 -index 000000000000..472851b56854 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/dsp_local_on.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include "dsp_local_on.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct dsp_local_on *skel; -+ -+ skel = dsp_local_on__open(); -+ SCX_FAIL_IF(!skel, "Failed to open"); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct dsp_local_on *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.dsp_local_on_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ /* Just sleeping is fine, plenty of scheduling events happening */ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct dsp_local_on *skel = ctx; -+ -+ dsp_local_on__destroy(skel); -+} -+ -+struct scx_test dsp_local_on = { -+ .name = "dsp_local_on", -+ .description = "Verify we can directly dispatch tasks to a local DSQs " -+ "from osp.dispatch()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&dsp_local_on) -diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c -new file mode 100644 -index 000000000000..b0b99531d5d5 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c -@@ -0,0 +1,21 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. 
-+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops enq_last_no_enq_fails_ops = { -+ .name = "enq_last_no_enq_fails", -+ /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ -+ .flags = SCX_OPS_ENQ_LAST, -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c -new file mode 100644 -index 000000000000..2a3eda5e2c0b ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c -@@ -0,0 +1,60 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "enq_last_no_enq_fails.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct enq_last_no_enq_fails *skel; -+ -+ skel = enq_last_no_enq_fails__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct enq_last_no_enq_fails *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); -+ if (link) { -+ SCX_ERR("Incorrectly succeeded in to attaching scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct enq_last_no_enq_fails *skel = ctx; -+ -+ enq_last_no_enq_fails__destroy(skel); -+} -+ -+struct scx_test enq_last_no_enq_fails = { -+ .name = "enq_last_no_enq_fails", -+ .description = "Verify we fail to load a scheduler if we specify " -+ "the SCX_OPS_ENQ_LAST flag without defining " -+ "ops.enqueue()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&enq_last_no_enq_fails) -diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c -new file mode 100644 -index 000000000000..b3dfc1033cd6 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+/* Manually specify the signature until the kfunc is added to the scx repo. */ -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, -+ bool *found) __ksym; -+ -+s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ /* -+ * Need to initialize the variable or the verifier will fail to load. -+ * Improving these semantics is actively being worked on. 
-+ */ -+ bool found = false; -+ -+ /* Can only call from ops.select_cpu() */ -+ scx_bpf_select_cpu_dfl(p, 0, 0, &found); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops enq_select_cpu_fails_ops = { -+ .select_cpu = enq_select_cpu_fails_select_cpu, -+ .enqueue = enq_select_cpu_fails_enqueue, -+ .name = "enq_select_cpu_fails", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c -new file mode 100644 -index 000000000000..dd1350e5f002 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c -@@ -0,0 +1,61 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "enq_select_cpu_fails.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct enq_select_cpu_fails *skel; -+ -+ skel = enq_select_cpu_fails__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct enq_select_cpu_fails *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ sleep(1); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct enq_select_cpu_fails *skel = ctx; -+ -+ enq_select_cpu_fails__destroy(skel); -+} -+ -+struct scx_test enq_select_cpu_fails = { -+ .name = "enq_select_cpu_fails", -+ .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " -+ "from ops.enqueue()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&enq_select_cpu_fails) -diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c -new file mode 100644 -index 000000000000..ae12ddaac921 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit.bpf.c -@@ -0,0 +1,84 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+#include "exit_test.h" -+ -+const volatile int exit_point; -+UEI_DEFINE(uei); -+ -+#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point) -+ -+s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ bool found; -+ -+ if (exit_point == EXIT_SELECT_CPU) -+ EXIT_CLEANLY(); -+ -+ return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found); -+} -+ -+void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ if (exit_point == EXIT_ENQUEUE) -+ EXIT_CLEANLY(); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) -+{ -+ if (exit_point == EXIT_DISPATCH) -+ EXIT_CLEANLY(); -+ -+ scx_bpf_consume(SCX_DSQ_GLOBAL); -+} -+ -+void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) -+{ -+ if (exit_point == EXIT_ENABLE) -+ EXIT_CLEANLY(); -+} -+ -+s32 BPF_STRUCT_OPS(exit_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ if (exit_point == EXIT_INIT_TASK) -+ EXIT_CLEANLY(); -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(exit_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init) -+{ -+ if (exit_point == EXIT_INIT) -+ EXIT_CLEANLY(); -+ -+ return 0; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops exit_ops = { -+ .select_cpu = exit_select_cpu, -+ .enqueue = exit_enqueue, -+ .dispatch = exit_dispatch, -+ .init_task = exit_init_task, -+ .enable = exit_enable, -+ .exit = exit_exit, -+ .init = exit_init, -+ .name = "exit", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c -new file mode 100644 -index 000000000000..31bcd06e21cd ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit.c -@@ -0,0 +1,55 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "exit.bpf.skel.h" -+#include "scx_test.h" -+ -+#include "exit_test.h" -+ -+static enum scx_test_status run(void *ctx) -+{ -+ enum exit_test_case tc; -+ -+ for (tc = 0; tc < NUM_EXITS; tc++) { -+ struct exit *skel; -+ struct bpf_link *link; -+ char buf[16]; -+ -+ skel = exit__open(); -+ skel->rodata->exit_point = tc; -+ exit__load(skel); -+ link = bpf_map__attach_struct_ops(skel->maps.exit_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ exit__destroy(skel); -+ return SCX_TEST_FAIL; -+ } -+ -+ /* Assumes uei.kind is written last */ -+ while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); -+ SCX_EQ(skel->data->uei.exit_code, tc); -+ sprintf(buf, "%d", tc); -+ SCX_ASSERT(!strcmp(skel->data->uei.msg, buf)); -+ bpf_link__destroy(link); -+ exit__destroy(skel); -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+struct scx_test exit_test = { -+ .name = "exit", -+ .description = "Verify we can cleanly exit a scheduler in multiple places", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&exit_test) -diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h -new file mode 100644 -index 000000000000..94f0268b9cb8 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit_test.h -@@ -0,0 +1,20 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#ifndef __EXIT_TEST_H__ -+#define __EXIT_TEST_H__ -+ -+enum exit_test_case { -+ EXIT_SELECT_CPU, -+ EXIT_ENQUEUE, -+ EXIT_DISPATCH, -+ EXIT_ENABLE, -+ EXIT_INIT_TASK, -+ EXIT_INIT, -+ NUM_EXITS, -+}; -+ -+#endif // # __EXIT_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/hotplug.bpf.c b/tools/testing/selftests/sched_ext/hotplug.bpf.c -new file mode 100644 -index 000000000000..8f2601db39f3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug.bpf.c -@@ -0,0 +1,61 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+#include "hotplug_test.h" -+ -+UEI_DEFINE(uei); -+ -+void BPF_STRUCT_OPS(hotplug_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+static void exit_from_hotplug(s32 cpu, bool onlining) -+{ -+ /* -+ * Ignored, just used to verify that we can invoke blocking kfuncs -+ * from the hotplug path. -+ */ -+ scx_bpf_create_dsq(0, -1); -+ -+ s64 code = SCX_ECODE_ACT_RESTART | HOTPLUG_EXIT_RSN; -+ -+ if (onlining) -+ code |= HOTPLUG_ONLINING; -+ -+ scx_bpf_exit(code, "hotplug event detected (%d going %s)", cpu, -+ onlining ? 
"online" : "offline"); -+} -+ -+void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_online, s32 cpu) -+{ -+ exit_from_hotplug(cpu, true); -+} -+ -+void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu) -+{ -+ exit_from_hotplug(cpu, false); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops hotplug_cb_ops = { -+ .cpu_online = hotplug_cpu_online, -+ .cpu_offline = hotplug_cpu_offline, -+ .exit = hotplug_exit, -+ .name = "hotplug_cbs", -+ .timeout_ms = 1000U, -+}; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops hotplug_nocb_ops = { -+ .exit = hotplug_exit, -+ .name = "hotplug_nocbs", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c -new file mode 100644 -index 000000000000..87bf220b1bce ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug.c -@@ -0,0 +1,168 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "hotplug_test.h" -+#include "hotplug.bpf.skel.h" -+#include "scx_test.h" -+#include "util.h" -+ -+const char *online_path = "/sys/devices/system/cpu/cpu1/online"; -+ -+static bool is_cpu_online(void) -+{ -+ return file_read_long(online_path) > 0; -+} -+ -+static void toggle_online_status(bool online) -+{ -+ long val = online ? 1 : 0; -+ int ret; -+ -+ ret = file_write_long(online_path, val); -+ if (ret != 0) -+ fprintf(stderr, "Failed to bring CPU %s (%s)", -+ online ? "online" : "offline", strerror(errno)); -+} -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ if (!is_cpu_online()) -+ return SCX_TEST_SKIP; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) -+{ -+ struct hotplug *skel; -+ struct bpf_link *link; -+ long kind, code; -+ -+ SCX_ASSERT(is_cpu_online()); -+ -+ skel = hotplug__open_and_load(); -+ SCX_ASSERT(skel); -+ -+ /* Testing the offline -> online path, so go offline before starting */ -+ if (onlining) -+ toggle_online_status(0); -+ -+ if (cbs_defined) { -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_BPF); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | HOTPLUG_EXIT_RSN; -+ if (onlining) -+ code |= HOTPLUG_ONLINING; -+ } else { -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | -+ SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); -+ } -+ -+ if (cbs_defined) -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_cb_ops); -+ else -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); -+ -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ hotplug__destroy(skel); -+ return SCX_TEST_FAIL; -+ } -+ -+ toggle_online_status(onlining ? 
1 : 0); -+ -+ while (!UEI_EXITED(skel, uei)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, kind); -+ SCX_EQ(UEI_REPORT(skel, uei), code); -+ -+ if (!onlining) -+ toggle_online_status(1); -+ -+ bpf_link__destroy(link); -+ hotplug__destroy(skel); -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status test_hotplug_attach(void) -+{ -+ struct hotplug *skel; -+ struct bpf_link *link; -+ enum scx_test_status status = SCX_TEST_PASS; -+ long kind, code; -+ -+ SCX_ASSERT(is_cpu_online()); -+ SCX_ASSERT(scx_hotplug_seq() > 0); -+ -+ skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); -+ SCX_ASSERT(skel); -+ -+ SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); -+ -+ /* -+ * Take the CPU offline to increment the global hotplug seq, which -+ * should cause attach to fail due to us setting the hotplug seq above -+ */ -+ toggle_online_status(0); -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); -+ -+ toggle_online_status(1); -+ -+ SCX_ASSERT(link); -+ while (!UEI_EXITED(skel, uei)) -+ sched_yield(); -+ -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | -+ SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); -+ SCX_EQ(skel->data->uei.kind, kind); -+ SCX_EQ(UEI_REPORT(skel, uei), code); -+ -+ bpf_link__destroy(link); -+ hotplug__destroy(skel); -+ -+ return status; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ -+#define HP_TEST(__onlining, __cbs_defined) ({ \ -+ if (test_hotplug(__onlining, __cbs_defined) != SCX_TEST_PASS) \ -+ return SCX_TEST_FAIL; \ -+}) -+ -+ HP_TEST(true, true); -+ HP_TEST(false, true); -+ HP_TEST(true, false); -+ HP_TEST(false, false); -+ -+#undef HP_TEST -+ -+ return test_hotplug_attach(); -+} -+ -+static void cleanup(void *ctx) -+{ -+ toggle_online_status(1); -+} -+ -+struct scx_test hotplug_test = { -+ .name = "hotplug", -+ .description = "Verify hotplug behavior", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&hotplug_test) -diff --git a/tools/testing/selftests/sched_ext/hotplug_test.h b/tools/testing/selftests/sched_ext/hotplug_test.h -new file mode 100644 -index 000000000000..73d236f90787 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug_test.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#ifndef __HOTPLUG_TEST_H__ -+#define __HOTPLUG_TEST_H__ -+ -+enum hotplug_test_flags { -+ HOTPLUG_EXIT_RSN = 1LLU << 0, -+ HOTPLUG_ONLINING = 1LLU << 1, -+}; -+ -+#endif // # __HOTPLUG_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/init_enable_count.bpf.c b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c -new file mode 100644 -index 000000000000..47ea89a626c3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c -@@ -0,0 +1,53 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that verifies that we do proper counting of init, enable, etc -+ * callbacks. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; -+u64 init_fork_cnt, init_transition_cnt; -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ __sync_fetch_and_add(&init_task_cnt, 1); -+ -+ if (args->fork) -+ __sync_fetch_and_add(&init_fork_cnt, 1); -+ else -+ __sync_fetch_and_add(&init_transition_cnt, 1); -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&exit_task_cnt, 1); -+} -+ -+void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&enable_cnt, 1); -+} -+ -+void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&disable_cnt, 1); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops init_enable_count_ops = { -+ .init_task = cnt_init_task, -+ .exit_task = cnt_exit_task, -+ .enable = cnt_enable, -+ .disable = cnt_disable, -+ .name = "init_enable_count", -+}; -diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c -new file mode 100644 -index 000000000000..ef9da0a50846 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/init_enable_count.c -@@ -0,0 +1,166 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_test.h" -+#include "init_enable_count.bpf.skel.h" -+ -+#define SCHED_EXT 7 -+ -+static struct init_enable_count * -+open_load_prog(bool global) -+{ -+ struct init_enable_count *skel; -+ -+ skel = init_enable_count__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ if (!global) -+ skel->struct_ops.init_enable_count_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL; -+ -+ SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); -+ -+ return skel; -+} -+ -+static enum scx_test_status run_test(bool global) -+{ -+ struct init_enable_count *skel; -+ struct bpf_link *link; -+ const u32 num_children = 5, num_pre_forks = 1024; -+ int ret, i, status; -+ struct sched_param param = {}; -+ pid_t pids[num_pre_forks]; -+ -+ skel = open_load_prog(global); -+ -+ /* -+ * Fork a bunch of children before we attach the scheduler so that we -+ * ensure (at least in practical terms) that there are more tasks that -+ * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that -+ * take the fork() path either below or in other processes. 
-+ */ -+ for (i = 0; i < num_pre_forks; i++) { -+ pids[i] = fork(); -+ SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ for (i = 0; i < num_pre_forks; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for pre-forked child\n"); -+ -+ SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, -+ status); -+ } -+ -+ bpf_link__destroy(link); -+ SCX_GE(skel->bss->init_task_cnt, num_pre_forks); -+ SCX_GE(skel->bss->exit_task_cnt, num_pre_forks); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ /* SCHED_EXT children */ -+ for (i = 0; i < num_children; i++) { -+ pids[i] = fork(); -+ SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); -+ -+ if (pids[i] == 0) { -+ ret = sched_setscheduler(0, SCHED_EXT, ¶m); -+ SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); -+ -+ /* -+ * Reset to SCHED_OTHER for half of them. Counts for -+ * everything should still be the same regardless, as -+ * ops.disable() is invoked even if a task is still on -+ * SCHED_EXT before it exits. -+ */ -+ if (i % 2 == 0) { -+ ret = sched_setscheduler(0, SCHED_OTHER, ¶m); -+ SCX_BUG_ON(ret, "Failed to reset sched to normal"); -+ } -+ exit(0); -+ } -+ } -+ for (i = 0; i < num_children; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for SCX child\n"); -+ -+ SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, -+ status); -+ } -+ -+ /* SCHED_OTHER children */ -+ for (i = 0; i < num_children; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) -+ exit(0); -+ } -+ -+ for (i = 0; i < num_children; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for normal child\n"); -+ -+ SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, -+ status); -+ } -+ -+ bpf_link__destroy(link); -+ -+ SCX_GE(skel->bss->init_task_cnt, 2 * num_children); -+ SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); -+ -+ if (global) { -+ SCX_GE(skel->bss->enable_cnt, 2 * num_children); -+ SCX_GE(skel->bss->disable_cnt, 2 * num_children); -+ } else { -+ SCX_EQ(skel->bss->enable_cnt, num_children); -+ SCX_EQ(skel->bss->disable_cnt, num_children); -+ } -+ /* -+ * We forked a ton of tasks before we attached the scheduler above, so -+ * this should be fine. Technically it could be flaky if a ton of forks -+ * are happening at the same time in other processes, but that should -+ * be exceedingly unlikely. 
-+ */ -+ SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt); -+ SCX_GE(skel->bss->init_fork_cnt, 2 * num_children); -+ -+ init_enable_count__destroy(skel); -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ enum scx_test_status status; -+ -+ status = run_test(true); -+ if (status != SCX_TEST_PASS) -+ return status; -+ -+ return run_test(false); -+} -+ -+struct scx_test init_enable_count = { -+ .name = "init_enable_count", -+ .description = "Verify we do the correct amount of counting of init, " -+ "enable, etc callbacks.", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&init_enable_count) -diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c -new file mode 100644 -index 000000000000..00bfa9cb95d3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c -@@ -0,0 +1,164 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler with every callback defined. -+ * -+ * This scheduler defines every callback. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) -+{} -+ -+void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ scx_bpf_consume(SCX_DSQ_GLOBAL); -+} -+ -+void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) -+{} -+ -+void BPF_STRUCT_OPS(maximal_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable) -+{} -+ -+void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags) -+{} -+ -+bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from, -+ struct task_struct *to) -+{ -+ return false; -+} -+ -+bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a, -+ struct task_struct *b) -+{ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight) -+{} -+ -+void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, -+ const struct cpumask *cpumask) -+{} -+ -+void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, -+ struct scx_cpu_acquire_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, -+ struct scx_cpu_release_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, -+ struct scx_exit_task_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{ -+ return 0; -+} -+ 
-+void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) -+{} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) -+{} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maximal_ops = { -+ .select_cpu = maximal_select_cpu, -+ .enqueue = maximal_enqueue, -+ .dequeue = maximal_dequeue, -+ .dispatch = maximal_dispatch, -+ .runnable = maximal_runnable, -+ .running = maximal_running, -+ .stopping = maximal_stopping, -+ .quiescent = maximal_quiescent, -+ .yield = maximal_yield, -+ .core_sched_before = maximal_core_sched_before, -+ .set_weight = maximal_set_weight, -+ .set_cpumask = maximal_set_cpumask, -+ .update_idle = maximal_update_idle, -+ .cpu_acquire = maximal_cpu_acquire, -+ .cpu_release = maximal_cpu_release, -+ .cpu_online = maximal_cpu_online, -+ .cpu_offline = maximal_cpu_offline, -+ .init_task = maximal_init_task, -+ .enable = maximal_enable, -+ .exit_task = maximal_exit_task, -+ .disable = maximal_disable, -+ .cgroup_init = maximal_cgroup_init, -+ .cgroup_exit = maximal_cgroup_exit, -+ .cgroup_prep_move = maximal_cgroup_prep_move, -+ .cgroup_move = maximal_cgroup_move, -+ .cgroup_cancel_move = maximal_cgroup_cancel_move, -+ .cgroup_set_weight = maximal_cgroup_set_weight, -+ .init = maximal_init, -+ .exit = maximal_exit, -+ .name = "maximal", -+}; -diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c -new file mode 100644 -index 000000000000..f38fc973c380 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maximal.c -@@ -0,0 +1,51 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include "maximal.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct maximal *skel; -+ -+ skel = maximal__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct maximal *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct maximal *skel = ctx; -+ -+ maximal__destroy(skel); -+} -+ -+struct scx_test maximal = { -+ .name = "maximal", -+ .description = "Verify we can load a scheduler with every callback defined", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&maximal) -diff --git a/tools/testing/selftests/sched_ext/maybe_null.bpf.c b/tools/testing/selftests/sched_ext/maybe_null.bpf.c -new file mode 100644 -index 000000000000..ad5e694226bb ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null.bpf.c -@@ -0,0 +1,26 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 vtime_test; -+ -+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) -+{ -+ if (p != NULL) -+ vtime_test = p->scx.dsq_vtime; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maybe_null_success = { -+ .dispatch = maybe_null_success_dispatch, -+ .enable = maybe_null_running, -+ .name = "minimal", -+}; -diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c -new file mode 100644 -index 000000000000..3f26b784f9c5 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null.c -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ */ -+#include -+#include -+#include -+#include -+#include "maybe_null.bpf.skel.h" -+#include "maybe_null_fail.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct maybe_null *skel; -+ struct maybe_null_fail *fail_skel; -+ -+ skel = maybe_null__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load maybe_null skel"); -+ return SCX_TEST_FAIL; -+ } -+ maybe_null__destroy(skel); -+ -+ fail_skel = maybe_null_fail__open_and_load(); -+ if (fail_skel) { -+ maybe_null_fail__destroy(fail_skel); -+ SCX_ERR("Should failed to open and load maybe_null_fail skel"); -+ return SCX_TEST_FAIL; -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+struct scx_test maybe_null = { -+ .name = "maybe_null", -+ .description = "Verify if PTR_MAYBE_NULL work for .dispatch", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&maybe_null) -diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c -new file mode 100644 -index 000000000000..1607fe07bead ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c -@@ -0,0 +1,25 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 vtime_test; -+ -+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p) -+{ -+ vtime_test = p->scx.dsq_vtime; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maybe_null_fail = { -+ .dispatch = maybe_null_fail_dispatch, -+ .enable = maybe_null_running, -+ .name = "minimal", -+}; -diff --git a/tools/testing/selftests/sched_ext/minimal.bpf.c b/tools/testing/selftests/sched_ext/minimal.bpf.c -new file mode 100644 -index 000000000000..6a7eccef0104 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/minimal.bpf.c -@@ -0,0 +1,21 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A completely minimal scheduler. -+ * -+ * This scheduler defines the absolute minimal set of struct sched_ext_ops -+ * fields: its name. It should _not_ fail to be loaded, and can be used to -+ * exercise the default scheduling paths in ext.c. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops minimal_ops = { -+ .name = "minimal", -+}; -diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c -new file mode 100644 -index 000000000000..6c5db8ebbf8a ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/minimal.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "minimal.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct minimal *skel; -+ -+ skel = minimal__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct minimal *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct minimal *skel = ctx; -+ -+ minimal__destroy(skel); -+} -+ -+struct scx_test minimal = { -+ .name = "minimal", -+ .description = "Verify we can load a fully minimal scheduler", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&minimal) -diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c -new file mode 100644 -index 000000000000..fd2c8f12af16 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c -@@ -0,0 +1,32 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates that we can invoke sched_ext kfuncs in -+ * BPF_PROG_TYPE_SYSCALL programs. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+UEI_DEFINE(uei); -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC("syscall") -+int BPF_PROG(prog_run_syscall) -+{ -+ scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops prog_run_ops = { -+ .exit = prog_run_exit, -+ .name = "prog_run", -+}; -diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c -new file mode 100644 -index 000000000000..3cd57ef8daaa ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/prog_run.c -@@ -0,0 +1,78 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "prog_run.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct prog_run *skel; -+ -+ skel = prog_run__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct prog_run *skel = ctx; -+ struct bpf_link *link; -+ int prog_fd, err = 0; -+ -+ prog_fd = bpf_program__fd(skel->progs.prog_run_syscall); -+ if (prog_fd < 0) { -+ SCX_ERR("Failed to get BPF_PROG_RUN prog"); -+ return SCX_TEST_FAIL; -+ } -+ -+ LIBBPF_OPTS(bpf_test_run_opts, topts); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.prog_run_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ close(prog_fd); -+ return SCX_TEST_FAIL; -+ } -+ -+ err = bpf_prog_test_run_opts(prog_fd, &topts); -+ SCX_EQ(err, 0); -+ -+ /* Assumes uei.kind is written last */ -+ while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); -+ SCX_EQ(skel->data->uei.exit_code, 0xdeadbeef); -+ close(prog_fd); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct prog_run *skel = ctx; -+ -+ prog_run__destroy(skel); -+} -+ -+struct scx_test prog_run = { -+ .name = "prog_run", -+ .description = "Verify we can call into a scheduler with BPF_PROG_RUN, and invoke kfuncs", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&prog_run) -diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c -new file mode 100644 -index 000000000000..5cfba2d6e056 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/reload_loop.c -@@ -0,0 +1,75 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "maximal.bpf.skel.h" -+#include "scx_test.h" -+ -+static struct maximal *skel; -+static pthread_t threads[2]; -+ -+bool force_exit = false; -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ skel = maximal__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+static void *do_reload_loop(void *arg) -+{ -+ u32 i; -+ -+ for (i = 0; i < 1024 && !force_exit; i++) { -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); -+ if (link) -+ bpf_link__destroy(link); -+ } -+ -+ return NULL; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ int err; -+ void *ret; -+ -+ err = pthread_create(&threads[0], NULL, do_reload_loop, NULL); -+ SCX_FAIL_IF(err, "Failed to create thread 0"); -+ -+ err = pthread_create(&threads[1], NULL, do_reload_loop, NULL); -+ SCX_FAIL_IF(err, "Failed to create thread 1"); -+ -+ SCX_FAIL_IF(pthread_join(threads[0], &ret), "thread 0 failed"); -+ SCX_FAIL_IF(pthread_join(threads[1], &ret), "thread 1 failed"); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ force_exit = true; -+ maximal__destroy(skel); -+} -+ -+struct scx_test reload_loop = { -+ .name = "reload_loop", -+ .description = "Stress test loading and unloading schedulers repeatedly in a tight loop", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&reload_loop) -diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c -new file mode 100644 -index 000000000000..eab48c7ff309 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/runner.c -@@ -0,0 +1,201 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "scx_test.h" -+ -+const char help_fmt[] = -+"The runner for sched_ext tests.\n" -+"\n" -+"The runner is statically linked against all testcases, and runs them all serially.\n" -+"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" -+"scheduler may be loaded at any given time." 
-+"\n" -+"Usage: %s [-t TEST] [-h]\n" -+"\n" -+" -t TEST Only run tests whose name includes this string\n" -+" -s Include print output for skipped tests\n" -+" -q Don't print the test descriptions during run\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+static bool quiet, print_skipped; -+ -+#define MAX_SCX_TESTS 2048 -+ -+static struct scx_test __scx_tests[MAX_SCX_TESTS]; -+static unsigned __scx_num_tests = 0; -+ -+static void sigint_handler(int simple) -+{ -+ exit_req = 1; -+} -+ -+static void print_test_preamble(const struct scx_test *test, bool quiet) -+{ -+ printf("===== START =====\n"); -+ printf("TEST: %s\n", test->name); -+ if (!quiet) -+ printf("DESCRIPTION: %s\n", test->description); -+ printf("OUTPUT:\n"); -+} -+ -+static const char *status_to_result(enum scx_test_status status) -+{ -+ switch (status) { -+ case SCX_TEST_PASS: -+ case SCX_TEST_SKIP: -+ return "ok"; -+ case SCX_TEST_FAIL: -+ return "not ok"; -+ default: -+ return ""; -+ } -+} -+ -+static void print_test_result(const struct scx_test *test, -+ enum scx_test_status status, -+ unsigned int testnum) -+{ -+ const char *result = status_to_result(status); -+ const char *directive = status == SCX_TEST_SKIP ? "SKIP " : ""; -+ -+ printf("%s %u %s # %s\n", result, testnum, test->name, directive); -+ printf("===== END =====\n"); -+} -+ -+static bool should_skip_test(const struct scx_test *test, const char * filter) -+{ -+ return !strstr(test->name, filter); -+} -+ -+static enum scx_test_status run_test(const struct scx_test *test) -+{ -+ enum scx_test_status status; -+ void *context = NULL; -+ -+ if (test->setup) { -+ status = test->setup(&context); -+ if (status != SCX_TEST_PASS) -+ return status; -+ } -+ -+ status = test->run(context); -+ -+ if (test->cleanup) -+ test->cleanup(context); -+ -+ return status; -+} -+ -+static bool test_valid(const struct scx_test *test) -+{ -+ if (!test) { -+ fprintf(stderr, "NULL test detected\n"); -+ return false; -+ } -+ -+ if (!test->name) { -+ fprintf(stderr, -+ "Test with no name found. Must specify test name.\n"); -+ return false; -+ } -+ -+ if (!test->description) { -+ fprintf(stderr, "Test %s requires description.\n", test->name); -+ return false; -+ } -+ -+ if (!test->run) { -+ fprintf(stderr, "Test %s has no run() callback\n", test->name); -+ return false; -+ } -+ -+ return true; -+} -+ -+int main(int argc, char **argv) -+{ -+ const char *filter = NULL; -+ unsigned testnum = 0, i; -+ unsigned passed = 0, skipped = 0, failed = 0; -+ int opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ while ((opt = getopt(argc, argv, "qst:h")) != -1) { -+ switch (opt) { -+ case 'q': -+ quiet = true; -+ break; -+ case 's': -+ print_skipped = true; -+ break; -+ case 't': -+ filter = optarg; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ for (i = 0; i < __scx_num_tests; i++) { -+ enum scx_test_status status; -+ struct scx_test *test = &__scx_tests[i]; -+ -+ if (filter && should_skip_test(test, filter)) { -+ /* -+ * Printing the skipped tests and their preambles can -+ * add a lot of noise to the runner output. Printing -+ * this is only really useful for CI, so let's skip it -+ * by default. 
-+ */ -+ if (print_skipped) { -+ print_test_preamble(test, quiet); -+ print_test_result(test, SCX_TEST_SKIP, ++testnum); -+ } -+ continue; -+ } -+ -+ print_test_preamble(test, quiet); -+ status = run_test(test); -+ print_test_result(test, status, ++testnum); -+ switch (status) { -+ case SCX_TEST_PASS: -+ passed++; -+ break; -+ case SCX_TEST_SKIP: -+ skipped++; -+ break; -+ case SCX_TEST_FAIL: -+ failed++; -+ break; -+ } -+ } -+ printf("\n\n=============================\n\n"); -+ printf("RESULTS:\n\n"); -+ printf("PASSED: %u\n", passed); -+ printf("SKIPPED: %u\n", skipped); -+ printf("FAILED: %u\n", failed); -+ -+ return 0; -+} -+ -+void scx_test_register(struct scx_test *test) -+{ -+ SCX_BUG_ON(!test_valid(test), "Invalid test found"); -+ SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); -+ -+ __scx_tests[__scx_num_tests++] = *test; -+} -diff --git a/tools/testing/selftests/sched_ext/scx_test.h b/tools/testing/selftests/sched_ext/scx_test.h -new file mode 100644 -index 000000000000..90b8d6915bb7 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/scx_test.h -@@ -0,0 +1,131 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+ -+#ifndef __SCX_TEST_H__ -+#define __SCX_TEST_H__ -+ -+#include -+#include -+#include -+ -+enum scx_test_status { -+ SCX_TEST_PASS = 0, -+ SCX_TEST_SKIP, -+ SCX_TEST_FAIL, -+}; -+ -+#define EXIT_KIND(__ent) __COMPAT_ENUM_OR_ZERO("scx_exit_kind", #__ent) -+ -+struct scx_test { -+ /** -+ * name - The name of the testcase. -+ */ -+ const char *name; -+ -+ /** -+ * description - A description of your testcase: what it tests and is -+ * meant to validate. -+ */ -+ const char *description; -+ -+ /* -+ * setup - Setup the test. -+ * @ctx: A pointer to a context object that will be passed to run and -+ * cleanup. -+ * -+ * An optional callback that allows a testcase to perform setup for its -+ * run. A test may return SCX_TEST_SKIP to skip the run. -+ */ -+ enum scx_test_status (*setup)(void **ctx); -+ -+ /* -+ * run - Run the test. -+ * @ctx: Context set in the setup() callback. If @ctx was not set in -+ * setup(), it is NULL. -+ * -+ * The main test. Callers should return one of: -+ * -+ * - SCX_TEST_PASS: Test passed -+ * - SCX_TEST_SKIP: Test should be skipped -+ * - SCX_TEST_FAIL: Test failed -+ * -+ * This callback must be defined. -+ */ -+ enum scx_test_status (*run)(void *ctx); -+ -+ /* -+ * cleanup - Perform cleanup following the test -+ * @ctx: Context set in the setup() callback. If @ctx was not set in -+ * setup(), it is NULL. -+ * -+ * An optional callback that allows a test to perform cleanup after -+ * being run. This callback is run even if the run() callback returns -+ * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns -+ * SCX_TEST_SKIP or SCX_TEST_FAIL. -+ */ -+ void (*cleanup)(void *ctx); -+}; -+ -+void scx_test_register(struct scx_test *test); -+ -+#define REGISTER_SCX_TEST(__test) \ -+ __attribute__((constructor)) \ -+ static void ___scxregister##__LINE__(void) \ -+ { \ -+ scx_test_register(__test); \ -+ } -+ -+#define SCX_ERR(__fmt, ...) \ -+ do { \ -+ fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ -+ fprintf(stderr, __fmt"\n", ##__VA_ARGS__); \ -+ } while (0) -+ -+#define SCX_FAIL(__fmt, ...) \ -+ do { \ -+ SCX_ERR(__fmt, ##__VA_ARGS__); \ -+ return SCX_TEST_FAIL; \ -+ } while (0) -+ -+#define SCX_FAIL_IF(__cond, __fmt, ...) 
\ -+ do { \ -+ if (__cond) \ -+ SCX_FAIL(__fmt, ##__VA_ARGS__); \ -+ } while (0) -+ -+#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ -+ #_x, (u64)(_x)) -+ -+#define SCX_ECODE_VAL(__ecode) ({ \ -+ u64 __val = 0; \ -+ bool __found = false; \ -+ \ -+ __found = __COMPAT_read_enum("scx_exit_code", #__ecode, &__val); \ -+ SCX_ASSERT(__found); \ -+ (s64)__val; \ -+}) -+ -+#define SCX_KIND_VAL(__kind) ({ \ -+ u64 __val = 0; \ -+ bool __found = false; \ -+ \ -+ __found = __COMPAT_read_enum("scx_exit_kind", #__kind, &__val); \ -+ SCX_ASSERT(__found); \ -+ __val; \ -+}) -+ -+#endif // # __SCX_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c -new file mode 100644 -index 000000000000..2ed2991afafe ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+bool saw_local = false; -+ -+static bool task_is_test(const struct task_struct *p) -+{ -+ return !bpf_strncmp(p->comm, 9, "select_cpu"); -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); -+ -+ if (task_is_test(p) && -+ bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { -+ saw_local = true; -+ } -+ scx_bpf_put_idle_cpumask(idle_mask); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dfl_ops = { -+ .enqueue = select_cpu_dfl_enqueue, -+ .name = "select_cpu_dfl", -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c -new file mode 100644 -index 000000000000..a53a40c2d2f0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c -@@ -0,0 +1,72 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dfl.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dfl *skel; -+ -+ skel = select_cpu_dfl__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dfl *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ SCX_ASSERT(!skel->bss->saw_local); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dfl *skel = ctx; -+ -+ select_cpu_dfl__destroy(skel); -+} -+ -+struct scx_test select_cpu_dfl = { -+ .name = "select_cpu_dfl", -+ .description = "Verify the default ops.select_cpu() dispatches tasks " -+ "when idles cores are found, and skips ops.enqueue()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dfl) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c -new file mode 100644 -index 000000000000..4bb5abb2d369 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c -@@ -0,0 +1,89 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation, and with the SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag -+ * specified. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+bool saw_local = false; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* CPU changed by ops.select_cpu() */ -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+/* Manually specify the signature until the kfunc is added to the scx repo. 
*/ -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, -+ bool *found) __ksym; -+ -+s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ struct task_ctx *tctx; -+ s32 cpu; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return -ESRCH; -+ } -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, -+ &tctx->force_local); -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ u64 dsq_id = SCX_DSQ_GLOBAL; -+ struct task_ctx *tctx; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ if (tctx->force_local) { -+ dsq_id = SCX_DSQ_LOCAL; -+ tctx->force_local = false; -+ saw_local = true; -+ } -+ -+ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); -+} -+ -+s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, -+ struct task_struct *p, struct scx_init_task_args *args) -+{ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { -+ .select_cpu = select_cpu_dfl_nodispatch_select_cpu, -+ .enqueue = select_cpu_dfl_nodispatch_enqueue, -+ .init_task = select_cpu_dfl_nodispatch_init_task, -+ .name = "select_cpu_dfl_nodispatch", -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c -new file mode 100644 -index 000000000000..1d85bf4bf3a3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c -@@ -0,0 +1,72 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dfl_nodispatch.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel; -+ -+ skel = select_cpu_dfl_nodispatch__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ SCX_ASSERT(skel->bss->saw_local); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel = ctx; -+ -+ select_cpu_dfl_nodispatch__destroy(skel); -+} -+ -+struct scx_test select_cpu_dfl_nodispatch = { -+ .name = "select_cpu_dfl_nodispatch", -+ .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " -+ "ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c -new file mode 100644 -index 000000000000..f0b96a4a04b2 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c -@@ -0,0 +1,41 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ u64 dsq_id = SCX_DSQ_LOCAL; -+ s32 cpu = prev_cpu; -+ -+ if (scx_bpf_test_and_clear_cpu_idle(cpu)) -+ goto dispatch; -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ goto dispatch; -+ -+ dsq_id = SCX_DSQ_GLOBAL; -+ cpu = prev_cpu; -+ -+dispatch: -+ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); -+ return cpu; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_ops = { -+ .select_cpu = select_cpu_dispatch_select_cpu, -+ .name = "select_cpu_dispatch", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c -new file mode 100644 -index 000000000000..0309ca8785b3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c -@@ -0,0 +1,70 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch *skel; -+ -+ skel = select_cpu_dispatch__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch *skel = ctx; -+ -+ select_cpu_dispatch__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch = { -+ .name = "select_cpu_dispatch", -+ .description = "Test direct dispatching to built-in DSQs from " -+ "ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c -new file mode 100644 -index 000000000000..7b42ddce0f56 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* Dispatching to a random DSQ should fail. */ -+ scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { -+ .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, -+ .exit = select_cpu_dispatch_bad_dsq_exit, -+ .name = "select_cpu_dispatch_bad_dsq", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c -new file mode 100644 -index 000000000000..47eb6ed7627d ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel; -+ -+ skel = select_cpu_dispatch_bad_dsq__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel = ctx; -+ -+ select_cpu_dispatch_bad_dsq__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch_bad_dsq = { -+ .name = "select_cpu_dispatch_bad_dsq", -+ .description = "Verify graceful failure if we direct-dispatch to a " -+ "bogus DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c -new file mode 100644 -index 000000000000..653e3dc0b4dc ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c -@@ -0,0 +1,38 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* Dispatching twice in a row is disallowed. */ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { -+ .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, -+ .exit = select_cpu_dispatch_dbl_dsp_exit, -+ .name = "select_cpu_dispatch_dbl_dsp", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c -new file mode 100644 -index 000000000000..48ff028a3c46 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel; -+ -+ skel = select_cpu_dispatch_dbl_dsp__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel = ctx; -+ -+ select_cpu_dispatch_dbl_dsp__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch_dbl_dsp = { -+ .name = "select_cpu_dispatch_dbl_dsp", -+ .description = "Verify graceful failure if we dispatch twice to a " -+ "DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c -new file mode 100644 -index 000000000000..7f3ebf4fc2ea ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c -@@ -0,0 +1,92 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates that enqueue flags are properly stored and -+ * applied at dispatch time when a task is directly dispatched from -+ * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and -+ * making the test a very basic vtime scheduler. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+volatile bool consumed; -+ -+static u64 vtime_now; -+ -+#define VTIME_DSQ 0 -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static inline u64 task_vtime(const struct task_struct *p) -+{ -+ u64 vtime = p->scx.dsq_vtime; -+ -+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) -+ return vtime_now - SCX_SLICE_DFL; -+ else -+ return vtime; -+} -+ -+s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu; -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ goto ddsp; -+ -+ cpu = prev_cpu; -+ scx_bpf_test_and_clear_cpu_idle(cpu); -+ddsp: -+ scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) -+{ -+ if (scx_bpf_consume(VTIME_DSQ)) -+ consumed = true; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) -+{ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, -+ bool runnable) -+{ -+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) -+{ -+ return scx_bpf_create_dsq(VTIME_DSQ, -1); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_vtime_ops = { -+ .select_cpu = select_cpu_vtime_select_cpu, -+ .dispatch = select_cpu_vtime_dispatch, -+ .running = select_cpu_vtime_running, -+ .stopping = select_cpu_vtime_stopping, -+ .enable = select_cpu_vtime_enable, -+ .init = select_cpu_vtime_init, -+ .name = "select_cpu_vtime", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c -new file mode 100644 -index 000000000000..b4629c2364f5 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c -@@ -0,0 +1,59 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_vtime.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_vtime *skel; -+ -+ skel = select_cpu_vtime__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_vtime *skel = ctx; -+ struct bpf_link *link; -+ -+ SCX_ASSERT(!skel->bss->consumed); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ sleep(1); -+ -+ SCX_ASSERT(skel->bss->consumed); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_vtime *skel = ctx; -+ -+ select_cpu_vtime__destroy(skel); -+} -+ -+struct scx_test select_cpu_vtime = { -+ .name = "select_cpu_vtime", -+ .description = "Test doing direct vtime-dispatching from " -+ "ops.select_cpu(), to a non-built-in DSQ", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_vtime) -diff --git a/tools/testing/selftests/sched_ext/test_example.c b/tools/testing/selftests/sched_ext/test_example.c -new file mode 100644 -index 000000000000..ce36cdf03cdc ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/test_example.c -@@ -0,0 +1,49 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 Tejun Heo -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include "scx_test.h" -+ -+static bool setup_called = false; -+static bool run_called = false; -+static bool cleanup_called = false; -+ -+static int context = 10; -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ setup_called = true; -+ *ctx = &context; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ int *arg = ctx; -+ -+ SCX_ASSERT(setup_called); -+ SCX_ASSERT(!run_called && !cleanup_called); -+ SCX_EQ(*arg, context); -+ -+ run_called = true; -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup (void *ctx) -+{ -+ SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); -+} -+ -+struct scx_test example = { -+ .name = "example", -+ .description = "Validate the basic function of the test suite itself", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&example) -diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c -new file mode 100644 -index 000000000000..e47769c91918 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/util.c -@@ -0,0 +1,71 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* Returns read len on success, or -errno on failure. */ -+static ssize_t read_text(const char *path, char *buf, size_t max_len) -+{ -+ ssize_t len; -+ int fd; -+ -+ fd = open(path, O_RDONLY); -+ if (fd < 0) -+ return -errno; -+ -+ len = read(fd, buf, max_len - 1); -+ -+ if (len >= 0) -+ buf[len] = 0; -+ -+ close(fd); -+ return len < 0 ? -errno : len; -+} -+ -+/* Returns written len on success, or -errno on failure. 
*/
-+static ssize_t write_text(const char *path, char *buf, ssize_t len)
-+{
-+	int fd;
-+	ssize_t written;
-+
-+	fd = open(path, O_WRONLY | O_APPEND);
-+	if (fd < 0)
-+		return -errno;
-+
-+	written = write(fd, buf, len);
-+	close(fd);
-+	return written < 0 ? -errno : written;
-+}
-+
-+long file_read_long(const char *path)
-+{
-+	char buf[128];
-+
-+
-+	if (read_text(path, buf, sizeof(buf)) <= 0)
-+		return -1;
-+
-+	return atol(buf);
-+}
-+
-+int file_write_long(const char *path, long val)
-+{
-+	char buf[64];
-+	int ret;
-+
-+	ret = sprintf(buf, "%lu", val);
-+	if (ret < 0)
-+		return ret;
-+
-+	if (write_text(path, buf, sizeof(buf)) <= 0)
-+		return -1;
-+
-+	return 0;
-+}
-diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h
-new file mode 100644
-index 000000000000..bc13dfec1267
---- /dev/null
-+++ b/tools/testing/selftests/sched_ext/util.h
-@@ -0,0 +1,13 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
-+ * Copyright (c) 2024 David Vernet
-+ */
-+
-+#ifndef __SCX_TEST_UTIL_H__
-+#define __SCX_TEST_UTIL_H__
-+
-+long file_read_long(const char *path);
-+int file_write_long(const char *path, long val);
-+
-+#endif // __SCX_TEST_H__
---
-2.45.1.145.g83f1add914
-
diff --git a/sys-kernel/scx/REVERT-scx-1.0.14-builtin-preserve-enum-value.patch.skip b/sys-kernel/scx/REVERT-scx-1.0.14-builtin-preserve-enum-value.patch.skip
new file mode 100644
index 0000000..ea85a13
--- /dev/null
+++ b/sys-kernel/scx/REVERT-scx-1.0.14-builtin-preserve-enum-value.patch.skip
@@ -0,0 +1,13 @@
+Taken from https://github.com/sched-ext/scx/commit/7d9b2cc26473526883297df78e8eee3f2e7b6194.
+
+--- a/lib/scxtest/overrides.h
++++ b/lib/scxtest/overrides.h
+@@ -13,7 +13,7 @@
+ * that we want to get rid of that belongs here.
+ */
+ #define __builtin_preserve_field_info(x,y) 1
+-#define __builtin_preserve_enum_value(x,y) 1
++#define __builtin_preserve_enum_value(x,y,z) 1
+ 
+ #define bpf_addr_space_cast(var, dst_as, src_as)
+ 
-- 
2.53.0